napistu 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -66,7 +66,7 @@ def compartmentalize_species_pairs(
  Compartmentalize Shortest Paths

  For a set of origin and destination species pairs, consider each species in every
- compartment it operates in, seperately.
+ compartment it operates in, seperately.

  Parameters
  ----------
@@ -112,22 +112,42 @@ def compartmentalize_species_pairs(


  def get_minimal_sources_edges(
- vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
+ vertices: pd.DataFrame,
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
+ source_total_counts: Optional[pd.Series] = None,
  ) -> pd.DataFrame | None:
- """Assign edges to a set of sources."""
+ """
+ Assign edges to a set of sources.
+
+ Parameters
+ ----------
+ vertices: pd.DataFrame
+ A table of vertices.
+ sbml_dfs: sbml_dfs_core.SBML_dfs
+ A pathway model
+ source_total_counts: pd.Series
+ A series of the total counts of each source.
+
+ Returns
+ -------
+ edge_sources: pd.DataFrame
+ A table of edges and the sources they are assigned to.
+ """
+
  nodes = vertices["node"].tolist()
  present_reactions = sbml_dfs.reactions[sbml_dfs.reactions.index.isin(nodes)]

  if len(present_reactions) == 0:
  return None

- table_schema = sbml_dfs.schema[SBML_DFS.REACTIONS]
- source_df = source.unnest_sources(present_reactions, table_schema["source"])
+ source_df = source.unnest_sources(present_reactions)

  if source_df is None:
  return None
  else:
- edge_sources = source.greedy_set_coverge_of_sources(source_df, table_schema)
+ edge_sources = source.source_set_coverage(
+ source_df, source_total_counts, sbml_dfs
+ )
  return edge_sources.reset_index()[
  [SBML_DFS.R_ID, SOURCE_SPEC.PATHWAY_ID, SOURCE_SPEC.NAME]
  ]
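The new optional `source_total_counts` argument threads through to `source.source_set_coverage` (shown later in this diff): when it is omitted the old greedy, size-based set cover is used, and when it is supplied pathway selection is enrichment-driven. A minimal usage sketch, assuming an existing `sbml_dfs` model; the unqualified call (the containing module is not named here), the `"reactions"` entity-type string, and the reaction ids are illustrative assumptions:

```python
import pandas as pd

from napistu import source

# vertices must expose a "node" column holding reaction ids from the model
vertices = pd.DataFrame({"node": ["R00000001", "R00000002"]})  # hypothetical ids

# Optional: pre-tabulated pathway sizes enable enrichment-based selection
source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")

edge_sources = get_minimal_sources_edges(
    vertices,
    sbml_dfs,
    source_total_counts=source_total_counts,
)
# Returns None if none of the listed nodes are reactions in sbml_dfs; otherwise a
# table with reaction id, pathway_id and pathway name columns.
```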
@@ -110,6 +110,62 @@ def precompute_distances(
  return filtered_precomputed_distances


+ def filter_precomputed_distances_top_n(precomputed_distances, top_n=50):
+ """
+ Filter precomputed distances to only include the top-n pairs for each distance measure.
+
+ Parameters
+ ----------
+ precomputed_distances : pd.DataFrame
+ Precomputed distances.
+ top_n : int, optional
+ Top-n pairs to include for each distance measure.
+
+ Returns
+ -------
+ pd.DataFrame
+ Filtered precomputed distances.
+ """
+
+ # take the union of top-n for each distance measure; and from origin -> dest and dest -> origin
+ distance_vars = set(precomputed_distances.columns) - {
+ NAPISTU_EDGELIST.SC_ID_ORIGIN,
+ NAPISTU_EDGELIST.SC_ID_DEST,
+ }
+
+ valid_pairs = list()
+ for distance_var in distance_vars:
+ top_n_pairs_by_origin = (
+ precomputed_distances.sort_values(by=distance_var, ascending=False)
+ .groupby(NAPISTU_EDGELIST.SC_ID_ORIGIN)
+ .head(top_n)
+ )
+ top_n_pairs_by_dest = (
+ precomputed_distances.sort_values(by=distance_var, ascending=False)
+ .groupby(NAPISTU_EDGELIST.SC_ID_DEST)
+ .head(top_n)
+ )
+
+ valid_pairs.append(
+ top_n_pairs_by_origin[
+ [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
+ ]
+ )
+ valid_pairs.append(
+ top_n_pairs_by_dest[
+ [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
+ ]
+ )
+
+ all_valid_pairs = pd.concat(valid_pairs).drop_duplicates()
+
+ return precomputed_distances.merge(
+ all_valid_pairs,
+ on=[NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST],
+ how="inner",
+ )
+
+
  def _calculate_distances_subset(
  napistu_graph: NapistuGraph,
  vs_to_partition: pd.DataFrame,
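`filter_precomputed_distances_top_n` keeps, for each non-id column, the top-n rows per origin and per destination (sorted descending), then restricts the table to the union of those pairs. A small sketch of a call, assuming the NAPISTU_EDGELIST constants resolve to the `sc_id_origin` / `sc_id_dest` column names used below (the containing module is not named in this diff, so the call is shown unqualified):

```python
import pandas as pd

# Toy distance table; column names stand in for the NAPISTU_EDGELIST constants
precomputed_distances = pd.DataFrame(
    {
        "sc_id_origin": ["A", "A", "B", "B"],
        "sc_id_dest": ["B", "C", "C", "A"],
        "path_length": [1, 3, 2, 4],
        "path_weights": [0.5, 2.0, 1.1, 3.2],
    }
)

# Keep the union of the single highest-valued pair per origin and per destination,
# computed separately for path_length and path_weights
filtered = filter_precomputed_distances_top_n(precomputed_distances, top_n=1)
```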
napistu/sbml_dfs_utils.py CHANGED
@@ -456,8 +456,14 @@ def infer_entity_type(df: pd.DataFrame) -> str:
  if entity_schema.get(SCHEMA_DEFS.PK) == df.index.name:
  return entity_type

- # Get DataFrame columns that are also primary keys
- df_columns = set(df.columns).intersection(primary_keys)
+ # Get DataFrame columns that are also primary keys, including index or MultiIndex names
+ index_names = []
+ if isinstance(df.index, pd.MultiIndex):
+ index_names = [name for name in df.index.names if name is not None]
+ elif df.index.name is not None:
+ index_names = [df.index.name]
+
+ df_columns = set(df.columns).union(index_names).intersection(primary_keys)

  # Check for exact match with primary key + foreign keys
  for entity_type, entity_schema in schema.items():
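`infer_entity_type` now also counts named index levels toward the primary-key match, so tables indexed by their primary key (or by a MultiIndex that includes it) are still recognized. A self-contained sketch of just that index-handling step; the helper name and the key names are hypothetical:

```python
import pandas as pd

def _pk_candidate_columns(df: pd.DataFrame, primary_keys: set) -> set:
    """Columns plus named (Multi)Index levels that are known primary keys."""
    index_names = []
    if isinstance(df.index, pd.MultiIndex):
        index_names = [name for name in df.index.names if name is not None]
    elif df.index.name is not None:
        index_names = [df.index.name]
    return set(df.columns).union(index_names).intersection(primary_keys)

# A table indexed by "r_id" still contributes that key to the match
df = pd.DataFrame({"r_id": ["R1"], "sc_id": ["S1"]}).set_index("r_id")
assert _pk_candidate_columns(df, {"r_id", "sc_id", "s_id"}) == {"r_id", "sc_id"}
```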
napistu/source.py CHANGED
@@ -1,8 +1,14 @@
  from __future__ import annotations

+ import numpy as np
  import pandas as pd
+ from typing import Optional
+
  from napistu import indices
- from napistu.constants import SOURCE_SPEC
+ from napistu import sbml_dfs_core
+ from napistu import sbml_dfs_utils
+ from napistu.statistics import hypothesis_testing
+ from napistu.constants import SBML_DFS_SCHEMA, SCHEMA_DEFS, SOURCE_SPEC


  class Source:
@@ -41,11 +47,18 @@ class Source:
  Creates an empty source object. This is typically used when creating an SBML_dfs
  object from a single source.
  pw_index : indices.PWIndex
+ a pathway index object containing the pathway_id and other metadata

  Returns
  -------
  None.

+ Raises
+ ------
+ ValueError:
+ if pw_index is not a indices.PWIndex
+ ValueError:
+ if SOURCE_SPEC.MODEL is not present in source_df
  """

  if init is True:
@@ -101,8 +114,27 @@ def create_source_table(
  """
  Create Source Table

- Create a table with one row per "new_id" and a Source object created from the union
- of "old_id" Source objects
+ Create a table with one row per "new_id" and a Source object created from the unionof "old_id" Source objects
+
+ Parameters
+ ----------
+ lookup_table: pd.Series
+ a pd.Series containing the index of the table to create a source table for
+ table_schema: dict
+ a dictionary containing the schema of the table to create a source table for
+ pw_index: indices.PWIndex
+ a pathway index object containing the pathway_id and other metadata
+
+ Returns
+ -------
+ source_table: pd.DataFrame
+ a pd.DataFrame containing the index of the table to create a source table for
+ with one row per "new_id" and a Source object created from the union of "old_id" Source objects
+
+ Raises
+ ------
+ ValueError:
+ if SOURCE_SPEC.SOURCE is not present in table_schema
  """

  if SOURCE_SPEC.SOURCE not in table_schema.keys():
@@ -142,8 +174,27 @@ def merge_sources(source_list: list | pd.Series) -> Source:

  Merge a list of Source objects into a single Source object

+ Parameters
+ ----------
+ source_list: list | pd.Series
+ a list of Source objects or a pd.Series of Source objects
+
+ Returns
+ -------
+ source: Source
+ a Source object created from the union of the Source objects in source_list
+
+ Raises
+ ------
+ TypeError:
+ if source_list is not a list or pd.Series
  """

+ if not isinstance(source_list, (list, pd.Series)):
+ raise TypeError(
+ f"source_list must be a list or pd.Series, but was a {type(source_list).__name__}"
+ )
+
  # filter to non-empty sources
  # empty sources have only been initialized; a merge hasn't occured
  existing_sources = [s.source is not None for s in source_list]
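With the new isinstance guard, passing anything other than a list or pd.Series of Source objects fails fast instead of surfacing a more obscure error later. A brief sketch of the expected behaviour:

```python
from napistu import source

# A dict is neither a list nor a pd.Series, so the new guard raises immediately
try:
    source.merge_sources({"not": "a list"})
except TypeError as err:
    print(err)  # source_list must be a list or pd.Series, but was a dict
```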
@@ -160,28 +211,35 @@ def merge_sources(source_list: list | pd.Series) -> Source:
  return Source(pd.concat(existing_source_list))


- def unnest_sources(
- source_table: pd.DataFrame, source_var: str, verbose: bool = False
- ) -> pd.DataFrame:
+ def unnest_sources(source_table: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
  """
  Unnest Sources

  Take a pd.DataFrame containing an array of Sources and
  return one-row per source.

- Parameters:
+ Parameters
+ ----------
  source_table: pd.DataFrame
  a table containing an array of Sources
- source_var: str
- variable containing Sources
+ verbose: bool
+ print progress

- Returns:
+ Returns
+ -------
  pd.Dataframe containing the index of source_table but expanded
  to include one row per source

  """

  sources = list()
+
+ table_type = sbml_dfs_utils.infer_entity_type(source_table)
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+ if SCHEMA_DEFS.SOURCE not in source_table_schema.keys():
+ raise ValueError(f"{table_type} does not have a source attribute")
+
+ source_var = source_table_schema[SCHEMA_DEFS.SOURCE]
  source_table_index = source_table.index.to_frame().reset_index(drop=True)

  for i in range(source_table.shape[0]):
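Because `unnest_sources` now infers the table type with `sbml_dfs_utils.infer_entity_type` and looks up the source column from SBML_DFS_SCHEMA, callers drop the old `source_var` argument; the call-site change in `get_minimal_sources_edges` earlier in this diff is the pattern to follow:

```python
from napistu import source

# 0.4.1: the source column had to be passed explicitly
# source_df = source.unnest_sources(present_reactions, table_schema["source"])

# 0.4.3: the source column is resolved from the inferred table schema
source_df = source.unnest_sources(present_reactions)
```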
@@ -216,53 +274,73 @@ def unnest_sources(
  return pd.concat(sources)


- def greedy_set_coverge_of_sources(
- source_df: pd.DataFrame, table_schema: dict
+ def source_set_coverage(
+ select_sources_df: pd.DataFrame,
+ source_total_counts: Optional[pd.Series] = None,
+ sbml_dfs: Optional[sbml_dfs_core.SBML_dfs] = None,
  ) -> pd.DataFrame:
  """
  Greedy Set Coverage of Sources

- Apply the greedy set coverge algorithm to find the minimal set of
- sources which cover all entries
+ Find the set of pathways covering `select_sources_df`. If `all_sources_df`
+ is provided pathways will be selected iteratively based on statistical
+ enrichment. If `all_sources_df` is not provided, the largest pathways
+ will be chosen iteratively.

- Parameters:
- source_df: pd.DataFrame
+ Parameters
+ ----------
+ select_sources_df: pd.DataFrame
  pd.Dataframe containing the index of source_table but expanded to
  include one row per source. As produced by source.unnest_sources()
-
- Returns:
+ source_total_counts: pd.Series
+ pd.Series containing the total counts of each source. As produced by
+ source.get_source_total_counts()
+ sbml_dfs: sbml_dfs_core.SBML_dfs
+ if `source_total_counts` is provided then `sbml_dfs` must be provided
+ to calculate the total number of entities in the table.
+
+ Returns
+ -------
  minimial_sources: [str]
  A list of pathway_ids of the minimal source set

  """

+ table_type = sbml_dfs_utils.infer_entity_type(select_sources_df)
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+ if source_total_counts is not None:
+ if sbml_dfs is None:
+ raise ValueError(
+ "If `source_total_counts` is provided, `sbml_dfs` must be provided to calculate the total number of entities in the table."
+ )
+ n_total_entities = sbml_dfs.get_table(table_type).shape[0]
+
  # rollup pathways with identical membership
- deduplicated_sources = _deduplicate_source_df(source_df, table_schema)
+ deduplicated_sources = _deduplicate_source_df(select_sources_df)

  unaccounted_for_members = deduplicated_sources
  retained_pathway_ids = []
-
  while unaccounted_for_members.shape[0] != 0:
  # find the pathway with the most members
- pathway_members = unaccounted_for_members.groupby(SOURCE_SPEC.PATHWAY_ID).size()
- top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+
+ if source_total_counts is None:
+ top_pathway = _select_top_pathway_by_size(unaccounted_for_members)
+ else:
+ top_pathway = _select_top_pathway_by_enrichment(
+ unaccounted_for_members, source_total_counts, n_total_entities, pk
+ )
+
+ if top_pathway is None:
+ break
+
  retained_pathway_ids.append(top_pathway)

  # remove all members associated with the top pathway
- members_captured = (
- unaccounted_for_members[
- unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
- ]
- .index.get_level_values(table_schema["pk"])
- .tolist()
+ unaccounted_for_members = _update_unaccounted_for_members(
+ top_pathway, unaccounted_for_members
  )

- unaccounted_for_members = unaccounted_for_members[
- ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
- members_captured
- )
- ]
-
  minimial_sources = deduplicated_sources[
  deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
  ].sort_index()
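`source_set_coverage` (the renamed `greedy_set_coverge_of_sources`) now supports two modes. Without `source_total_counts` it behaves like the old greedy cover, repeatedly taking the largest remaining pathway; with `source_total_counts` and `sbml_dfs` it scores each candidate pathway by enrichment and stops once no pathway with at least `min_pw_size` remaining members is left. A hedged usage sketch, with `select_sources_df` as produced by `source.unnest_sources`:

```python
from napistu import source

# Size-based greedy set cover (pre-0.4.3 behaviour, under the new name)
minimal_sources = source.source_set_coverage(select_sources_df)

# Enrichment-based selection; sbml_dfs is required to size the entity universe
minimal_sources = source.source_set_coverage(
    select_sources_df,
    source_total_counts=source_total_counts,  # see get_source_total_counts below
    sbml_dfs=sbml_dfs,
)
```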
@@ -270,9 +348,39 @@ def greedy_set_coverge_of_sources(
  return minimial_sources


- def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
+ def get_source_total_counts(
+ sbml_dfs: sbml_dfs_core.SBML_dfs, entity_type: str
+ ) -> pd.Series:
+ """
+ Get the total counts of each source.
+
+ Parameters
+ ----------
+ sbml_dfs: sbml_dfs_core.SBML_dfs
+ sbml_dfs object containing the table to get the total counts of
+ entity_type: str
+ the type of entity to get the total counts of
+
+ Returns
+ -------
+ source_total_counts: pd.Series
+ pd.Series containing the total counts of each source.
+ """
+
+ all_sources_table = unnest_sources(sbml_dfs.get_table(entity_type))
+ source_total_counts = all_sources_table.value_counts(SOURCE_SPEC.PATHWAY_ID).rename(
+ "total_counts"
+ )
+
+ return source_total_counts
+
+
+ def _deduplicate_source_df(source_df: pd.DataFrame) -> pd.DataFrame:
  """Combine entries in a source table when multiple models have the same members."""

+ table_type = sbml_dfs_utils.infer_entity_type(source_df)
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+
  # drop entries which are missing required attributes and throw an error if none are left
  REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
  indexed_sources = (
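`get_source_total_counts` is a thin wrapper: it unnests the sources of one entity table and tallies rows per pathway, yielding a `pathway_id`-indexed Series named `total_counts`. Those totals supply the pathway-size column of the 2x2 contingency tables used by `_select_top_pathway_by_enrichment` further down. A brief sketch, with the `"reactions"` entity-type string as an assumption:

```python
from napistu import source

# pathway_id-indexed Series named "total_counts", one entry per annotating pathway
source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")
print(source_total_counts.head())
```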
@@ -296,7 +404,11 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da
  {
  SOURCE_SPEC.PATHWAY_ID: p,
  "membership_string": "_".join(
- set(indexed_sources.loc[[p]][table_schema["pk"]].tolist())
+ set(
+ indexed_sources.loc[[p]][
+ source_table_schema[SCHEMA_DEFS.PK]
+ ].tolist()
+ )
  ),
  }
  for p in pathways
@@ -320,16 +432,16 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da

  merged_sources = pd.concat(
  [
- _collapse_by_membership_string(s, membership_categories, table_schema) # type: ignore
+ _collapse_by_membership_string(s, membership_categories, source_table_schema) # type: ignore
  for s in category_index.tolist()
  ]
  )
  merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
- table_schema["pk"]
+ source_table_schema[SCHEMA_DEFS.PK]
  ).cumcount()

  return merged_sources.set_index(
- [table_schema["pk"], SOURCE_SPEC.INDEX_NAME]
+ [source_table_schema[SCHEMA_DEFS.PK], SOURCE_SPEC.INDEX_NAME]
  ).sort_index()

@@ -345,7 +457,10 @@ def _collapse_by_membership_string(
  return pd.DataFrame(
  [
  pd.concat(
- [pd.Series({table_schema["pk"]: ms}), collapsed_source_membership]
+ [
+ pd.Series({table_schema[SCHEMA_DEFS.PK]: ms}),
+ collapsed_source_membership,
+ ]
  )
  for ms in membership_string.split("_")
  ]
@@ -398,3 +513,91 @@ def _safe_source_merge(member_Sources: Source | list) -> Source:
  return merge_sources(member_Sources.tolist())
  else:
  raise TypeError("Expecting source.Source or pd.Series")
+
+
+ def _select_top_pathway_by_size(unaccounted_for_members: pd.DataFrame) -> str:
+
+ pathway_members = unaccounted_for_members.value_counts(SOURCE_SPEC.PATHWAY_ID)
+ top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+
+ return top_pathway
+
+
+ def _select_top_pathway_by_enrichment(
+ unaccounted_for_members: pd.DataFrame,
+ source_total_counts: pd.Series,
+ n_total_entities: int,
+ table_pk: str,
+ min_pw_size: int = 5,
+ ) -> str:
+
+ n_observed_entities = len(
+ unaccounted_for_members.index.get_level_values(table_pk).unique()
+ )
+ pathway_members = unaccounted_for_members.value_counts(
+ SOURCE_SPEC.PATHWAY_ID
+ ).rename("observed_members")
+
+ pathway_members = pathway_members.loc[pathway_members >= min_pw_size]
+ if pathway_members.shape[0] == 0:
+ return None
+
+ wide_contingency_table = (
+ pathway_members.to_frame()
+ .join(source_total_counts)
+ .assign(
+ missing_members=lambda x: x["total_counts"] - x["observed_members"],
+ observed_nonmembers=lambda x: n_observed_entities - x["observed_members"],
+ nonobserved_nonmembers=lambda x: n_total_entities
+ - x["observed_nonmembers"]
+ - x["missing_members"]
+ - x["observed_members"],
+ )
+ .drop(columns=["total_counts"])
+ )
+
+ # calculate enrichments using a fast vectorized normal approximation
+ odds_ratios, _ = hypothesis_testing.fisher_exact_vectorized(
+ wide_contingency_table["observed_members"],
+ wide_contingency_table["missing_members"],
+ wide_contingency_table["observed_nonmembers"],
+ wide_contingency_table["nonobserved_nonmembers"],
+ )
+
+ return pathway_members.index[np.argmax(odds_ratios)]
+
+
+ def _update_unaccounted_for_members(
+ top_pathway, unaccounted_for_members
+ ) -> pd.DataFrame:
+ """
+ Update the unaccounted for members dataframe by removing the members
+ associated with the top pathway.
+
+ Parameters
+ ----------
+ top_pathway: str
+ the pathway to remove from the unaccounted for members
+ unaccounted_for_members: pd.DataFrame
+ the dataframe of unaccounted for members
+
+ Returns
+ -------
+ unaccounted_for_members: pd.DataFrame
+ the dataframe of unaccounted for members with the top pathway removed
+ """
+
+ table_type = sbml_dfs_utils.infer_entity_type(unaccounted_for_members)
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+ members_captured = (
+ unaccounted_for_members[
+ unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
+ ]
+ .index.get_level_values(pk)
+ .tolist()
+ )
+
+ return unaccounted_for_members[
+ ~unaccounted_for_members.index.get_level_values(pk).isin(members_captured)
+ ]
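Each iteration of the enrichment mode builds one 2x2 table per candidate pathway: members of the pathway that are still unaccounted for, the pathway's remaining members elsewhere in the model, the other unaccounted-for entities, and everything else. A worked example with made-up counts, following the `assign` expressions above:

```python
# Assume 40 entities are still unaccounted for, the model holds 500 entities of
# this type, and a candidate pathway annotates 120 entities, 25 of them unaccounted for.
n_observed_entities = 40
n_total_entities = 500
total_counts = 120

observed_members = 25
missing_members = total_counts - observed_members             # 95
observed_nonmembers = n_observed_entities - observed_members  # 15
nonobserved_nonmembers = (
    n_total_entities - observed_nonmembers - missing_members - observed_members
)                                                             # 365

# The pathway with the largest approximate odds ratio across these tables is picked
# next; pathways with fewer than min_pw_size (default 5) remaining members are skipped.
```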
@@ -0,0 +1,10 @@
+ from __future__ import annotations
+
+ from importlib.metadata import PackageNotFoundError
+ from importlib.metadata import version
+
+ try:
+ __version__ = version("napistu")
+ except PackageNotFoundError:
+ # package is not installed
+ pass
@@ -0,0 +1,66 @@
+ from typing import Union
+
+ import numpy as np
+ from scipy.stats import norm
+
+
+ def fisher_exact_vectorized(
+ observed_members: Union[list[int], np.ndarray],
+ missing_members: Union[list[int], np.ndarray],
+ observed_nonmembers: Union[list[int], np.ndarray],
+ nonobserved_nonmembers: Union[list[int], np.ndarray],
+ ) -> tuple[np.ndarray, np.ndarray]:
+ """
+ Fast vectorized one-tailed Fisher exact test using normal approximation.
+
+ Parameters:
+ -----------
+ observed_members, missing_members, observed_nonmembers, nonobserved_nonmembers : array-like
+ The four cells of the 2x2 contingency tables (must be non-negative)
+
+ Returns:
+ --------
+ odds_ratios : numpy array
+ Odds ratios for each test
+ p_values : numpy array
+ One-tailed p-values (tests for enrichment)
+ """
+ # Convert to numpy arrays
+ a = np.array(observed_members, dtype=float)
+ b = np.array(missing_members, dtype=float)
+ c = np.array(observed_nonmembers, dtype=float)
+ d = np.array(nonobserved_nonmembers, dtype=float)
+
+ # Check for negative values and raise error
+ if np.any((a < 0) | (b < 0) | (c < 0) | (d < 0)):
+ raise ValueError("All contingency table values must be non-negative")
+
+ # Calculate odds ratios
+ odds_ratios = np.divide(
+ a * d, b * c, out=np.full_like(a, np.inf, dtype=float), where=(b * c) != 0
+ )
+
+ # Normal approximation to hypergeometric distribution
+ n = a + b + c + d
+
+ # Avoid division by zero in expected value calculation
+ expected_a = np.divide(
+ (a + b) * (a + c), n, out=np.zeros_like(n, dtype=float), where=n != 0
+ )
+
+ # Variance calculation with protection against division by zero
+ var_a = np.divide(
+ (a + b) * (c + d) * (a + c) * (b + d),
+ n * n * (n - 1),
+ out=np.ones_like(n, dtype=float), # Default to 1 to avoid sqrt(0)
+ where=(n > 1),
+ )
+ var_a = np.maximum(var_a, 1e-10) # Ensure positive variance
+
+ # Continuity correction and z-score
+ z = (a - expected_a - 0.5) / np.sqrt(var_a)
+
+ # One-tailed p-value (upper tail for enrichment)
+ p_values = norm.sf(z) # 1 - norm.cdf(z)
+
+ return odds_ratios, p_values
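`fisher_exact_vectorized` trades exactness for speed: odds ratios are computed directly from the table cells and the one-tailed p-value comes from a continuity-corrected normal approximation to the hypergeometric rather than the exact distribution. A small sketch comparing one table against scipy's exact test; the `napistu.statistics.hypothesis_testing` module path is inferred from the import added to `napistu/source.py` earlier in this diff:

```python
import numpy as np
from scipy.stats import fisher_exact

from napistu.statistics.hypothesis_testing import fisher_exact_vectorized

# One 2x2 table: [[25, 95], [15, 365]]
odds_ratios, p_values = fisher_exact_vectorized([25], [95], [15], [365])

exact_odds_ratio, exact_p = fisher_exact([[25, 95], [15, 365]], alternative="greater")

# Odds ratios match exactly (both are a*d / (b*c)); p-values agree only
# approximately because of the normal approximation.
np.testing.assert_allclose(odds_ratios[0], exact_odds_ratio)
print(p_values[0], exact_p)
```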