napistu 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/consensus.py +3 -4
- napistu/gcs/constants.py +5 -5
- napistu/ingestion/constants.py +51 -0
- napistu/ingestion/reactom_fi.py +208 -0
- napistu/network/constants.py +23 -1
- napistu/network/ig_utils.py +161 -1
- napistu/network/net_create.py +3 -3
- napistu/network/net_propagation.py +646 -96
- napistu/network/ng_utils.py +26 -6
- napistu/network/precompute.py +56 -0
- napistu/sbml_dfs_utils.py +8 -2
- napistu/source.py +243 -40
- napistu/statistics/__init__.py +10 -0
- napistu/statistics/hypothesis_testing.py +66 -0
- napistu/statistics/quantiles.py +82 -0
- napistu/utils.py +23 -1
- {napistu-0.4.1.dist-info → napistu-0.4.3.dist-info}/METADATA +1 -1
- {napistu-0.4.1.dist-info → napistu-0.4.3.dist-info}/RECORD +29 -24
- tests/test_network_ig_utils.py +133 -0
- tests/test_network_net_propagation.py +365 -74
- tests/test_network_precompute.py +30 -0
- tests/test_sbml_dfs_utils.py +13 -0
- tests/test_source.py +38 -6
- tests/test_statistics_hypothesis_testing.py +62 -0
- tests/test_statistics_quantiles.py +133 -0
- tests/test_set_coverage.py +0 -50
- {napistu-0.4.1.dist-info → napistu-0.4.3.dist-info}/WHEEL +0 -0
- {napistu-0.4.1.dist-info → napistu-0.4.3.dist-info}/entry_points.txt +0 -0
- {napistu-0.4.1.dist-info → napistu-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.4.1.dist-info → napistu-0.4.3.dist-info}/top_level.txt +0 -0
napistu/network/ng_utils.py
CHANGED
@@ -66,7 +66,7 @@ def compartmentalize_species_pairs(
     Compartmentalize Shortest Paths

     For a set of origin and destination species pairs, consider each species in every
-
+    compartment it operates in, seperately.

     Parameters
     ----------
@@ -112,22 +112,42 @@ def compartmentalize_species_pairs(


 def get_minimal_sources_edges(
-    vertices: pd.DataFrame,
+    vertices: pd.DataFrame,
+    sbml_dfs: sbml_dfs_core.SBML_dfs,
+    source_total_counts: Optional[pd.Series] = None,
 ) -> pd.DataFrame | None:
-    """
+    """
+    Assign edges to a set of sources.
+
+    Parameters
+    ----------
+    vertices: pd.DataFrame
+        A table of vertices.
+    sbml_dfs: sbml_dfs_core.SBML_dfs
+        A pathway model
+    source_total_counts: pd.Series
+        A series of the total counts of each source.
+
+    Returns
+    -------
+    edge_sources: pd.DataFrame
+        A table of edges and the sources they are assigned to.
+    """
+
     nodes = vertices["node"].tolist()
     present_reactions = sbml_dfs.reactions[sbml_dfs.reactions.index.isin(nodes)]

     if len(present_reactions) == 0:
         return None

-
-    source_df = source.unnest_sources(present_reactions, table_schema["source"])
+    source_df = source.unnest_sources(present_reactions)

     if source_df is None:
         return None
     else:
-        edge_sources = source.
+        edge_sources = source.source_set_coverage(
+            source_df, source_total_counts, sbml_dfs
+        )
     return edge_sources.reset_index()[
         [SBML_DFS.R_ID, SOURCE_SPEC.PATHWAY_ID, SOURCE_SPEC.NAME]
     ]
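The new signature threads the pathway model and optional per-source totals straight into `source.source_set_coverage`. A minimal sketch of the updated call pattern; `vertices` (a table with a "node" column) and a fitted `sbml_dfs` model are assumed to be in scope, and the "reactions" entity type is an illustrative guess:

```python
from napistu import source
from napistu.network import ng_utils

# optional: per-pathway totals switch the downstream greedy selection
# from size-based to enrichment-based
source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")

edge_sources = ng_utils.get_minimal_sources_edges(
    vertices, sbml_dfs, source_total_counts
)
```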
napistu/network/precompute.py
CHANGED
@@ -110,6 +110,62 @@ def precompute_distances(
     return filtered_precomputed_distances


+def filter_precomputed_distances_top_n(precomputed_distances, top_n=50):
+    """
+    Filter precomputed distances to only include the top-n pairs for each distance measure.
+
+    Parameters
+    ----------
+    precomputed_distances : pd.DataFrame
+        Precomputed distances.
+    top_n : int, optional
+        Top-n pairs to include for each distance measure.
+
+    Returns
+    -------
+    pd.DataFrame
+        Filtered precomputed distances.
+    """
+
+    # take the union of top-n for each distance measure; and from origin -> dest and dest -> origin
+    distance_vars = set(precomputed_distances.columns) - {
+        NAPISTU_EDGELIST.SC_ID_ORIGIN,
+        NAPISTU_EDGELIST.SC_ID_DEST,
+    }
+
+    valid_pairs = list()
+    for distance_var in distance_vars:
+        top_n_pairs_by_origin = (
+            precomputed_distances.sort_values(by=distance_var, ascending=False)
+            .groupby(NAPISTU_EDGELIST.SC_ID_ORIGIN)
+            .head(top_n)
+        )
+        top_n_pairs_by_dest = (
+            precomputed_distances.sort_values(by=distance_var, ascending=False)
+            .groupby(NAPISTU_EDGELIST.SC_ID_DEST)
+            .head(top_n)
+        )
+
+        valid_pairs.append(
+            top_n_pairs_by_origin[
+                [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
+            ]
+        )
+        valid_pairs.append(
+            top_n_pairs_by_dest[
+                [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
+            ]
+        )
+
+    all_valid_pairs = pd.concat(valid_pairs).drop_duplicates()
+
+    return precomputed_distances.merge(
+        all_valid_pairs,
+        on=[NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST],
+        how="inner",
+    )
+
+
 def _calculate_distances_subset(
     napistu_graph: NapistuGraph,
     vs_to_partition: pd.DataFrame,
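`filter_precomputed_distances_top_n` keeps a pair if it lands in the top-n of any distance measure, grouped by either endpoint. A self-contained toy run of the same union-of-top-n idea, with plain string column names standing in for the `NAPISTU_EDGELIST` constants:

```python
import pandas as pd

# toy distance table
df = pd.DataFrame({
    "sc_id_origin": ["a", "a", "a", "b", "b"],
    "sc_id_dest":   ["x", "y", "z", "x", "y"],
    "path_length":  [1.0, 2.0, 3.0, 1.5, 0.5],
})

top_n = 1
keep = []
for col in set(df.columns) - {"sc_id_origin", "sc_id_dest"}:
    ranked = df.sort_values(by=col, ascending=False)
    # top-n per origin and top-n per destination, then union
    keep.append(ranked.groupby("sc_id_origin").head(top_n)[["sc_id_origin", "sc_id_dest"]])
    keep.append(ranked.groupby("sc_id_dest").head(top_n)[["sc_id_origin", "sc_id_dest"]])

valid = pd.concat(keep).drop_duplicates()
print(df.merge(valid, on=["sc_id_origin", "sc_id_dest"], how="inner"))
# keeps (a,z), (b,x), (a,y); (a,x) and (b,y) never rank first in any group
```

Note that the released code sorts with `ascending=False`, so "top-n" selects the largest values of each measure.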
napistu/sbml_dfs_utils.py
CHANGED
@@ -456,8 +456,14 @@ def infer_entity_type(df: pd.DataFrame) -> str:
         if entity_schema.get(SCHEMA_DEFS.PK) == df.index.name:
             return entity_type

-    # Get DataFrame columns that are also primary keys
-
+    # Get DataFrame columns that are also primary keys, including index or MultiIndex names
+    index_names = []
+    if isinstance(df.index, pd.MultiIndex):
+        index_names = [name for name in df.index.names if name is not None]
+    elif df.index.name is not None:
+        index_names = [df.index.name]
+
+    df_columns = set(df.columns).union(index_names).intersection(primary_keys)

     # Check for exact match with primary key + foreign keys
     for entity_type, entity_schema in schema.items():
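The fix makes `infer_entity_type` consider index and MultiIndex names alongside columns when matching primary keys. A quick, standalone illustration of the index-name extraction, independent of the napistu schema:

```python
import pandas as pd

def index_names(df: pd.DataFrame) -> list:
    # mirrors the new logic: collect named index levels only
    if isinstance(df.index, pd.MultiIndex):
        return [name for name in df.index.names if name is not None]
    elif df.index.name is not None:
        return [df.index.name]
    return []

df = pd.DataFrame(
    {"value": [1, 2]},
    index=pd.MultiIndex.from_tuples([("r1", "s1"), ("r2", "s2")], names=["r_id", None]),
)
print(index_names(df))  # ['r_id']; unnamed levels are skipped
```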
napistu/source.py
CHANGED
@@ -1,8 +1,14 @@
 from __future__ import annotations

+import numpy as np
 import pandas as pd
+from typing import Optional
+
 from napistu import indices
-from napistu
+from napistu import sbml_dfs_core
+from napistu import sbml_dfs_utils
+from napistu.statistics import hypothesis_testing
+from napistu.constants import SBML_DFS_SCHEMA, SCHEMA_DEFS, SOURCE_SPEC


 class Source:
@@ -41,11 +47,18 @@ class Source:
             Creates an empty source object. This is typically used when creating an SBML_dfs
             object from a single source.
         pw_index : indices.PWIndex
+            a pathway index object containing the pathway_id and other metadata

         Returns
         -------
         None.

+        Raises
+        ------
+        ValueError:
+            if pw_index is not a indices.PWIndex
+        ValueError:
+            if SOURCE_SPEC.MODEL is not present in source_df
         """

         if init is True:
@@ -101,8 +114,27 @@ def create_source_table(
     """
     Create Source Table

-    Create a table with one row per "new_id" and a Source object created from the
-
+    Create a table with one row per "new_id" and a Source object created from the unionof "old_id" Source objects
+
+    Parameters
+    ----------
+    lookup_table: pd.Series
+        a pd.Series containing the index of the table to create a source table for
+    table_schema: dict
+        a dictionary containing the schema of the table to create a source table for
+    pw_index: indices.PWIndex
+        a pathway index object containing the pathway_id and other metadata
+
+    Returns
+    -------
+    source_table: pd.DataFrame
+        a pd.DataFrame containing the index of the table to create a source table for
+        with one row per "new_id" and a Source object created from the union of "old_id" Source objects
+
+    Raises
+    ------
+    ValueError:
+        if SOURCE_SPEC.SOURCE is not present in table_schema
     """

     if SOURCE_SPEC.SOURCE not in table_schema.keys():
@@ -142,8 +174,27 @@ def merge_sources(source_list: list | pd.Series) -> Source:

     Merge a list of Source objects into a single Source object

+    Parameters
+    ----------
+    source_list: list | pd.Series
+        a list of Source objects or a pd.Series of Source objects
+
+    Returns
+    -------
+    source: Source
+        a Source object created from the union of the Source objects in source_list
+
+    Raises
+    ------
+    TypeError:
+        if source_list is not a list or pd.Series
     """

+    if not isinstance(source_list, (list, pd.Series)):
+        raise TypeError(
+            f"source_list must be a list or pd.Series, but was a {type(source_list).__name__}"
+        )
+
     # filter to non-empty sources
     # empty sources have only been initialized; a merge hasn't occured
     existing_sources = [s.source is not None for s in source_list]
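`merge_sources` now fails fast on scalar input instead of iterating over it. A sketch of the guard, assuming the empty-constructor form `Source(init=True)` described in the class docstring:

```python
from napistu import source

# a bare Source (not wrapped in a list or pd.Series) is rejected up front
try:
    source.merge_sources(source.Source(init=True))
except TypeError as e:
    print(e)  # source_list must be a list or pd.Series, but was a Source
```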
@@ -160,28 +211,35 @@ def merge_sources(source_list: list | pd.Series) -> Source:
     return Source(pd.concat(existing_source_list))


-def unnest_sources(
-    source_table: pd.DataFrame, source_var: str, verbose: bool = False
-) -> pd.DataFrame:
+def unnest_sources(source_table: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
     """
     Unnest Sources

     Take a pd.DataFrame containing an array of Sources and
     return one-row per source.

-    Parameters
+    Parameters
+    ----------
     source_table: pd.DataFrame
         a table containing an array of Sources
-
-
+    verbose: bool
+        print progress

-    Returns
+    Returns
+    -------
     pd.Dataframe containing the index of source_table but expanded
     to include one row per source

     """

     sources = list()
+
+    table_type = sbml_dfs_utils.infer_entity_type(source_table)
+    source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+    if SCHEMA_DEFS.SOURCE not in source_table_schema.keys():
+        raise ValueError(f"{table_type} does not have a source attribute")
+
+    source_var = source_table_schema[SCHEMA_DEFS.SOURCE]
     source_table_index = source_table.index.to_frame().reset_index(drop=True)

     for i in range(source_table.shape[0]):
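`unnest_sources` drops the explicit `source_var` argument; the source column is now looked up from the inferred entity schema. A sketch of the before/after call, assuming an `sbml_dfs` model is in scope (the old column-name argument shown is illustrative):

```python
from napistu import source

# 0.4.1: source.unnest_sources(sbml_dfs.reactions, "r_source")
# 0.4.3: the source column comes from the reactions schema
source_df = source.unnest_sources(sbml_dfs.reactions)
```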
@@ -216,53 +274,73 @@ def unnest_sources(
     return pd.concat(sources)


-def greedy_set_coverge_of_sources(
-
+def source_set_coverage(
+    select_sources_df: pd.DataFrame,
+    source_total_counts: Optional[pd.Series] = None,
+    sbml_dfs: Optional[sbml_dfs_core.SBML_dfs] = None,
 ) -> pd.DataFrame:
     """
     Greedy Set Coverage of Sources

-
-
+    Find the set of pathways covering `select_sources_df`. If `all_sources_df`
+    is provided pathways will be selected iteratively based on statistical
+    enrichment. If `all_sources_df` is not provided, the largest pathways
+    will be chosen iteratively.

-    Parameters
-
+    Parameters
+    ----------
+    select_sources_df: pd.DataFrame
         pd.Dataframe containing the index of source_table but expanded to
         include one row per source. As produced by source.unnest_sources()
-
-
+    source_total_counts: pd.Series
+        pd.Series containing the total counts of each source. As produced by
+        source.get_source_total_counts()
+    sbml_dfs: sbml_dfs_core.SBML_dfs
+        if `source_total_counts` is provided then `sbml_dfs` must be provided
+        to calculate the total number of entities in the table.
+
+    Returns
+    -------
     minimial_sources: [str]
         A list of pathway_ids of the minimal source set

     """

+    table_type = sbml_dfs_utils.infer_entity_type(select_sources_df)
+    pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+    if source_total_counts is not None:
+        if sbml_dfs is None:
+            raise ValueError(
+                "If `source_total_counts` is provided, `sbml_dfs` must be provided to calculate the total number of entities in the table."
+            )
+        n_total_entities = sbml_dfs.get_table(table_type).shape[0]
+
     # rollup pathways with identical membership
-    deduplicated_sources = _deduplicate_source_df(
+    deduplicated_sources = _deduplicate_source_df(select_sources_df)

     unaccounted_for_members = deduplicated_sources
     retained_pathway_ids = []
-
     while unaccounted_for_members.shape[0] != 0:
         # find the pathway with the most members
-
-
+
+        if source_total_counts is None:
+            top_pathway = _select_top_pathway_by_size(unaccounted_for_members)
+        else:
+            top_pathway = _select_top_pathway_by_enrichment(
+                unaccounted_for_members, source_total_counts, n_total_entities, pk
+            )
+
+        if top_pathway is None:
+            break
+
         retained_pathway_ids.append(top_pathway)

         # remove all members associated with the top pathway
-
-        unaccounted_for_members
-            unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
-        ]
-        .index.get_level_values(table_schema["pk"])
-        .tolist()
+        unaccounted_for_members = _update_unaccounted_for_members(
+            top_pathway, unaccounted_for_members
         )

-        unaccounted_for_members = unaccounted_for_members[
-            ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
-                members_captured
-            )
-        ]
-
     minimial_sources = deduplicated_sources[
         deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
     ].sort_index()
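`source_set_coverage` is a classic greedy set cover: repeatedly take the pathway that covers the most still-uncovered members (or, when `source_total_counts` is supplied, the most enriched pathway), then remove those members. A self-contained sketch of the size-based variant on toy data:

```python
import pandas as pd

# toy membership table: one row per (entity, pathway) assignment
members = pd.DataFrame({
    "entity":     ["e1", "e2", "e3", "e4", "e1", "e2"],
    "pathway_id": ["P1", "P1", "P1", "P2", "P3", "P3"],
})

retained, remaining = [], members
while not remaining.empty:
    # pathway covering the most uncovered entities
    top = remaining.value_counts("pathway_id").idxmax()
    retained.append(top)
    covered = remaining.loc[remaining["pathway_id"] == top, "entity"]
    remaining = remaining[~remaining["entity"].isin(covered)]

print(retained)  # ['P1', 'P2']; P1 covers e1-e3, then P2 covers e4
```

With `source_total_counts` the same loop instead ranks pathways by the approximate Fisher odds ratio in `_select_top_pathway_by_enrichment`, and breaks early once no pathway has at least `min_pw_size` uncovered members.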
@@ -270,9 +348,39 @@ def greedy_set_coverge_of_sources(
     return minimial_sources


-def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
+def get_source_total_counts(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, entity_type: str
+) -> pd.Series:
+    """
+    Get the total counts of each source.
+
+    Parameters
+    ----------
+    sbml_dfs: sbml_dfs_core.SBML_dfs
+        sbml_dfs object containing the table to get the total counts of
+    entity_type: str
+        the type of entity to get the total counts of
+
+    Returns
+    -------
+    source_total_counts: pd.Series
+        pd.Series containing the total counts of each source.
+    """
+
+    all_sources_table = unnest_sources(sbml_dfs.get_table(entity_type))
+    source_total_counts = all_sources_table.value_counts(SOURCE_SPEC.PATHWAY_ID).rename(
+        "total_counts"
+    )
+
+    return source_total_counts
+
+
+def _deduplicate_source_df(source_df: pd.DataFrame) -> pd.DataFrame:
     """Combine entries in a source table when multiple models have the same members."""

+    table_type = sbml_dfs_utils.infer_entity_type(source_df)
+    source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+
     # drop entries which are missing required attributes and throw an error if none are left
     REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
     indexed_sources = (
@@ -296,7 +404,11 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
             {
                 SOURCE_SPEC.PATHWAY_ID: p,
                 "membership_string": "_".join(
-                    set(
+                    set(
+                        indexed_sources.loc[[p]][
+                            source_table_schema[SCHEMA_DEFS.PK]
+                        ].tolist()
+                    )
                 ),
             }
             for p in pathways
@@ -320,16 +432,16 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:

     merged_sources = pd.concat(
         [
-            _collapse_by_membership_string(s, membership_categories,
+            _collapse_by_membership_string(s, membership_categories, source_table_schema)  # type: ignore
             for s in category_index.tolist()
         ]
     )
     merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
-
+        source_table_schema[SCHEMA_DEFS.PK]
     ).cumcount()

     return merged_sources.set_index(
-        [
+        [source_table_schema[SCHEMA_DEFS.PK], SOURCE_SPEC.INDEX_NAME]
     ).sort_index()


@@ -345,7 +457,10 @@ def _collapse_by_membership_string(
     return pd.DataFrame(
         [
             pd.concat(
-                [
+                [
+                    pd.Series({table_schema[SCHEMA_DEFS.PK]: ms}),
+                    collapsed_source_membership,
+                ]
             )
             for ms in membership_string.split("_")
         ]
@@ -398,3 +513,91 @@ def _safe_source_merge(member_Sources: Source | list) -> Source:
         return merge_sources(member_Sources.tolist())
     else:
         raise TypeError("Expecting source.Source or pd.Series")
+
+
+def _select_top_pathway_by_size(unaccounted_for_members: pd.DataFrame) -> str:
+
+    pathway_members = unaccounted_for_members.value_counts(SOURCE_SPEC.PATHWAY_ID)
+    top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+
+    return top_pathway
+
+
+def _select_top_pathway_by_enrichment(
+    unaccounted_for_members: pd.DataFrame,
+    source_total_counts: pd.Series,
+    n_total_entities: int,
+    table_pk: str,
+    min_pw_size: int = 5,
+) -> str:
+
+    n_observed_entities = len(
+        unaccounted_for_members.index.get_level_values(table_pk).unique()
+    )
+    pathway_members = unaccounted_for_members.value_counts(
+        SOURCE_SPEC.PATHWAY_ID
+    ).rename("observed_members")
+
+    pathway_members = pathway_members.loc[pathway_members >= min_pw_size]
+    if pathway_members.shape[0] == 0:
+        return None
+
+    wide_contingency_table = (
+        pathway_members.to_frame()
+        .join(source_total_counts)
+        .assign(
+            missing_members=lambda x: x["total_counts"] - x["observed_members"],
+            observed_nonmembers=lambda x: n_observed_entities - x["observed_members"],
+            nonobserved_nonmembers=lambda x: n_total_entities
+            - x["observed_nonmembers"]
+            - x["missing_members"]
+            - x["observed_members"],
+        )
+        .drop(columns=["total_counts"])
+    )
+
+    # calculate enrichments using a fast vectorized normal approximation
+    odds_ratios, _ = hypothesis_testing.fisher_exact_vectorized(
+        wide_contingency_table["observed_members"],
+        wide_contingency_table["missing_members"],
+        wide_contingency_table["observed_nonmembers"],
+        wide_contingency_table["nonobserved_nonmembers"],
+    )
+
+    return pathway_members.index[np.argmax(odds_ratios)]
+
+
+def _update_unaccounted_for_members(
+    top_pathway, unaccounted_for_members
+) -> pd.DataFrame:
+    """
+    Update the unaccounted for members dataframe by removing the members
+    associated with the top pathway.
+
+    Parameters
+    ----------
+    top_pathway: str
+        the pathway to remove from the unaccounted for members
+    unaccounted_for_members: pd.DataFrame
+        the dataframe of unaccounted for members
+
+    Returns
+    -------
+    unaccounted_for_members: pd.DataFrame
+        the dataframe of unaccounted for members with the top pathway removed
+    """
+
+    table_type = sbml_dfs_utils.infer_entity_type(unaccounted_for_members)
+    pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+    members_captured = (
+        unaccounted_for_members[
+            unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
+        ]
+        .index.get_level_values(pk)
+        .tolist()
+    )

+    return unaccounted_for_members[
+        ~unaccounted_for_members.index.get_level_values(pk).isin(members_captured)
+    ]
napistu/statistics/hypothesis_testing.py
ADDED
@@ -0,0 +1,66 @@
+from typing import Union
+
+import numpy as np
+from scipy.stats import norm
+
+
+def fisher_exact_vectorized(
+    observed_members: Union[list[int], np.ndarray],
+    missing_members: Union[list[int], np.ndarray],
+    observed_nonmembers: Union[list[int], np.ndarray],
+    nonobserved_nonmembers: Union[list[int], np.ndarray],
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Fast vectorized one-tailed Fisher exact test using normal approximation.
+
+    Parameters:
+    -----------
+    observed_members, missing_members, observed_nonmembers, nonobserved_nonmembers : array-like
+        The four cells of the 2x2 contingency tables (must be non-negative)
+
+    Returns:
+    --------
+    odds_ratios : numpy array
+        Odds ratios for each test
+    p_values : numpy array
+        One-tailed p-values (tests for enrichment)
+    """
+    # Convert to numpy arrays
+    a = np.array(observed_members, dtype=float)
+    b = np.array(missing_members, dtype=float)
+    c = np.array(observed_nonmembers, dtype=float)
+    d = np.array(nonobserved_nonmembers, dtype=float)
+
+    # Check for negative values and raise error
+    if np.any((a < 0) | (b < 0) | (c < 0) | (d < 0)):
+        raise ValueError("All contingency table values must be non-negative")
+
+    # Calculate odds ratios
+    odds_ratios = np.divide(
+        a * d, b * c, out=np.full_like(a, np.inf, dtype=float), where=(b * c) != 0
+    )
+
+    # Normal approximation to hypergeometric distribution
+    n = a + b + c + d
+
+    # Avoid division by zero in expected value calculation
+    expected_a = np.divide(
+        (a + b) * (a + c), n, out=np.zeros_like(n, dtype=float), where=n != 0
+    )
+
+    # Variance calculation with protection against division by zero
+    var_a = np.divide(
+        (a + b) * (c + d) * (a + c) * (b + d),
+        n * n * (n - 1),
+        out=np.ones_like(n, dtype=float),  # Default to 1 to avoid sqrt(0)
+        where=(n > 1),
+    )
+    var_a = np.maximum(var_a, 1e-10)  # Ensure positive variance
+
+    # Continuity correction and z-score
+    z = (a - expected_a - 0.5) / np.sqrt(var_a)
+
+    # One-tailed p-value (upper tail for enrichment)
+    p_values = norm.sf(z)  # 1 - norm.cdf(z)
+
+    return odds_ratios, p_values
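A quick sanity check of the new helper: for a 2x2 table with observed_members=5, missing_members=15, observed_nonmembers=5 and nonobserved_nonmembers=75, the odds ratio is (5*75)/(15*5) = 5, and the one-tailed p-value should track SciPy's exact test:

```python
from scipy.stats import fisher_exact

from napistu.statistics.hypothesis_testing import fisher_exact_vectorized

# one 2x2 table per array position: [[a, b], [c, d]]
odds_ratios, p_values = fisher_exact_vectorized([5], [15], [5], [75])
print(odds_ratios)  # [5.], i.e. (5 * 75) / (15 * 5)

# compare against the exact one-sided test for the same table
exact_or, exact_p = fisher_exact([[5, 15], [5, 75]], alternative="greater")
print(exact_or, exact_p)  # odds ratio matches; the p-value agrees approximately
```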
|