napistu 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/consensus.py +3 -4
- napistu/ingestion/constants.py +51 -0
- napistu/ingestion/reactom_fi.py +208 -0
- napistu/network/neighborhoods.py +28 -7
- napistu/network/ng_utils.py +26 -6
- napistu/network/precompute.py +56 -0
- napistu/sbml_dfs_utils.py +8 -2
- napistu/source.py +243 -40
- napistu/statistics/hypothesis_testing.py +66 -0
- napistu/utils.py +23 -1
- {napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/METADATA +1 -1
- {napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/RECORD +20 -18
- tests/test_network_precompute.py +30 -0
- tests/test_sbml_dfs_utils.py +13 -0
- tests/test_source.py +38 -6
- tests/test_statistics_hypothesis_testing.py +62 -0
- tests/test_set_coverage.py +0 -50
- {napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/WHEEL +0 -0
- {napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/entry_points.txt +0 -0
- {napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/top_level.txt +0 -0
napistu/source.py
CHANGED
```diff
@@ -1,8 +1,14 @@
 from __future__ import annotations
 
+import numpy as np
 import pandas as pd
+from typing import Optional
+
 from napistu import indices
-from napistu
+from napistu import sbml_dfs_core
+from napistu import sbml_dfs_utils
+from napistu.statistics import hypothesis_testing
+from napistu.constants import SBML_DFS_SCHEMA, SCHEMA_DEFS, SOURCE_SPEC
 
 
 class Source:
@@ -41,11 +47,18 @@ class Source:
         Creates an empty source object. This is typically used when creating an SBML_dfs
         object from a single source.
         pw_index : indices.PWIndex
+            a pathway index object containing the pathway_id and other metadata
 
         Returns
         -------
         None.
 
+        Raises
+        ------
+        ValueError:
+            if pw_index is not a indices.PWIndex
+        ValueError:
+            if SOURCE_SPEC.MODEL is not present in source_df
         """
 
         if init is True:
@@ -101,8 +114,27 @@ def create_source_table(
     """
     Create Source Table
 
-    Create a table with one row per "new_id" and a Source object created from the
-
+    Create a table with one row per "new_id" and a Source object created from the union of "old_id" Source objects
+
+    Parameters
+    ----------
+    lookup_table: pd.Series
+        a pd.Series containing the index of the table to create a source table for
+    table_schema: dict
+        a dictionary containing the schema of the table to create a source table for
+    pw_index: indices.PWIndex
+        a pathway index object containing the pathway_id and other metadata
+
+    Returns
+    -------
+    source_table: pd.DataFrame
+        a pd.DataFrame containing the index of the table to create a source table for
+        with one row per "new_id" and a Source object created from the union of "old_id" Source objects
+
+    Raises
+    ------
+    ValueError:
+        if SOURCE_SPEC.SOURCE is not present in table_schema
     """
 
     if SOURCE_SPEC.SOURCE not in table_schema.keys():
@@ -142,8 +174,27 @@ def merge_sources(source_list: list | pd.Series) -> Source:
 
     Merge a list of Source objects into a single Source object
 
+    Parameters
+    ----------
+    source_list: list | pd.Series
+        a list of Source objects or a pd.Series of Source objects
+
+    Returns
+    -------
+    source: Source
+        a Source object created from the union of the Source objects in source_list
+
+    Raises
+    ------
+    TypeError:
+        if source_list is not a list or pd.Series
     """
 
+    if not isinstance(source_list, (list, pd.Series)):
+        raise TypeError(
+            f"source_list must be a list or pd.Series, but was a {type(source_list).__name__}"
+        )
+
     # filter to non-empty sources
     # empty sources have only been initialized; a merge hasn't occured
     existing_sources = [s.source is not None for s in source_list]
@@ -160,28 +211,35 @@ def merge_sources(source_list: list | pd.Series) -> Source:
     return Source(pd.concat(existing_source_list))
 
 
-def unnest_sources(
-    source_table: pd.DataFrame, source_var: str, verbose: bool = False
-) -> pd.DataFrame:
+def unnest_sources(source_table: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
     """
     Unnest Sources
 
     Take a pd.DataFrame containing an array of Sources and
     return one-row per source.
 
-    Parameters
+    Parameters
+    ----------
     source_table: pd.DataFrame
         a table containing an array of Sources
-
-
+    verbose: bool
+        print progress
 
-    Returns
+    Returns
+    -------
     pd.Dataframe containing the index of source_table but expanded
     to include one row per source
 
     """
 
     sources = list()
+
+    table_type = sbml_dfs_utils.infer_entity_type(source_table)
+    source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+    if SCHEMA_DEFS.SOURCE not in source_table_schema.keys():
+        raise ValueError(f"{table_type} does not have a source attribute")
+
+    source_var = source_table_schema[SCHEMA_DEFS.SOURCE]
     source_table_index = source_table.index.to_frame().reset_index(drop=True)
 
     for i in range(source_table.shape[0]):
@@ -216,53 +274,73 @@ def unnest_sources(
     return pd.concat(sources)
 
 
-def greedy_set_coverge_of_sources(
-
+def source_set_coverage(
+    select_sources_df: pd.DataFrame,
+    source_total_counts: Optional[pd.Series] = None,
+    sbml_dfs: Optional[sbml_dfs_core.SBML_dfs] = None,
 ) -> pd.DataFrame:
     """
     Greedy Set Coverage of Sources
 
-
-
+    Find the set of pathways covering `select_sources_df`. If `all_sources_df`
+    is provided pathways will be selected iteratively based on statistical
+    enrichment. If `all_sources_df` is not provided, the largest pathways
+    will be chosen iteratively.
 
-    Parameters
-
+    Parameters
+    ----------
+    select_sources_df: pd.DataFrame
         pd.Dataframe containing the index of source_table but expanded to
         include one row per source. As produced by source.unnest_sources()
-
-
+    source_total_counts: pd.Series
+        pd.Series containing the total counts of each source. As produced by
+        source.get_source_total_counts()
+    sbml_dfs: sbml_dfs_core.SBML_dfs
+        if `source_total_counts` is provided then `sbml_dfs` must be provided
+        to calculate the total number of entities in the table.
+
+    Returns
+    -------
     minimial_sources: [str]
         A list of pathway_ids of the minimal source set
 
     """
 
+    table_type = sbml_dfs_utils.infer_entity_type(select_sources_df)
+    pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+    if source_total_counts is not None:
+        if sbml_dfs is None:
+            raise ValueError(
+                "If `source_total_counts` is provided, `sbml_dfs` must be provided to calculate the total number of entities in the table."
+            )
+        n_total_entities = sbml_dfs.get_table(table_type).shape[0]
+
     # rollup pathways with identical membership
-    deduplicated_sources = _deduplicate_source_df(
+    deduplicated_sources = _deduplicate_source_df(select_sources_df)
 
     unaccounted_for_members = deduplicated_sources
     retained_pathway_ids = []
-
     while unaccounted_for_members.shape[0] != 0:
         # find the pathway with the most members
-
-
+
+        if source_total_counts is None:
+            top_pathway = _select_top_pathway_by_size(unaccounted_for_members)
+        else:
+            top_pathway = _select_top_pathway_by_enrichment(
+                unaccounted_for_members, source_total_counts, n_total_entities, pk
+            )
+
+        if top_pathway is None:
+            break
+
         retained_pathway_ids.append(top_pathway)
 
         # remove all members associated with the top pathway
-        members_captured = (
-            unaccounted_for_members[
-                unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
-            ]
-            .index.get_level_values(table_schema["pk"])
-            .tolist()
+        unaccounted_for_members = _update_unaccounted_for_members(
+            top_pathway, unaccounted_for_members
         )
 
-        unaccounted_for_members = unaccounted_for_members[
-            ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
-                members_captured
-            )
-        ]
-
     minimial_sources = deduplicated_sources[
         deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
     ].sort_index()
@@ -270,9 +348,39 @@ def greedy_set_coverge_of_sources(
     return minimial_sources
 
 
-def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
+def get_source_total_counts(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, entity_type: str
+) -> pd.Series:
+    """
+    Get the total counts of each source.
+
+    Parameters
+    ----------
+    sbml_dfs: sbml_dfs_core.SBML_dfs
+        sbml_dfs object containing the table to get the total counts of
+    entity_type: str
+        the type of entity to get the total counts of
+
+    Returns
+    -------
+    source_total_counts: pd.Series
+        pd.Series containing the total counts of each source.
+    """
+
+    all_sources_table = unnest_sources(sbml_dfs.get_table(entity_type))
+    source_total_counts = all_sources_table.value_counts(SOURCE_SPEC.PATHWAY_ID).rename(
+        "total_counts"
+    )
+
+    return source_total_counts
+
+
+def _deduplicate_source_df(source_df: pd.DataFrame) -> pd.DataFrame:
     """Combine entries in a source table when multiple models have the same members."""
 
+    table_type = sbml_dfs_utils.infer_entity_type(source_df)
+    source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+
     # drop entries which are missing required attributes and throw an error if none are left
     REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
     indexed_sources = (
@@ -296,7 +404,11 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
             {
                 SOURCE_SPEC.PATHWAY_ID: p,
                 "membership_string": "_".join(
-                    set(
+                    set(
+                        indexed_sources.loc[[p]][
+                            source_table_schema[SCHEMA_DEFS.PK]
+                        ].tolist()
+                    )
                 ),
             }
             for p in pathways
@@ -320,16 +432,16 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
 
     merged_sources = pd.concat(
         [
-            _collapse_by_membership_string(s, membership_categories,
+            _collapse_by_membership_string(s, membership_categories, source_table_schema)  # type: ignore
            for s in category_index.tolist()
         ]
     )
     merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
-        table_schema["pk"]
+        source_table_schema[SCHEMA_DEFS.PK]
     ).cumcount()
 
     return merged_sources.set_index(
-        [
+        [source_table_schema[SCHEMA_DEFS.PK], SOURCE_SPEC.INDEX_NAME]
     ).sort_index()
 
 
@@ -345,7 +457,10 @@ def _collapse_by_membership_string(
     return pd.DataFrame(
         [
             pd.concat(
-                [
+                [
+                    pd.Series({table_schema[SCHEMA_DEFS.PK]: ms}),
+                    collapsed_source_membership,
+                ]
             )
            for ms in membership_string.split("_")
         ]
@@ -398,3 +513,91 @@ def _safe_source_merge(member_Sources: Source | list) -> Source:
         return merge_sources(member_Sources.tolist())
     else:
         raise TypeError("Expecting source.Source or pd.Series")
+
+
+def _select_top_pathway_by_size(unaccounted_for_members: pd.DataFrame) -> str:
+
+    pathway_members = unaccounted_for_members.value_counts(SOURCE_SPEC.PATHWAY_ID)
+    top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+
+    return top_pathway
+
+
+def _select_top_pathway_by_enrichment(
+    unaccounted_for_members: pd.DataFrame,
+    source_total_counts: pd.Series,
+    n_total_entities: int,
+    table_pk: str,
+    min_pw_size: int = 5,
+) -> str:
+
+    n_observed_entities = len(
+        unaccounted_for_members.index.get_level_values(table_pk).unique()
+    )
+    pathway_members = unaccounted_for_members.value_counts(
+        SOURCE_SPEC.PATHWAY_ID
+    ).rename("observed_members")
+
+    pathway_members = pathway_members.loc[pathway_members >= min_pw_size]
+    if pathway_members.shape[0] == 0:
+        return None
+
+    wide_contingency_table = (
+        pathway_members.to_frame()
+        .join(source_total_counts)
+        .assign(
+            missing_members=lambda x: x["total_counts"] - x["observed_members"],
+            observed_nonmembers=lambda x: n_observed_entities - x["observed_members"],
+            nonobserved_nonmembers=lambda x: n_total_entities
+            - x["observed_nonmembers"]
+            - x["missing_members"]
+            - x["observed_members"],
+        )
+        .drop(columns=["total_counts"])
+    )
+
+    # calculate enrichments using a fast vectorized normal approximation
+    odds_ratios, _ = hypothesis_testing.fisher_exact_vectorized(
+        wide_contingency_table["observed_members"],
+        wide_contingency_table["missing_members"],
+        wide_contingency_table["observed_nonmembers"],
+        wide_contingency_table["nonobserved_nonmembers"],
+    )
+
+    return pathway_members.index[np.argmax(odds_ratios)]
+
+
+def _update_unaccounted_for_members(
+    top_pathway, unaccounted_for_members
+) -> pd.DataFrame:
+    """
+    Update the unaccounted for members dataframe by removing the members
+    associated with the top pathway.
+
+    Parameters
+    ----------
+    top_pathway: str
+        the pathway to remove from the unaccounted for members
+    unaccounted_for_members: pd.DataFrame
+        the dataframe of unaccounted for members
+
+    Returns
+    -------
+    unaccounted_for_members: pd.DataFrame
+        the dataframe of unaccounted for members with the top pathway removed
+    """
+
+    table_type = sbml_dfs_utils.infer_entity_type(unaccounted_for_members)
+    pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+    members_captured = (
+        unaccounted_for_members[
+            unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
+        ]
+        .index.get_level_values(pk)
+        .tolist()
+    )
+
+    return unaccounted_for_members[
+        ~unaccounted_for_members.index.get_level_values(pk).isin(members_captured)
+    ]
```
napistu/statistics/hypothesis_testing.py
ADDED
```diff
@@ -0,0 +1,66 @@
+from typing import Union
+
+import numpy as np
+from scipy.stats import norm
+
+
+def fisher_exact_vectorized(
+    observed_members: Union[list[int], np.ndarray],
+    missing_members: Union[list[int], np.ndarray],
+    observed_nonmembers: Union[list[int], np.ndarray],
+    nonobserved_nonmembers: Union[list[int], np.ndarray],
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Fast vectorized one-tailed Fisher exact test using normal approximation.
+
+    Parameters:
+    -----------
+    observed_members, missing_members, observed_nonmembers, nonobserved_nonmembers : array-like
+        The four cells of the 2x2 contingency tables (must be non-negative)
+
+    Returns:
+    --------
+    odds_ratios : numpy array
+        Odds ratios for each test
+    p_values : numpy array
+        One-tailed p-values (tests for enrichment)
+    """
+    # Convert to numpy arrays
+    a = np.array(observed_members, dtype=float)
+    b = np.array(missing_members, dtype=float)
+    c = np.array(observed_nonmembers, dtype=float)
+    d = np.array(nonobserved_nonmembers, dtype=float)
+
+    # Check for negative values and raise error
+    if np.any((a < 0) | (b < 0) | (c < 0) | (d < 0)):
+        raise ValueError("All contingency table values must be non-negative")
+
+    # Calculate odds ratios
+    odds_ratios = np.divide(
+        a * d, b * c, out=np.full_like(a, np.inf, dtype=float), where=(b * c) != 0
+    )
+
+    # Normal approximation to hypergeometric distribution
+    n = a + b + c + d
+
+    # Avoid division by zero in expected value calculation
+    expected_a = np.divide(
+        (a + b) * (a + c), n, out=np.zeros_like(n, dtype=float), where=n != 0
+    )
+
+    # Variance calculation with protection against division by zero
+    var_a = np.divide(
+        (a + b) * (c + d) * (a + c) * (b + d),
+        n * n * (n - 1),
+        out=np.ones_like(n, dtype=float),  # Default to 1 to avoid sqrt(0)
+        where=(n > 1),
+    )
+    var_a = np.maximum(var_a, 1e-10)  # Ensure positive variance
+
+    # Continuity correction and z-score
+    z = (a - expected_a - 0.5) / np.sqrt(var_a)
+
+    # One-tailed p-value (upper tail for enrichment)
+    p_values = norm.sf(z)  # 1 - norm.cdf(z)
+
+    return odds_ratios, p_values
```
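The helper trades exactness for speed: rather than running an exact Fisher test per table, it computes $z = \frac{a - E[a] - 0.5}{\sqrt{\operatorname{Var}(a)}}$ with $E[a] = \frac{(a+b)(a+c)}{n}$ and the hypergeometric variance $\operatorname{Var}(a) = \frac{(a+b)(c+d)(a+c)(b+d)}{n^2(n-1)}$, then takes the upper-tail normal p-value. A small self-contained usage sketch (the counts are invented for illustration):

```python
from napistu.statistics import hypothesis_testing

# Two 2x2 tables, one per candidate pathway; the four arguments are the
# cells [a, b, c, d] = [observed members, missing members,
# observed non-members, non-observed non-members].
odds_ratios, p_values = hypothesis_testing.fisher_exact_vectorized(
    observed_members=[10, 3],
    missing_members=[20, 50],
    observed_nonmembers=[40, 45],
    nonobserved_nonmembers=[930, 900],
)
print(odds_ratios)  # array([11.625, 1.2]) -- (a*d)/(b*c) per table
print(p_values)     # one-tailed (enrichment) p-values from the normal approximation
```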
napistu/utils.py
CHANGED
```diff
@@ -14,7 +14,7 @@ import zipfile
 from contextlib import closing
 from itertools import starmap
 from textwrap import fill
-from typing import Any,
+from typing import Any, Dict, Optional, List, Union
 from urllib.parse import urlparse
 from pathlib import Path
 from requests.adapters import HTTPAdapter
@@ -1131,6 +1131,28 @@ def safe_fill(x: str, fill_width: int = 15) -> str:
     return fill(x, fill_width)
 
 
+def match_regex_dict(s: str, regex_dict: Dict[str, any]) -> Optional[any]:
+    """
+    Apply each regex in regex_dict to the string s. If a regex matches, return its value.
+    If no regex matches, return None.
+
+    Parameters
+    ----------
+    s : str
+        The string to test.
+    regex_dict : dict
+        Dictionary where keys are regex patterns (str), and values are the values to return.
+
+    Returns
+    -------
+    The value associated with the first matching regex, or None if no match.
+    """
+    for pattern, value in regex_dict.items():
+        if re.search(pattern, s):
+            return value
+    return None
+
+
 def _add_nameness_score_wrapper(df, name_var, table_schema):
     """Call _add_nameness_score with default value."""
 
```
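A short usage sketch for the new `utils.match_regex_dict` (the patterns and return values here are invented): patterns are tried in dict insertion order and the first `re.search` hit wins.

```python
from napistu import utils

# Hypothetical mapping from filename patterns to format labels.
handlers = {
    r"\.sbml$": "sbml",
    r"\.tsv$": "table",
}

utils.match_regex_dict("model.sbml", handlers)  # -> "sbml"
utils.match_regex_dict("readme.md", handlers)   # -> None (no pattern matches)
```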
{napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/RECORD
CHANGED
```diff
@@ -1,13 +1,13 @@
 napistu/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
 napistu/__main__.py,sha256=xwlbh_0Ig3a-yG6BIJRiDPSN9R2HnX2pEBvlodlO6h4,29015
-napistu/consensus.py,sha256=
+napistu/consensus.py,sha256=SDw58vkDivzy5AiOQUnf5vUbFxmSrMGMMmptDMZhk0E,69807
 napistu/constants.py,sha256=8sp1l0cxu2rsnCrWBEEwhcBKvDtc4u0D0f_72zILLW0,13427
 napistu/identifiers.py,sha256=e2-nTVzr5AINa0y1ER9218bKXyF2kAeJ9At22S4Z00o,33914
 napistu/indices.py,sha256=Zjg3gE0JQ3T879lCPazYg-WXVE6hvcAr713ZKpJ32rk,9830
 napistu/sbml_dfs_core.py,sha256=s0OyoHs-AjOcbZu1d3KNkW_PI7Rxbhu5ZLpfQeO4iY8,72639
-napistu/sbml_dfs_utils.py,sha256=
-napistu/source.py,sha256=
-napistu/utils.py,sha256=
+napistu/sbml_dfs_utils.py,sha256=SOy1Ii2hDFOfQa7pFAJS9EfAmfBVD_sHvDJBVmCN_p8,46456
+napistu/source.py,sha256=iDDKpN-4k_W_tyxEjqe_z-yPJv7uoFRRBhkiBtOH5C8,20416
+napistu/utils.py,sha256=p2sJxTklmV30XS6hanJRjcdfgeaZpkULuMyQX3BPP0c,36404
 napistu/context/__init__.py,sha256=LQBEqipcHKK0E5UlDEg1ct-ymCs93IlUrUaH8BCevf0,242
 napistu/context/discretize.py,sha256=Qq7zg46F_I-PvQIT2_pEDQV7YEtUQCxKoRvT5Gu9QsE,15052
 napistu/context/filtering.py,sha256=l1oq-43ysSGqU9VmhTOO_pYT4DSMf20yxvktPC1MI0I,13696
@@ -17,13 +17,14 @@ napistu/gcs/downloads.py,sha256=SvGv9WYr_Vt3guzyz1QiAuBndeKPTBtWSFLj1-QbLf4,6348
 napistu/gcs/utils.py,sha256=eLSsvewWJdCguyj2k0ozUGP5BTemaE1PZg41Z3aY5kM,571
 napistu/ingestion/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
 napistu/ingestion/bigg.py,sha256=f65--8ARe248eYCUJpFMF284Wz53sLyFyBuwelxHmJA,4340
-napistu/ingestion/constants.py,sha256=
+napistu/ingestion/constants.py,sha256=jo3v8Z7Y_tNNhTmEcokVOh1HBJFAXc-Z38S4mG58qfo,10059
 napistu/ingestion/gtex.py,sha256=X0hSC1yrpf4xSJWFhpeNcnHwJzKDII2MvjfUqYA0JN8,3720
 napistu/ingestion/hpa.py,sha256=R27ExrryKQ4Crxv9ATXmBJCa-yd01TMOrDjkeBhIQac,5054
 napistu/ingestion/identifiers_etl.py,sha256=6ppDUA6lEZurdmVbiFLOUzphYbr-hndMhtqsQnq_yAc,5009
 napistu/ingestion/napistu_edgelist.py,sha256=4RLXsoIk_-Atu-Nqme_t1JpEpBET26VIY2Y_Hcd3sMw,3580
 napistu/ingestion/obo.py,sha256=AQkIPWbjA464Lma0tx91JucWkIwLjC7Jgv5VHGRTDkE,9601
 napistu/ingestion/psi_mi.py,sha256=5eJjm7XWogL9oTyGqR52kntHClLwLsTePKqCvUGyi-w,10111
+napistu/ingestion/reactom_fi.py,sha256=hKdOY2wNtcNk6WlnHnNalryiXv6mtcWUiBW9isXPB0Y,6991
 napistu/ingestion/reactome.py,sha256=Hn9X-vDp4o_HK-OtaQvel3vJeZ8_TC1-4N2rruK9Oks,7099
 napistu/ingestion/sbml.py,sha256=l8Z98yWuOIRGns8G4UNnoQz7v_xmukZb_IZ_5ye34Ko,25296
 napistu/ingestion/string.py,sha256=go1WGTkoLJejX7GQWf9bFeInFGAw4jNSpS2B_Zr5f_s,11364
@@ -61,14 +62,14 @@ napistu/network/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,2
 napistu/network/constants.py,sha256=nG_lUZYLgop8oxOGjDYqvxXJzVdOwKZ3aWnxlhtSaIo,6915
 napistu/network/data_handling.py,sha256=KncrAKjXI3169BgVE-SnY8FkpVF60JnUwfMHtbqvsTc,14725
 napistu/network/ig_utils.py,sha256=MuyEyOVtSHndil6QuuRCimBZrJ2jTaF5qQESgYlu02M,17042
-napistu/network/neighborhoods.py,sha256=
+napistu/network/neighborhoods.py,sha256=kXoD5d3plcTEw-6XCbb5QjaCt0jsKwn17VdAvnGoFhY,57041
 napistu/network/net_create.py,sha256=66kV_xoWnu4BVLaJZ1TAC7wBSsjPDqjoAXH-X9ShV3s,59091
 napistu/network/net_create_utils.py,sha256=zajwaz2xAij_9fEnD77SgBw_EnNAnJ8jBCmmK2rk_bA,24672
 napistu/network/net_propagation.py,sha256=Il5nDOWh3nLz8gRhDFHGp2LxcvJ9C1twiSZjDeiZMUo,23490
 napistu/network/ng_core.py,sha256=dGnTUKR4WtnvaYMyIHqqF55FY4mJSa7wjA2LZ4cVB6U,11720
-napistu/network/ng_utils.py,sha256=
+napistu/network/ng_utils.py,sha256=ahSm-8M2pV662V7MMVcGaoguBM55_y-F7LDmZSVp9ag,15951
 napistu/network/paths.py,sha256=r6LVKVvX7i3ctBA5r-xvHfpH5Zsd0VDHUCtin2iag20,17453
-napistu/network/precompute.py,sha256=
+napistu/network/precompute.py,sha256=ARU2tktWnxFISaHAY8chpkg8pusZPv7TT5jSIB9eFF0,10081
 napistu/ontologies/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
 napistu/ontologies/constants.py,sha256=GyOFvezSxDK1VigATcruTKtNhjcYaid1ggulEf_HEtQ,4345
 napistu/ontologies/dogma.py,sha256=VVj6NKBgNym4SdOSu8g22OohALj7cbObhIJmdY2Sfy0,8860
@@ -84,8 +85,9 @@ napistu/scverse/__init__.py,sha256=Lgxr3iMQAkTzXE9BNz93CndNP5djzerLvmHM-D0PU3I,3
 napistu/scverse/constants.py,sha256=0iAkhyJUIeFGHdLLU3fCaEU1O3Oix4qAsxr3CxGTjVs,653
 napistu/scverse/loading.py,sha256=jqiE71XB-wdV50GyZrauFNY0Lai4bX9Fm2Gv80VR8t8,27016
 napistu/statistics/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
+napistu/statistics/hypothesis_testing.py,sha256=k0mBFAMF0XHVcKwS26aPnEbq_FIUVwXU1gZ6cKfFbCk,2190
 napistu/statistics/quantiles.py,sha256=1-LnmVzC2CQWxCKUh0yi6YfKrbsZM1-kkD7nu2-aS5s,3042
-napistu-0.4.
+napistu-0.4.4.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/conftest.py,sha256=t-GHb0MvSsC-MyhkFpOy2K3t5fi7eaig_Rc2xEQC-t8,9678
 tests/test_consensus.py,sha256=Hzfrgp4SpkRDnEMVMD3f0UInSycndB8kKzC4wDDvRas,15076
@@ -114,7 +116,7 @@ tests/test_network_net_propagation.py,sha256=kZeDHD93iMrLVvxO4OyfRH5_vgsYeQyC40O
 tests/test_network_ng_core.py,sha256=w-iNBTtenennJhaLFauk952pEsk7W0-Fa8lPvIRqHyY,628
 tests/test_network_ng_utils.py,sha256=QVVuRnvCRfTSIlGdwQTIF9lr0wOwoc5gGeXAUY_AdgE,713
 tests/test_network_paths.py,sha256=TWZnxY5bF3m6gahcxcYJGrBIawh2-_vUcec1LyPmXV8,1686
-tests/test_network_precompute.py,sha256=
+tests/test_network_precompute.py,sha256=IPr1KhtxBD0fXx_2TvZqnevrD-Iig35otb8yloRFpRc,10014
 tests/test_ontologies_genodexito.py,sha256=6fINyUiubHZqu7qxye09DQfJXw28ZMAJc3clPb-cCoY,2298
 tests/test_ontologies_id_tables.py,sha256=CpwpbmQvTc1BaVd6jbDKHAVE2etwN0vx93nC8jpnMlE,7265
 tests/test_ontologies_mygene.py,sha256=VkdRcKIWmcG6V-2dpfvsBiOJN5dO-j0RqZNxtJRcyBU,1583
@@ -124,18 +126,18 @@ tests/test_rpy2_callr.py,sha256=V4a-QH5krgYOQRgqzksMzIkGAFjBqKOAqgprxrH6bE0,2904
 tests/test_rpy2_init.py,sha256=T3gnxC1O7XNvYM2P4018ikpPPAy-kwQLm7Erj0RfA-4,5895
 tests/test_sbml.py,sha256=f25zj1NogYrmLluvBDboLameTuCiQ309433Qn3iPvhg,1483
 tests/test_sbml_dfs_core.py,sha256=nnLPpZTVtCznOBohk7CX67x6sMqktJWt-sZMWQKoaDs,26521
-tests/test_sbml_dfs_utils.py,sha256=
+tests/test_sbml_dfs_utils.py,sha256=ZD9x2B81fsfYEjAV9wphHOR7ywjNcfvfw1LGNv4PxUA,11471
 tests/test_sbo.py,sha256=x_PENFaXYsrZIzOZu9cj_Wrej7i7SNGxgBYYvcigLs0,308
 tests/test_scverse_loading.py,sha256=bnU1lQSYYWhOAs0IIBoi4ZohqPokDQJ0n_rtkAfEyMU,29948
-tests/
-tests/
+tests/test_source.py,sha256=iV-Yyu8flhIGWF17SCL8msG2bjqwb9w2IZ694b0iZ-o,2985
+tests/test_statistics_hypothesis_testing.py,sha256=qD-oS9zo5JlH-jdtiOrWAKI4nKFuZvvh6361_pFSpIs,2259
 tests/test_statistics_quantiles.py,sha256=yNDeqwgbP-1Rx3C_dLX_wnwT_Lr-iJWClmeKmElqmTE,4984
 tests/test_uncompartmentalize.py,sha256=nAk5kfAVLU9a2VWe2x2HYVcKqj-EnwmwddERIPRax8c,1289
 tests/test_utils.py,sha256=qPSpV-Q9b6vmdycgaDmQqtcvzKnAVnN9j5xJ9x-T6bg,23959
 tests/utils.py,sha256=SoWQ_5roJteFGcMaOeEiQ5ucwq3Z2Fa3AAs9iXHTsJY,749
 tests/test_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-napistu-0.4.
-napistu-0.4.
-napistu-0.4.
-napistu-0.4.
-napistu-0.4.
+napistu-0.4.4.dist-info/METADATA,sha256=E15A5Ve2RZTn4HtXGD2rDO1Q7AEaTfSdo3fgLuwravE,4078
+napistu-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+napistu-0.4.4.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
+napistu-0.4.4.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
+napistu-0.4.4.dist-info/RECORD,,
```
tests/test_network_precompute.py
CHANGED
```diff
@@ -276,3 +276,33 @@ def test_precomputed_distances_serialization():
     # Clean up the temporary file
     if os.path.exists(temp_path):
         os.remove(temp_path)
+
+
+def test_filter_precomputed_distances_top_n_subset():
+    # Use a small top_n for a quick test
+    top_n = 5
+    filtered = precompute.filter_precomputed_distances_top_n(
+        precomputed_distances, top_n=top_n
+    )
+    # Check that the filtered DataFrame is a subset of the original
+    merged = filtered.merge(
+        precomputed_distances,
+        on=[
+            precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
+            precompute.NAPISTU_EDGELIST.SC_ID_DEST,
+        ],
+        how="left",
+        indicator=True,
+    )
+    assert (
+        merged["_merge"] == "both"
+    ).all(), "Filtered rows must be present in the original DataFrame"
+    # Check that columns are preserved
+    assert set(
+        [
+            precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
+            precompute.NAPISTU_EDGELIST.SC_ID_DEST,
+        ]
+    ).issubset(filtered.columns)
+    # Optionally, check that the number of rows is less than or equal to the input
+    assert filtered.shape[0] <= precomputed_distances.shape[0]
```
tests/test_sbml_dfs_utils.py
CHANGED
```diff
@@ -334,3 +334,16 @@ def test_infer_entity_type_errors():
     )  # Two primary keys
     with pytest.raises(ValueError):
         sbml_dfs_utils.infer_entity_type(df)
+
+
+def test_infer_entity_type_multindex_reactions():
+    # DataFrame with MultiIndex (r_id, foo), should infer as reactions
+    import pandas as pd
+    from napistu.constants import SBML_DFS
+
+    df = pd.DataFrame({"some_col": [1, 2]})
+    df.index = pd.MultiIndex.from_tuples(
+        [("rxn1", "a"), ("rxn2", "b")], names=[SBML_DFS.R_ID, "foo"]
+    )
+    result = sbml_dfs_utils.infer_entity_type(df)
+    assert result == SBML_DFS.REACTIONS
```