napistu 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/source.py CHANGED
@@ -1,8 +1,14 @@
  from __future__ import annotations

+ import numpy as np
  import pandas as pd
+ from typing import Optional
+
  from napistu import indices
- from napistu.constants import SOURCE_SPEC
+ from napistu import sbml_dfs_core
+ from napistu import sbml_dfs_utils
+ from napistu.statistics import hypothesis_testing
+ from napistu.constants import SBML_DFS_SCHEMA, SCHEMA_DEFS, SOURCE_SPEC


  class Source:
@@ -41,11 +47,18 @@ class Source:
  Creates an empty source object. This is typically used when creating an SBML_dfs
  object from a single source.
  pw_index : indices.PWIndex
+ a pathway index object containing the pathway_id and other metadata

  Returns
  -------
  None.

+ Raises
+ ------
+ ValueError:
+ if pw_index is not an indices.PWIndex
+ ValueError:
+ if SOURCE_SPEC.MODEL is not present in source_df
  """

  if init is True:
@@ -101,8 +114,27 @@ def create_source_table(
  """
  Create Source Table

- Create a table with one row per "new_id" and a Source object created from the union
- of "old_id" Source objects
+ Create a table with one row per "new_id" and a Source object created from the union of "old_id" Source objects
+
+ Parameters
+ ----------
+ lookup_table: pd.Series
+ a pd.Series containing the index of the table to create a source table for
+ table_schema: dict
+ a dictionary containing the schema of the table to create a source table for
+ pw_index: indices.PWIndex
+ a pathway index object containing the pathway_id and other metadata
+
+ Returns
+ -------
+ source_table: pd.DataFrame
+ a pd.DataFrame containing the index of the table to create a source table for
+ with one row per "new_id" and a Source object created from the union of "old_id" Source objects
+
+ Raises
+ ------
+ ValueError:
+ if SOURCE_SPEC.SOURCE is not present in table_schema
  """

  if SOURCE_SPEC.SOURCE not in table_schema.keys():
@@ -142,8 +174,27 @@ def merge_sources(source_list: list | pd.Series) -> Source:

  Merge a list of Source objects into a single Source object

+ Parameters
+ ----------
+ source_list: list | pd.Series
+ a list of Source objects or a pd.Series of Source objects
+
+ Returns
+ -------
+ source: Source
+ a Source object created from the union of the Source objects in source_list
+
+ Raises
+ ------
+ TypeError:
+ if source_list is not a list or pd.Series
  """

+ if not isinstance(source_list, (list, pd.Series)):
+ raise TypeError(
+ f"source_list must be a list or pd.Series, but was a {type(source_list).__name__}"
+ )
+
  # filter to non-empty sources
  # empty sources have only been initialized; a merge hasn't occured
  existing_sources = [s.source is not None for s in source_list]
@@ -160,28 +211,35 @@ def merge_sources(source_list: list | pd.Series) -> Source:
  return Source(pd.concat(existing_source_list))


- def unnest_sources(
- source_table: pd.DataFrame, source_var: str, verbose: bool = False
- ) -> pd.DataFrame:
+ def unnest_sources(source_table: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
  """
  Unnest Sources

  Take a pd.DataFrame containing an array of Sources and
  return one-row per source.

- Parameters:
+ Parameters
+ ----------
  source_table: pd.DataFrame
  a table containing an array of Sources
- source_var: str
- variable containing Sources
+ verbose: bool
+ print progress

- Returns:
+ Returns
+ -------
  pd.Dataframe containing the index of source_table but expanded
  to include one row per source

  """

  sources = list()
+
+ table_type = sbml_dfs_utils.infer_entity_type(source_table)
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+ if SCHEMA_DEFS.SOURCE not in source_table_schema.keys():
+ raise ValueError(f"{table_type} does not have a source attribute")
+
+ source_var = source_table_schema[SCHEMA_DEFS.SOURCE]
  source_table_index = source_table.index.to_frame().reset_index(drop=True)

  for i in range(source_table.shape[0]):
@@ -216,53 +274,73 @@ def unnest_sources(
  return pd.concat(sources)


- def greedy_set_coverge_of_sources(
- source_df: pd.DataFrame, table_schema: dict
+ def source_set_coverage(
+ select_sources_df: pd.DataFrame,
+ source_total_counts: Optional[pd.Series] = None,
+ sbml_dfs: Optional[sbml_dfs_core.SBML_dfs] = None,
  ) -> pd.DataFrame:
  """
  Greedy Set Coverage of Sources

- Apply the greedy set coverge algorithm to find the minimal set of
- sources which cover all entries
+ Find the set of pathways covering `select_sources_df`. If `source_total_counts`
+ is provided, pathways will be selected iteratively based on statistical
+ enrichment. If `source_total_counts` is not provided, the largest pathways
+ will be chosen iteratively.

- Parameters:
- source_df: pd.DataFrame
+ Parameters
+ ----------
+ select_sources_df: pd.DataFrame
  pd.Dataframe containing the index of source_table but expanded to
  include one row per source. As produced by source.unnest_sources()
-
- Returns:
+ source_total_counts: pd.Series
+ pd.Series containing the total counts of each source. As produced by
+ source.get_source_total_counts()
+ sbml_dfs: sbml_dfs_core.SBML_dfs
+ if `source_total_counts` is provided then `sbml_dfs` must be provided
+ to calculate the total number of entities in the table.
+
+ Returns
+ -------
  minimial_sources: [str]
  A list of pathway_ids of the minimal source set

  """

+ table_type = sbml_dfs_utils.infer_entity_type(select_sources_df)
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+ if source_total_counts is not None:
+ if sbml_dfs is None:
+ raise ValueError(
+ "If `source_total_counts` is provided, `sbml_dfs` must be provided to calculate the total number of entities in the table."
+ )
+ n_total_entities = sbml_dfs.get_table(table_type).shape[0]
+
  # rollup pathways with identical membership
- deduplicated_sources = _deduplicate_source_df(source_df, table_schema)
+ deduplicated_sources = _deduplicate_source_df(select_sources_df)

  unaccounted_for_members = deduplicated_sources
  retained_pathway_ids = []
-
  while unaccounted_for_members.shape[0] != 0:
  # find the pathway with the most members
- pathway_members = unaccounted_for_members.groupby(SOURCE_SPEC.PATHWAY_ID).size()
- top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+
+ if source_total_counts is None:
+ top_pathway = _select_top_pathway_by_size(unaccounted_for_members)
+ else:
+ top_pathway = _select_top_pathway_by_enrichment(
+ unaccounted_for_members, source_total_counts, n_total_entities, pk
+ )
+
+ if top_pathway is None:
+ break
+
  retained_pathway_ids.append(top_pathway)

  # remove all members associated with the top pathway
- members_captured = (
- unaccounted_for_members[
- unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
- ]
- .index.get_level_values(table_schema["pk"])
- .tolist()
+ unaccounted_for_members = _update_unaccounted_for_members(
+ top_pathway, unaccounted_for_members
  )

- unaccounted_for_members = unaccounted_for_members[
- ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
- members_captured
- )
- ]
-
  minimial_sources = deduplicated_sources[
  deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
  ].sort_index()
@@ -270,9 +348,39 @@ def greedy_set_coverge_of_sources(
  return minimial_sources


- def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
+ def get_source_total_counts(
+ sbml_dfs: sbml_dfs_core.SBML_dfs, entity_type: str
+ ) -> pd.Series:
+ """
+ Get the total counts of each source.
+
+ Parameters
+ ----------
+ sbml_dfs: sbml_dfs_core.SBML_dfs
+ sbml_dfs object containing the table to get the total counts of
+ entity_type: str
+ the type of entity to get the total counts of
+
+ Returns
+ -------
+ source_total_counts: pd.Series
+ pd.Series containing the total counts of each source.
+ """
+
+ all_sources_table = unnest_sources(sbml_dfs.get_table(entity_type))
+ source_total_counts = all_sources_table.value_counts(SOURCE_SPEC.PATHWAY_ID).rename(
+ "total_counts"
+ )
+
+ return source_total_counts
+
+
+ def _deduplicate_source_df(source_df: pd.DataFrame) -> pd.DataFrame:
  """Combine entries in a source table when multiple models have the same members."""

+ table_type = sbml_dfs_utils.infer_entity_type(source_df)
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
+
  # drop entries which are missing required attributes and throw an error if none are left
  REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
  indexed_sources = (
@@ -296,7 +404,11 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da
  {
  SOURCE_SPEC.PATHWAY_ID: p,
  "membership_string": "_".join(
- set(indexed_sources.loc[[p]][table_schema["pk"]].tolist())
+ set(
+ indexed_sources.loc[[p]][
+ source_table_schema[SCHEMA_DEFS.PK]
+ ].tolist()
+ )
  ),
  }
  for p in pathways
@@ -320,16 +432,16 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da

  merged_sources = pd.concat(
  [
- _collapse_by_membership_string(s, membership_categories, table_schema) # type: ignore
+ _collapse_by_membership_string(s, membership_categories, source_table_schema) # type: ignore
  for s in category_index.tolist()
  ]
  )
  merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
- table_schema["pk"]
+ source_table_schema[SCHEMA_DEFS.PK]
  ).cumcount()

  return merged_sources.set_index(
- [table_schema["pk"], SOURCE_SPEC.INDEX_NAME]
+ [source_table_schema[SCHEMA_DEFS.PK], SOURCE_SPEC.INDEX_NAME]
  ).sort_index()

@@ -345,7 +457,10 @@ def _collapse_by_membership_string(
  return pd.DataFrame(
  [
  pd.concat(
- [pd.Series({table_schema["pk"]: ms}), collapsed_source_membership]
+ [
+ pd.Series({table_schema[SCHEMA_DEFS.PK]: ms}),
+ collapsed_source_membership,
+ ]
  )
  for ms in membership_string.split("_")
  ]
@@ -398,3 +513,91 @@ def _safe_source_merge(member_Sources: Source | list) -> Source:
  return merge_sources(member_Sources.tolist())
  else:
  raise TypeError("Expecting source.Source or pd.Series")
+
+
+ def _select_top_pathway_by_size(unaccounted_for_members: pd.DataFrame) -> str:
+
+ pathway_members = unaccounted_for_members.value_counts(SOURCE_SPEC.PATHWAY_ID)
+ top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
+
+ return top_pathway
+
+
+ def _select_top_pathway_by_enrichment(
+ unaccounted_for_members: pd.DataFrame,
+ source_total_counts: pd.Series,
+ n_total_entities: int,
+ table_pk: str,
+ min_pw_size: int = 5,
+ ) -> str:
+
+ n_observed_entities = len(
+ unaccounted_for_members.index.get_level_values(table_pk).unique()
+ )
+ pathway_members = unaccounted_for_members.value_counts(
+ SOURCE_SPEC.PATHWAY_ID
+ ).rename("observed_members")
+
+ pathway_members = pathway_members.loc[pathway_members >= min_pw_size]
+ if pathway_members.shape[0] == 0:
+ return None
+
+ wide_contingency_table = (
+ pathway_members.to_frame()
+ .join(source_total_counts)
+ .assign(
+ missing_members=lambda x: x["total_counts"] - x["observed_members"],
+ observed_nonmembers=lambda x: n_observed_entities - x["observed_members"],
+ nonobserved_nonmembers=lambda x: n_total_entities
+ - x["observed_nonmembers"]
+ - x["missing_members"]
+ - x["observed_members"],
+ )
+ .drop(columns=["total_counts"])
+ )
+
+ # calculate enrichments using a fast vectorized normal approximation
+ odds_ratios, _ = hypothesis_testing.fisher_exact_vectorized(
+ wide_contingency_table["observed_members"],
+ wide_contingency_table["missing_members"],
+ wide_contingency_table["observed_nonmembers"],
+ wide_contingency_table["nonobserved_nonmembers"],
+ )
+
+ return pathway_members.index[np.argmax(odds_ratios)]
+
+
+ def _update_unaccounted_for_members(
+ top_pathway, unaccounted_for_members
+ ) -> pd.DataFrame:
+ """
+ Update the unaccounted for members dataframe by removing the members
+ associated with the top pathway.
+
+ Parameters
+ ----------
+ top_pathway: str
+ the pathway to remove from the unaccounted for members
+ unaccounted_for_members: pd.DataFrame
+ the dataframe of unaccounted for members
+
+ Returns
+ -------
+ unaccounted_for_members: pd.DataFrame
+ the dataframe of unaccounted for members with the top pathway removed
+ """
+
+ table_type = sbml_dfs_utils.infer_entity_type(unaccounted_for_members)
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
+
+ members_captured = (
+ unaccounted_for_members[
+ unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
+ ]
+ .index.get_level_values(pk)
+ .tolist()
+ )
+
+ return unaccounted_for_members[
+ ~unaccounted_for_members.index.get_level_values(pk).isin(members_captured)
+ ]
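Taken together, the source.py changes replace the size-only greedy set cover with an optional enrichment-aware variant. A minimal usage sketch follows; the pre-existing `sbml_dfs` object and the "species" entity type are illustrative assumptions, not part of the diff:

from napistu import source

# sbml_dfs is assumed to be an existing sbml_dfs_core.SBML_dfs object.
# Unnest the Source objects attached to a table; the source column is now
# inferred from the table's schema instead of being passed as source_var.
species_sources = source.unnest_sources(sbml_dfs.get_table("species"))

# background pathway sizes across the whole model
source_total_counts = source.get_source_total_counts(sbml_dfs, "species")

# size-based greedy coverage (the previous greedy_set_coverge_of_sources behavior)
minimal_by_size = source.source_set_coverage(species_sources)

# enrichment-based coverage: each iteration keeps the pathway with the largest
# odds ratio from hypothesis_testing.fisher_exact_vectorized
minimal_by_enrichment = source.source_set_coverage(
    species_sources,
    source_total_counts=source_total_counts,
    sbml_dfs=sbml_dfs,
)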
napistu/statistics/hypothesis_testing.py ADDED
@@ -0,0 +1,66 @@
+ from typing import Union
+
+ import numpy as np
+ from scipy.stats import norm
+
+
+ def fisher_exact_vectorized(
+ observed_members: Union[list[int], np.ndarray],
+ missing_members: Union[list[int], np.ndarray],
+ observed_nonmembers: Union[list[int], np.ndarray],
+ nonobserved_nonmembers: Union[list[int], np.ndarray],
+ ) -> tuple[np.ndarray, np.ndarray]:
+ """
+ Fast vectorized one-tailed Fisher exact test using normal approximation.
+
+ Parameters:
+ -----------
+ observed_members, missing_members, observed_nonmembers, nonobserved_nonmembers : array-like
+ The four cells of the 2x2 contingency tables (must be non-negative)
+
+ Returns:
+ --------
+ odds_ratios : numpy array
+ Odds ratios for each test
+ p_values : numpy array
+ One-tailed p-values (tests for enrichment)
+ """
+ # Convert to numpy arrays
+ a = np.array(observed_members, dtype=float)
+ b = np.array(missing_members, dtype=float)
+ c = np.array(observed_nonmembers, dtype=float)
+ d = np.array(nonobserved_nonmembers, dtype=float)
+
+ # Check for negative values and raise error
+ if np.any((a < 0) | (b < 0) | (c < 0) | (d < 0)):
+ raise ValueError("All contingency table values must be non-negative")
+
+ # Calculate odds ratios
+ odds_ratios = np.divide(
+ a * d, b * c, out=np.full_like(a, np.inf, dtype=float), where=(b * c) != 0
+ )
+
+ # Normal approximation to hypergeometric distribution
+ n = a + b + c + d
+
+ # Avoid division by zero in expected value calculation
+ expected_a = np.divide(
+ (a + b) * (a + c), n, out=np.zeros_like(n, dtype=float), where=n != 0
+ )
+
+ # Variance calculation with protection against division by zero
+ var_a = np.divide(
+ (a + b) * (c + d) * (a + c) * (b + d),
+ n * n * (n - 1),
+ out=np.ones_like(n, dtype=float), # Default to 1 to avoid sqrt(0)
+ where=(n > 1),
+ )
+ var_a = np.maximum(var_a, 1e-10) # Ensure positive variance
+
+ # Continuity correction and z-score
+ z = (a - expected_a - 0.5) / np.sqrt(var_a)
+
+ # One-tailed p-value (upper tail for enrichment)
+ p_values = norm.sf(z) # 1 - norm.cdf(z)
+
+ return odds_ratios, p_values
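The new statistics helper can be exercised directly on small hand-built 2x2 tables; a brief sketch (the cell values are made up for illustration):

from napistu.statistics import hypothesis_testing

# each argument is one cell of the 2x2 contingency tables, vectorized across two tests
odds_ratios, p_values = hypothesis_testing.fisher_exact_vectorized(
    observed_members=[10, 3],
    missing_members=[20, 50],
    observed_nonmembers=[40, 47],
    nonobserved_nonmembers=[930, 900],
)
print(odds_ratios)  # (a * d) / (b * c) per table
print(p_values)     # one-tailed enrichment p-values from the normal approximation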
napistu/utils.py CHANGED
@@ -14,7 +14,7 @@ import zipfile
  from contextlib import closing
  from itertools import starmap
  from textwrap import fill
- from typing import Any, List, Optional, Union
+ from typing import Any, Dict, Optional, List, Union
  from urllib.parse import urlparse
  from pathlib import Path
  from requests.adapters import HTTPAdapter
@@ -1131,6 +1131,28 @@ def safe_fill(x: str, fill_width: int = 15) -> str:
  return fill(x, fill_width)


+ def match_regex_dict(s: str, regex_dict: Dict[str, any]) -> Optional[any]:
+ """
+ Apply each regex in regex_dict to the string s. If a regex matches, return its value.
+ If no regex matches, return None.
+
+ Parameters
+ ----------
+ s : str
+ The string to test.
+ regex_dict : dict
+ Dictionary where keys are regex patterns (str), and values are the values to return.
+
+ Returns
+ -------
+ The value associated with the first matching regex, or None if no match.
+ """
+ for pattern, value in regex_dict.items():
+ if re.search(pattern, s):
+ return value
+ return None
+
+
  def _add_nameness_score_wrapper(df, name_var, table_schema):
  """Call _add_nameness_score with default value."""

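A short usage sketch of the new match_regex_dict helper (the patterns and return values here are hypothetical):

from napistu import utils

ontology_by_prefix = {
    r"^ENSG\d+": "ensembl_gene",  # hypothetical pattern -> value mapping
    r"^P\d{5}$": "uniprot",
}

utils.match_regex_dict("ENSG00000139618", ontology_by_prefix)  # -> "ensembl_gene"
utils.match_regex_dict("not-an-id", ontology_by_prefix)        # -> None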
{napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: napistu
- Version: 0.4.2
+ Version: 0.4.4
  Summary: Connecting high-dimensional data to curated pathways
  Home-page: https://github.com/napistu/napistu-py
  Author: Sean Hackett
{napistu-0.4.2.dist-info → napistu-0.4.4.dist-info}/RECORD RENAMED
@@ -1,13 +1,13 @@
  napistu/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
  napistu/__main__.py,sha256=xwlbh_0Ig3a-yG6BIJRiDPSN9R2HnX2pEBvlodlO6h4,29015
- napistu/consensus.py,sha256=xWXiqIM6ot-SSPJZXTrVpohbINSCkZXBtRi-5REfk_g,69897
+ napistu/consensus.py,sha256=SDw58vkDivzy5AiOQUnf5vUbFxmSrMGMMmptDMZhk0E,69807
  napistu/constants.py,sha256=8sp1l0cxu2rsnCrWBEEwhcBKvDtc4u0D0f_72zILLW0,13427
  napistu/identifiers.py,sha256=e2-nTVzr5AINa0y1ER9218bKXyF2kAeJ9At22S4Z00o,33914
  napistu/indices.py,sha256=Zjg3gE0JQ3T879lCPazYg-WXVE6hvcAr713ZKpJ32rk,9830
  napistu/sbml_dfs_core.py,sha256=s0OyoHs-AjOcbZu1d3KNkW_PI7Rxbhu5ZLpfQeO4iY8,72639
- napistu/sbml_dfs_utils.py,sha256=w5dFcJFDKnKDK9jxPOCuCW8IccxdXmyNmP9vCUhVdf8,46184
- napistu/source.py,sha256=UGpN70bqbC9gnKmM0ivSdQYim9hfzgABeXoQKzRr9oU,13646
- napistu/utils.py,sha256=PEAsLn7VGN8JlNJQcAMYpjF1gr2mWmb5IqBsypP9hi0,35768
+ napistu/sbml_dfs_utils.py,sha256=SOy1Ii2hDFOfQa7pFAJS9EfAmfBVD_sHvDJBVmCN_p8,46456
+ napistu/source.py,sha256=iDDKpN-4k_W_tyxEjqe_z-yPJv7uoFRRBhkiBtOH5C8,20416
+ napistu/utils.py,sha256=p2sJxTklmV30XS6hanJRjcdfgeaZpkULuMyQX3BPP0c,36404
  napistu/context/__init__.py,sha256=LQBEqipcHKK0E5UlDEg1ct-ymCs93IlUrUaH8BCevf0,242
  napistu/context/discretize.py,sha256=Qq7zg46F_I-PvQIT2_pEDQV7YEtUQCxKoRvT5Gu9QsE,15052
  napistu/context/filtering.py,sha256=l1oq-43ysSGqU9VmhTOO_pYT4DSMf20yxvktPC1MI0I,13696
@@ -17,13 +17,14 @@ napistu/gcs/downloads.py,sha256=SvGv9WYr_Vt3guzyz1QiAuBndeKPTBtWSFLj1-QbLf4,6348
  napistu/gcs/utils.py,sha256=eLSsvewWJdCguyj2k0ozUGP5BTemaE1PZg41Z3aY5kM,571
  napistu/ingestion/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
  napistu/ingestion/bigg.py,sha256=f65--8ARe248eYCUJpFMF284Wz53sLyFyBuwelxHmJA,4340
- napistu/ingestion/constants.py,sha256=9UP47VImZ11q0kz17N3EJg2155USqLewwNWyKpA-cbA,8089
+ napistu/ingestion/constants.py,sha256=jo3v8Z7Y_tNNhTmEcokVOh1HBJFAXc-Z38S4mG58qfo,10059
  napistu/ingestion/gtex.py,sha256=X0hSC1yrpf4xSJWFhpeNcnHwJzKDII2MvjfUqYA0JN8,3720
  napistu/ingestion/hpa.py,sha256=R27ExrryKQ4Crxv9ATXmBJCa-yd01TMOrDjkeBhIQac,5054
  napistu/ingestion/identifiers_etl.py,sha256=6ppDUA6lEZurdmVbiFLOUzphYbr-hndMhtqsQnq_yAc,5009
  napistu/ingestion/napistu_edgelist.py,sha256=4RLXsoIk_-Atu-Nqme_t1JpEpBET26VIY2Y_Hcd3sMw,3580
  napistu/ingestion/obo.py,sha256=AQkIPWbjA464Lma0tx91JucWkIwLjC7Jgv5VHGRTDkE,9601
  napistu/ingestion/psi_mi.py,sha256=5eJjm7XWogL9oTyGqR52kntHClLwLsTePKqCvUGyi-w,10111
+ napistu/ingestion/reactom_fi.py,sha256=hKdOY2wNtcNk6WlnHnNalryiXv6mtcWUiBW9isXPB0Y,6991
  napistu/ingestion/reactome.py,sha256=Hn9X-vDp4o_HK-OtaQvel3vJeZ8_TC1-4N2rruK9Oks,7099
  napistu/ingestion/sbml.py,sha256=l8Z98yWuOIRGns8G4UNnoQz7v_xmukZb_IZ_5ye34Ko,25296
  napistu/ingestion/string.py,sha256=go1WGTkoLJejX7GQWf9bFeInFGAw4jNSpS2B_Zr5f_s,11364
@@ -61,14 +62,14 @@ napistu/network/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,2
  napistu/network/constants.py,sha256=nG_lUZYLgop8oxOGjDYqvxXJzVdOwKZ3aWnxlhtSaIo,6915
  napistu/network/data_handling.py,sha256=KncrAKjXI3169BgVE-SnY8FkpVF60JnUwfMHtbqvsTc,14725
  napistu/network/ig_utils.py,sha256=MuyEyOVtSHndil6QuuRCimBZrJ2jTaF5qQESgYlu02M,17042
- napistu/network/neighborhoods.py,sha256=g5QeGaizSfW4nNe9YZY86g8q79EQmuvSwipaNPnOVqA,56121
+ napistu/network/neighborhoods.py,sha256=kXoD5d3plcTEw-6XCbb5QjaCt0jsKwn17VdAvnGoFhY,57041
  napistu/network/net_create.py,sha256=66kV_xoWnu4BVLaJZ1TAC7wBSsjPDqjoAXH-X9ShV3s,59091
  napistu/network/net_create_utils.py,sha256=zajwaz2xAij_9fEnD77SgBw_EnNAnJ8jBCmmK2rk_bA,24672
  napistu/network/net_propagation.py,sha256=Il5nDOWh3nLz8gRhDFHGp2LxcvJ9C1twiSZjDeiZMUo,23490
  napistu/network/ng_core.py,sha256=dGnTUKR4WtnvaYMyIHqqF55FY4mJSa7wjA2LZ4cVB6U,11720
- napistu/network/ng_utils.py,sha256=c1tHXz_JcH01D5KovNQmRLTEVxpCkCe36otULq-liz8,15579
+ napistu/network/ng_utils.py,sha256=ahSm-8M2pV662V7MMVcGaoguBM55_y-F7LDmZSVp9ag,15951
  napistu/network/paths.py,sha256=r6LVKVvX7i3ctBA5r-xvHfpH5Zsd0VDHUCtin2iag20,17453
- napistu/network/precompute.py,sha256=ibL0ByY7Wp5kEfIG3LUDpQKdvAeQX0DNkT_46g2YrGc,8367
+ napistu/network/precompute.py,sha256=ARU2tktWnxFISaHAY8chpkg8pusZPv7TT5jSIB9eFF0,10081
  napistu/ontologies/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
  napistu/ontologies/constants.py,sha256=GyOFvezSxDK1VigATcruTKtNhjcYaid1ggulEf_HEtQ,4345
  napistu/ontologies/dogma.py,sha256=VVj6NKBgNym4SdOSu8g22OohALj7cbObhIJmdY2Sfy0,8860
@@ -84,8 +85,9 @@ napistu/scverse/__init__.py,sha256=Lgxr3iMQAkTzXE9BNz93CndNP5djzerLvmHM-D0PU3I,3
  napistu/scverse/constants.py,sha256=0iAkhyJUIeFGHdLLU3fCaEU1O3Oix4qAsxr3CxGTjVs,653
  napistu/scverse/loading.py,sha256=jqiE71XB-wdV50GyZrauFNY0Lai4bX9Fm2Gv80VR8t8,27016
  napistu/statistics/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
+ napistu/statistics/hypothesis_testing.py,sha256=k0mBFAMF0XHVcKwS26aPnEbq_FIUVwXU1gZ6cKfFbCk,2190
  napistu/statistics/quantiles.py,sha256=1-LnmVzC2CQWxCKUh0yi6YfKrbsZM1-kkD7nu2-aS5s,3042
- napistu-0.4.2.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
+ napistu-0.4.4.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/conftest.py,sha256=t-GHb0MvSsC-MyhkFpOy2K3t5fi7eaig_Rc2xEQC-t8,9678
  tests/test_consensus.py,sha256=Hzfrgp4SpkRDnEMVMD3f0UInSycndB8kKzC4wDDvRas,15076
@@ -114,7 +116,7 @@ tests/test_network_net_propagation.py,sha256=kZeDHD93iMrLVvxO4OyfRH5_vgsYeQyC40O
  tests/test_network_ng_core.py,sha256=w-iNBTtenennJhaLFauk952pEsk7W0-Fa8lPvIRqHyY,628
  tests/test_network_ng_utils.py,sha256=QVVuRnvCRfTSIlGdwQTIF9lr0wOwoc5gGeXAUY_AdgE,713
  tests/test_network_paths.py,sha256=TWZnxY5bF3m6gahcxcYJGrBIawh2-_vUcec1LyPmXV8,1686
- tests/test_network_precompute.py,sha256=zwJrKNC3s8rIrsyAQfQMYxbl8HZXUr7u09nMJ_K8jiU,9005
+ tests/test_network_precompute.py,sha256=IPr1KhtxBD0fXx_2TvZqnevrD-Iig35otb8yloRFpRc,10014
  tests/test_ontologies_genodexito.py,sha256=6fINyUiubHZqu7qxye09DQfJXw28ZMAJc3clPb-cCoY,2298
  tests/test_ontologies_id_tables.py,sha256=CpwpbmQvTc1BaVd6jbDKHAVE2etwN0vx93nC8jpnMlE,7265
  tests/test_ontologies_mygene.py,sha256=VkdRcKIWmcG6V-2dpfvsBiOJN5dO-j0RqZNxtJRcyBU,1583
@@ -124,18 +126,18 @@ tests/test_rpy2_callr.py,sha256=V4a-QH5krgYOQRgqzksMzIkGAFjBqKOAqgprxrH6bE0,2904
  tests/test_rpy2_init.py,sha256=T3gnxC1O7XNvYM2P4018ikpPPAy-kwQLm7Erj0RfA-4,5895
  tests/test_sbml.py,sha256=f25zj1NogYrmLluvBDboLameTuCiQ309433Qn3iPvhg,1483
  tests/test_sbml_dfs_core.py,sha256=nnLPpZTVtCznOBohk7CX67x6sMqktJWt-sZMWQKoaDs,26521
- tests/test_sbml_dfs_utils.py,sha256=gWIhzUEtQlOR9c1TiCyhlSAELmWnBSncn6vCEqH5hl0,11029
+ tests/test_sbml_dfs_utils.py,sha256=ZD9x2B81fsfYEjAV9wphHOR7ywjNcfvfw1LGNv4PxUA,11471
  tests/test_sbo.py,sha256=x_PENFaXYsrZIzOZu9cj_Wrej7i7SNGxgBYYvcigLs0,308
  tests/test_scverse_loading.py,sha256=bnU1lQSYYWhOAs0IIBoi4ZohqPokDQJ0n_rtkAfEyMU,29948
- tests/test_set_coverage.py,sha256=J-6m6LuOjcQa9pxRuWglSfJk4Ltm7kt_eOrn_Q-7P6Q,1604
- tests/test_source.py,sha256=hT0IlpexR5zP0OhWl5BBaho9d1aCYQlFZLwRIRRnw_Y,1969
+ tests/test_source.py,sha256=iV-Yyu8flhIGWF17SCL8msG2bjqwb9w2IZ694b0iZ-o,2985
+ tests/test_statistics_hypothesis_testing.py,sha256=qD-oS9zo5JlH-jdtiOrWAKI4nKFuZvvh6361_pFSpIs,2259
  tests/test_statistics_quantiles.py,sha256=yNDeqwgbP-1Rx3C_dLX_wnwT_Lr-iJWClmeKmElqmTE,4984
  tests/test_uncompartmentalize.py,sha256=nAk5kfAVLU9a2VWe2x2HYVcKqj-EnwmwddERIPRax8c,1289
  tests/test_utils.py,sha256=qPSpV-Q9b6vmdycgaDmQqtcvzKnAVnN9j5xJ9x-T6bg,23959
  tests/utils.py,sha256=SoWQ_5roJteFGcMaOeEiQ5ucwq3Z2Fa3AAs9iXHTsJY,749
  tests/test_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- napistu-0.4.2.dist-info/METADATA,sha256=6P_9Mmno6pVu4Me-3QdcMtiGOhCcajTqm5LP_Hns4lI,4078
- napistu-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- napistu-0.4.2.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
- napistu-0.4.2.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
- napistu-0.4.2.dist-info/RECORD,,
+ napistu-0.4.4.dist-info/METADATA,sha256=E15A5Ve2RZTn4HtXGD2rDO1Q7AEaTfSdo3fgLuwravE,4078
+ napistu-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ napistu-0.4.4.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
+ napistu-0.4.4.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
+ napistu-0.4.4.dist-info/RECORD,,
tests/test_network_precompute.py CHANGED
@@ -276,3 +276,33 @@ def test_precomputed_distances_serialization():
  # Clean up the temporary file
  if os.path.exists(temp_path):
  os.remove(temp_path)
+
+
+ def test_filter_precomputed_distances_top_n_subset():
+ # Use a small top_n for a quick test
+ top_n = 5
+ filtered = precompute.filter_precomputed_distances_top_n(
+ precomputed_distances, top_n=top_n
+ )
+ # Check that the filtered DataFrame is a subset of the original
+ merged = filtered.merge(
+ precomputed_distances,
+ on=[
+ precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
+ precompute.NAPISTU_EDGELIST.SC_ID_DEST,
+ ],
+ how="left",
+ indicator=True,
+ )
+ assert (
+ merged["_merge"] == "both"
+ ).all(), "Filtered rows must be present in the original DataFrame"
+ # Check that columns are preserved
+ assert set(
+ [
+ precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
+ precompute.NAPISTU_EDGELIST.SC_ID_DEST,
+ ]
+ ).issubset(filtered.columns)
+ # Optionally, check that the number of rows is less than or equal to the input
+ assert filtered.shape[0] <= precomputed_distances.shape[0]
tests/test_sbml_dfs_utils.py CHANGED
@@ -334,3 +334,16 @@ def test_infer_entity_type_errors():
  ) # Two primary keys
  with pytest.raises(ValueError):
  sbml_dfs_utils.infer_entity_type(df)
+
+
+ def test_infer_entity_type_multindex_reactions():
+ # DataFrame with MultiIndex (r_id, foo), should infer as reactions
+ import pandas as pd
+ from napistu.constants import SBML_DFS
+
+ df = pd.DataFrame({"some_col": [1, 2]})
+ df.index = pd.MultiIndex.from_tuples(
+ [("rxn1", "a"), ("rxn2", "b")], names=[SBML_DFS.R_ID, "foo"]
+ )
+ result = sbml_dfs_utils.infer_entity_type(df)
+ assert result == SBML_DFS.REACTIONS