napistu 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
napistu/consensus.py CHANGED
@@ -426,7 +426,7 @@ def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> s
426
426
 
427
427
  # get the sources of species in the consensus model
428
428
  consensus_sbmldf_tbl_var_sc = (
429
- source.unnest_sources(sbml_dfs.species, SBML_DFS.S_SOURCE, verbose=False)
429
+ source.unnest_sources(sbml_dfs.species, verbose=False)
430
430
  .reset_index()
431
431
  .sort_values([SOURCE_SPEC.NAME])
432
432
  )
@@ -504,12 +504,11 @@ def post_consensus_source_check(
504
504
  ) -> pd.DataFrame:
505
505
  """Provide sources of tables in a consensus model; the output df will be used to determine whether models are merged."""
506
506
 
507
- table_source = sbml_dfs.schema[table_name][SOURCE_SPEC.SOURCE]
508
- table_pk = sbml_dfs.schema[table_name]["pk"]
507
+ table_pk = sbml_dfs.schema[table_name][SCHEMA_DEFS.PK]
509
508
 
510
509
  sbml_dfs_tbl = getattr(sbml_dfs, table_name)
511
510
  sbml_dfs_tbl_pathway_source = (
512
- source.unnest_sources(sbml_dfs_tbl, table_source, verbose=False)
511
+ source.unnest_sources(sbml_dfs_tbl, verbose=False)
513
512
  .reset_index()
514
513
  .sort_values(["name"])
515
514
  )
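As a usage sketch (editorial illustration, not part of the diff): unnest_sources now looks up the source column from the table's own schema, so callers pass only the table. sbml_dfs below is assumed to be an existing SBML_dfs object.

    from napistu import source

    # previously: source.unnest_sources(sbml_dfs.species, SBML_DFS.S_SOURCE, verbose=False)
    species_sources = source.unnest_sources(sbml_dfs.species, verbose=False).reset_index()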
napistu/ingestion/constants.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
 
4
4
  from types import SimpleNamespace
5
5
 
6
+ from napistu.constants import SBOTERM_NAMES
6
7
 
7
8
  SPECIES_FULL_NAME_HUMAN = "Homo sapiens"
8
9
  SPECIES_FULL_NAME_MOUSE = "Mus musculus"
@@ -90,6 +91,56 @@ REACTOME_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathways.
90
91
  REACTOME_PATHWAY_INDEX_COLUMNS = ["file", "source", "species", "pathway_id", "name"]
91
92
  REACTOME_PATHWAY_LIST_COLUMNS = ["pathway_id", "name", "species"]
92
93
 
94
+ # REACTOME FI
95
+ REACTOME_FI_URL = "http://cpws.reactome.org/caBigR3WebApp2025/FIsInGene_04142025_with_annotations.txt.zip"
96
+
97
+ REACTOME_FI = SimpleNamespace(
98
+ GENE1="Gene1",
99
+ GENE2="Gene2",
100
+ ANNOTATION="Annotation",
101
+ DIRECTION="Direction",
102
+ SCORE="Score",
103
+ )
104
+
105
+ REACTOME_FI_DIRECTIONS = SimpleNamespace(
106
+ UNDIRECTED="-",
107
+ STIMULATED_BY="<-",
108
+ STIMULATES="->",
109
+ STIMULATES_AND_STIMULATED_BY="<->",
110
+ INHIBITED_BY="|-",
111
+ INHIBITS="-|",
112
+ INHIBITS_AND_INHIBITED_BY="|-|",
113
+ STIMULATES_AND_INHIBITED_BY="|->",
114
+ INHIBITS_AND_STIMULATED_BY="<-|",
115
+ )
116
+
117
+ VALID_REACTOME_FI_DIRECTIONS = REACTOME_FI_DIRECTIONS.__dict__.values()
118
+
119
+ REACTOME_FI_RULES_REVERSE = SimpleNamespace(
120
+ NAME_RULES={"catalyzed by": SBOTERM_NAMES.CATALYST},
121
+ DIRECTION_RULES={
122
+ REACTOME_FI_DIRECTIONS.STIMULATED_BY: SBOTERM_NAMES.STIMULATOR,
123
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_STIMULATED_BY: SBOTERM_NAMES.STIMULATOR,
124
+ REACTOME_FI_DIRECTIONS.INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
125
+ REACTOME_FI_DIRECTIONS.INHIBITS_AND_INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
126
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
127
+ REACTOME_FI_DIRECTIONS.UNDIRECTED: SBOTERM_NAMES.INTERACTOR,
128
+ },
129
+ )
130
+
131
+ REACTOME_FI_RULES_FORWARD = SimpleNamespace(
132
+ NAME_RULES={"catalyze(;$)": SBOTERM_NAMES.CATALYST},
133
+ DIRECTION_RULES={
134
+ REACTOME_FI_DIRECTIONS.STIMULATES: SBOTERM_NAMES.STIMULATOR,
135
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_STIMULATED_BY: SBOTERM_NAMES.STIMULATOR,
136
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_INHIBITED_BY: SBOTERM_NAMES.STIMULATOR,
137
+ REACTOME_FI_DIRECTIONS.INHIBITS: SBOTERM_NAMES.INHIBITOR,
138
+ REACTOME_FI_DIRECTIONS.INHIBITS_AND_INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
139
+ REACTOME_FI_DIRECTIONS.INHIBITS_AND_STIMULATED_BY: SBOTERM_NAMES.INHIBITOR,
140
+ REACTOME_FI_DIRECTIONS.UNDIRECTED: SBOTERM_NAMES.INTERACTOR,
141
+ },
142
+ )
143
+
93
144
  # SBML
94
145
  SBML_DEFS = SimpleNamespace(
95
146
  ERROR_NUMBER="error_number",
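A minimal sketch (not part of the diff) of how the new rule namespaces are read: FI direction symbols resolve to SBO term names via the DIRECTION_RULES mappings, as _parse_reactome_fi_annotations does in the new module below.

    from napistu.ingestion.constants import REACTOME_FI_DIRECTIONS, REACTOME_FI_RULES_FORWARD

    direction = REACTOME_FI_DIRECTIONS.STIMULATES  # "->"
    # resolves to SBOTERM_NAMES.STIMULATOR under the forward rules
    sbo_term_name = REACTOME_FI_RULES_FORWARD.DIRECTION_RULES[direction]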
napistu/ingestion/reactom_fi.py ADDED
@@ -0,0 +1,208 @@
1
+ import logging
2
+ import pandas as pd
3
+
4
+ from napistu.identifiers import Identifiers
5
+ from napistu import utils
6
+ from napistu.ingestion.constants import (
7
+ REACTOME_FI,
8
+ REACTOME_FI_RULES_FORWARD,
9
+ REACTOME_FI_RULES_REVERSE,
10
+ REACTOME_FI_URL,
11
+ VALID_REACTOME_FI_DIRECTIONS,
12
+ )
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ def download_reactome_fi(target_uri: str, url: str = REACTOME_FI_URL) -> None:
19
+ """
20
+ Download the Reactome Functional Interactions (FI) dataset as a TSV file.
21
+
22
+ Parameters
23
+ ----------
24
+ target_uri : str
25
+ The URI where the Reactome FI data should be saved. Should end with .tsv
26
+ url : str, optional
27
+ URL to download the zipped Reactome functional interactions TSV from.
28
+ Defaults to REACTOME_FI_URL.
29
+
30
+ Returns
31
+ -------
32
+ None
33
+
34
+ Raises
35
+ ------
36
+ ValueError
37
+ If target_uri does not end with .tsv
38
+ """
39
+
40
+ if not target_uri.endswith(".tsv"):
41
+ raise ValueError(f"Target URI must end with .tsv, got {target_uri}")
42
+
43
+ file_ext = url.split(".")[-1]
44
+ target_filename = url.split("/")[-1].split(f".{file_ext}")[0]
45
+ logger.info("Start downloading proteinatlas %s to %s", url, target_uri)
46
+ # target_filename is the name of the file in the zip file which will be renamed to target_uri
47
+ utils.download_wget(url, target_uri, target_filename=target_filename)
48
+
49
+ return None
50
+
51
+
52
+ def format_reactome_fi_edgelist(interactions: pd.DataFrame):
53
+ """
54
+ Format the Reactome FI interactions DataFrame as an edgelist for network analysis.
55
+
56
+ Parameters
57
+ ----------
58
+ interactions : pd.DataFrame
59
+ DataFrame containing Reactome FI interactions.
60
+
61
+ Returns
62
+ -------
63
+ Dictionary of:
64
+
65
+ interaction_edgelist : pd.DataFrame
66
+ Table containing molecular interactions with columns:
67
+ - upstream_name : str, matches "s_name" from species_df
68
+ - downstream_name : str, matches "s_name" from species_df
69
+ - upstream_compartment : str, matches "c_name" from compartments_df
70
+ - downstream_compartment : str, matches "c_name" from compartments_df
71
+ - r_name : str, name for the interaction
72
+ - sbo_term : str, SBO term defining interaction type
73
+ - r_Identifiers : identifiers.Identifiers, supporting identifiers
74
+ - r_isreversible : bool, whether reaction is reversible
75
+ species_df : pd.DataFrame
76
+ Table defining molecular species with columns:
77
+ - s_name : str, name of molecular species
78
+ - s_Identifiers : identifiers.Identifiers, species identifiers
79
+ compartments_df : pd.DataFrame
80
+ Table defining compartments with columns:
81
+ - c_name : str, name of compartment
82
+ - c_Identifiers : identifiers.Identifiers, compartment identifiers
83
+
84
+ Notes
85
+ -----
86
+ This function is not yet implemented and will raise NotImplementedError.
87
+ """
88
+
89
+ raise NotImplementedError("TO DO - This function is incomplete")
90
+
91
+ formatted_annotations = _parse_reactome_fi_annotations(interactions)
92
+
93
+ # this join will expand some rows to 2 since the bidirectional relationships are captured as separate edges in Napistu
94
+ annotated_interactions = interactions.merge(
95
+ formatted_annotations,
96
+ on=[REACTOME_FI.ANNOTATION, REACTOME_FI.DIRECTION],
97
+ how="left",
98
+ )
99
+
100
+ # flip reverse entries so all relationships are forward or undirected
101
+ formatted_interactions = (
102
+ pd.concat(
103
+ [
104
+ annotated_interactions.query("polarity == 'forward'"),
105
+ (
106
+ annotated_interactions.query("polarity == 'reverse'").rename(
107
+ columns={
108
+ REACTOME_FI.GENE1: REACTOME_FI.GENE2,
109
+ REACTOME_FI.GENE2: REACTOME_FI.GENE1,
110
+ }
111
+ )
112
+ ),
113
+ ]
114
+ )[[REACTOME_FI.GENE1, REACTOME_FI.GENE2, "sbo_term_name", "Score"]]
115
+ # looks like they were already unique edges
116
+ .sort_values("Score", ascending=False)
117
+ .groupby([REACTOME_FI.GENE1, REACTOME_FI.GENE2])
118
+ .first()
119
+ )
120
+
121
+ fi_edgelist = (
122
+ formatted_interactions.reset_index()
123
+ .rename(
124
+ columns={
125
+ REACTOME_FI.GENE1: "upstream_name",
126
+ REACTOME_FI.GENE2: "downstream_name",
127
+ }
128
+ )
129
+ .assign(r_Identifiers=Identifiers([]))
130
+ )
131
+
132
+ return fi_edgelist
133
+
134
+
135
+ def _parse_reactome_fi_annotations(interactions: pd.DataFrame) -> pd.DataFrame:
136
+ """
137
+ Parse and annotate Reactome FI interaction types and directions using regex-based rules.
138
+
139
+ Parameters
140
+ ----------
141
+ interactions : pd.DataFrame
142
+ DataFrame containing Reactome FI interactions, with annotation and direction columns.
143
+
144
+ Returns
145
+ -------
146
+ pd.DataFrame
147
+ DataFrame with annotation, direction, SBO term name, and polarity for each unique annotation/direction pair.
148
+
149
+ Raises
150
+ ------
151
+ ValueError
152
+ If an annotation/direction pair cannot be matched to a rule or if invalid directions are found.
153
+ """
154
+
155
+ distinct_annotations = (
156
+ interactions[[REACTOME_FI.ANNOTATION, REACTOME_FI.DIRECTION]]
157
+ .drop_duplicates()
158
+ .reset_index(drop=True)
159
+ )
160
+ invalid_directions = distinct_annotations.loc[
161
+ ~distinct_annotations[REACTOME_FI.DIRECTION].isin(VALID_REACTOME_FI_DIRECTIONS),
162
+ "Direction",
163
+ ]
164
+ if len(invalid_directions) > 0:
165
+ raise ValueError(f"Invalid directions: {invalid_directions}")
166
+
167
+ annotations = list()
168
+ for _, vals in distinct_annotations.iterrows():
169
+ annot, direction = vals
170
+
171
+ forward_match = utils.match_regex_dict(
172
+ annot, REACTOME_FI_RULES_FORWARD.NAME_RULES
173
+ )
174
+ if not forward_match:
175
+ if direction in REACTOME_FI_RULES_FORWARD.DIRECTION_RULES:
176
+ forward_match = REACTOME_FI_RULES_FORWARD.DIRECTION_RULES[direction]
177
+
178
+ reverse_match = utils.match_regex_dict(
179
+ annot, REACTOME_FI_RULES_REVERSE.NAME_RULES
180
+ )
181
+ if not reverse_match:
182
+ if direction in REACTOME_FI_RULES_REVERSE.DIRECTION_RULES:
183
+ reverse_match = REACTOME_FI_RULES_REVERSE.DIRECTION_RULES[direction]
184
+
185
+ if not (forward_match or reverse_match):
186
+ raise ValueError(f"No match found for {annot} with direction {direction}")
187
+
188
+ if forward_match:
189
+ annotations.append(
190
+ {
191
+ REACTOME_FI.ANNOTATION: annot,
192
+ REACTOME_FI.DIRECTION: direction,
193
+ "sbo_term_name": forward_match,
194
+ "polarity": "forward",
195
+ }
196
+ )
197
+
198
+ if reverse_match:
199
+ annotations.append(
200
+ {
201
+ REACTOME_FI.ANNOTATION: annot,
202
+ REACTOME_FI.DIRECTION: direction,
203
+ "sbo_term_name": reverse_match,
204
+ "polarity": "reverse",
205
+ }
206
+ )
207
+
208
+ return pd.DataFrame(annotations)
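A hypothetical usage sketch of the new module (the target path is an assumption; format_reactome_fi_edgelist still raises NotImplementedError, as its docstring notes):

    import pandas as pd
    from napistu.ingestion import reactom_fi

    # downloads the zipped FI release and renames the inner TSV to the target path
    reactom_fi.download_reactome_fi("/tmp/reactome_fi.tsv")
    interactions = pd.read_csv("/tmp/reactome_fi.tsv", sep="\t")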
napistu/network/ng_utils.py CHANGED
@@ -66,7 +66,7 @@ def compartmentalize_species_pairs(
66
66
  Compartmentalize Shortest Paths
67
67
 
68
68
  For a set of origin and destination species pairs, consider each species in every
69
- compartment it operates in, separately.
69
+ compartment it operates in, separately.
70
70
 
71
71
  Parameters
72
72
  ----------
@@ -112,22 +112,42 @@ def compartmentalize_species_pairs(
112
112
 
113
113
 
114
114
  def get_minimal_sources_edges(
115
- vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
115
+ vertices: pd.DataFrame,
116
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
117
+ source_total_counts: Optional[pd.Series] = None,
116
118
  ) -> pd.DataFrame | None:
117
- """Assign edges to a set of sources."""
119
+ """
120
+ Assign edges to a set of sources.
121
+
122
+ Parameters
123
+ ----------
124
+ vertices: pd.DataFrame
125
+ A table of vertices.
126
+ sbml_dfs: sbml_dfs_core.SBML_dfs
127
+ A pathway model
128
+ source_total_counts: pd.Series
129
+ A series of the total counts of each source.
130
+
131
+ Returns
132
+ -------
133
+ edge_sources: pd.DataFrame
134
+ A table of edges and the sources they are assigned to.
135
+ """
136
+
118
137
  nodes = vertices["node"].tolist()
119
138
  present_reactions = sbml_dfs.reactions[sbml_dfs.reactions.index.isin(nodes)]
120
139
 
121
140
  if len(present_reactions) == 0:
122
141
  return None
123
142
 
124
- table_schema = sbml_dfs.schema[SBML_DFS.REACTIONS]
125
- source_df = source.unnest_sources(present_reactions, table_schema["source"])
143
+ source_df = source.unnest_sources(present_reactions)
126
144
 
127
145
  if source_df is None:
128
146
  return None
129
147
  else:
130
- edge_sources = source.greedy_set_coverge_of_sources(source_df, table_schema)
148
+ edge_sources = source.source_set_coverage(
149
+ source_df, source_total_counts, sbml_dfs
150
+ )
131
151
  return edge_sources.reset_index()[
132
152
  [SBML_DFS.R_ID, SOURCE_SPEC.PATHWAY_ID, SOURCE_SPEC.NAME]
133
153
  ]
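A sketch of the new calling convention (vertices and sbml_dfs are assumed to exist; vertices needs a "node" column): supplying per-pathway totals switches edge-source selection from raw pathway size to enrichment.

    from napistu import source
    from napistu.network import ng_utils

    source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")
    edge_sources = ng_utils.get_minimal_sources_edges(
        vertices, sbml_dfs, source_total_counts=source_total_counts
    )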
napistu/network/precompute.py CHANGED
@@ -110,6 +110,62 @@ def precompute_distances(
110
110
  return filtered_precomputed_distances
111
111
 
112
112
 
113
+ def filter_precomputed_distances_top_n(precomputed_distances, top_n=50):
114
+ """
115
+ Filter precomputed distances to only include the top-n pairs for each distance measure.
116
+
117
+ Parameters
118
+ ----------
119
+ precomputed_distances : pd.DataFrame
120
+ Precomputed distances.
121
+ top_n : int, optional
122
+ Top-n pairs to include for each distance measure.
123
+
124
+ Returns
125
+ -------
126
+ pd.DataFrame
127
+ Filtered precomputed distances.
128
+ """
129
+
130
+ # take the union of top-n for each distance measure; and from origin -> dest and dest -> origin
131
+ distance_vars = set(precomputed_distances.columns) - {
132
+ NAPISTU_EDGELIST.SC_ID_ORIGIN,
133
+ NAPISTU_EDGELIST.SC_ID_DEST,
134
+ }
135
+
136
+ valid_pairs = list()
137
+ for distance_var in distance_vars:
138
+ top_n_pairs_by_origin = (
139
+ precomputed_distances.sort_values(by=distance_var, ascending=False)
140
+ .groupby(NAPISTU_EDGELIST.SC_ID_ORIGIN)
141
+ .head(top_n)
142
+ )
143
+ top_n_pairs_by_dest = (
144
+ precomputed_distances.sort_values(by=distance_var, ascending=False)
145
+ .groupby(NAPISTU_EDGELIST.SC_ID_DEST)
146
+ .head(top_n)
147
+ )
148
+
149
+ valid_pairs.append(
150
+ top_n_pairs_by_origin[
151
+ [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
152
+ ]
153
+ )
154
+ valid_pairs.append(
155
+ top_n_pairs_by_dest[
156
+ [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
157
+ ]
158
+ )
159
+
160
+ all_valid_pairs = pd.concat(valid_pairs).drop_duplicates()
161
+
162
+ return precomputed_distances.merge(
163
+ all_valid_pairs,
164
+ on=[NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST],
165
+ how="inner",
166
+ )
167
+
168
+
113
169
  def _calculate_distances_subset(
114
170
  napistu_graph: NapistuGraph,
115
171
  vs_to_partition: pd.DataFrame,
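A usage sketch (precomputed_distances is assumed to be the output of precompute_distances): the union of the per-origin and per-destination top-n pairs is retained for each distance measure.

    from napistu.network import precompute

    filtered = precompute.filter_precomputed_distances_top_n(precomputed_distances, top_n=50)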
napistu/sbml_dfs_utils.py CHANGED
@@ -456,8 +456,14 @@ def infer_entity_type(df: pd.DataFrame) -> str:
456
456
  if entity_schema.get(SCHEMA_DEFS.PK) == df.index.name:
457
457
  return entity_type
458
458
 
459
- # Get DataFrame columns that are also primary keys
460
- df_columns = set(df.columns).intersection(primary_keys)
459
+ # Get DataFrame columns that are also primary keys, including index or MultiIndex names
460
+ index_names = []
461
+ if isinstance(df.index, pd.MultiIndex):
462
+ index_names = [name for name in df.index.names if name is not None]
463
+ elif df.index.name is not None:
464
+ index_names = [df.index.name]
465
+
466
+ df_columns = set(df.columns).union(index_names).intersection(primary_keys)
461
467
 
462
468
  # Check for exact match with primary key + foreign keys
463
469
  for entity_type, entity_schema in schema.items():
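A small illustration of the behavior this change enables (it mirrors the new test added later in this diff): primary keys carried as index or MultiIndex level names now count toward entity-type inference.

    import pandas as pd
    from napistu import sbml_dfs_utils
    from napistu.constants import SBML_DFS

    df = pd.DataFrame({"some_col": [1, 2]})
    df.index = pd.MultiIndex.from_tuples(
        [("rxn1", "a"), ("rxn2", "b")], names=[SBML_DFS.R_ID, "foo"]
    )
    assert sbml_dfs_utils.infer_entity_type(df) == SBML_DFS.REACTIONS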
napistu/source.py CHANGED
@@ -1,8 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import numpy as np
3
4
  import pandas as pd
5
+ from typing import Optional
6
+
4
7
  from napistu import indices
5
- from napistu.constants import SOURCE_SPEC
8
+ from napistu import sbml_dfs_core
9
+ from napistu import sbml_dfs_utils
10
+ from napistu.statistics import hypothesis_testing
11
+ from napistu.constants import SBML_DFS_SCHEMA, SCHEMA_DEFS, SOURCE_SPEC
6
12
 
7
13
 
8
14
  class Source:
@@ -41,11 +47,18 @@ class Source:
41
47
  Creates an empty source object. This is typically used when creating an SBML_dfs
42
48
  object from a single source.
43
49
  pw_index : indices.PWIndex
50
+ a pathway index object containing the pathway_id and other metadata
44
51
 
45
52
  Returns
46
53
  -------
47
54
  None.
48
55
 
56
+ Raises
57
+ ------
58
+ ValueError:
59
+ if pw_index is not an indices.PWIndex
60
+ ValueError:
61
+ if SOURCE_SPEC.MODEL is not present in source_df
49
62
  """
50
63
 
51
64
  if init is True:
@@ -101,8 +114,27 @@ def create_source_table(
101
114
  """
102
115
  Create Source Table
103
116
 
104
- Create a table with one row per "new_id" and a Source object created from the union
105
- of "old_id" Source objects
117
+ Create a table with one row per "new_id" and a Source object created from the union of "old_id" Source objects
118
+
119
+ Parameters
120
+ ----------
121
+ lookup_table: pd.Series
122
+ a pd.Series containing the index of the table to create a source table for
123
+ table_schema: dict
124
+ a dictionary containing the schema of the table to create a source table for
125
+ pw_index: indices.PWIndex
126
+ a pathway index object containing the pathway_id and other metadata
127
+
128
+ Returns
129
+ -------
130
+ source_table: pd.DataFrame
131
+ a pd.DataFrame for the target table, with one row per "new_id" and a
132
+ Source object created from the union of the "old_id" Source objects
133
+
134
+ Raises
135
+ ------
136
+ ValueError:
137
+ if SOURCE_SPEC.SOURCE is not present in table_schema
106
138
  """
107
139
 
108
140
  if SOURCE_SPEC.SOURCE not in table_schema.keys():
@@ -142,8 +174,27 @@ def merge_sources(source_list: list | pd.Series) -> Source:
142
174
 
143
175
  Merge a list of Source objects into a single Source object
144
176
 
177
+ Parameters
178
+ ----------
179
+ source_list: list | pd.Series
180
+ a list of Source objects or a pd.Series of Source objects
181
+
182
+ Returns
183
+ -------
184
+ source: Source
185
+ a Source object created from the union of the Source objects in source_list
186
+
187
+ Raises
188
+ ------
189
+ TypeError:
190
+ if source_list is not a list or pd.Series
145
191
  """
146
192
 
193
+ if not isinstance(source_list, (list, pd.Series)):
194
+ raise TypeError(
195
+ f"source_list must be a list or pd.Series, but was a {type(source_list).__name__}"
196
+ )
197
+
147
198
  # filter to non-empty sources
148
199
  # empty sources have only been initialized; a merge hasn't occurred
149
200
  existing_sources = [s.source is not None for s in source_list]
@@ -160,28 +211,35 @@ def merge_sources(source_list: list | pd.Series) -> Source:
160
211
  return Source(pd.concat(existing_source_list))
161
212
 
162
213
 
163
- def unnest_sources(
164
- source_table: pd.DataFrame, source_var: str, verbose: bool = False
165
- ) -> pd.DataFrame:
214
+ def unnest_sources(source_table: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
166
215
  """
167
216
  Unnest Sources
168
217
 
169
218
  Take a pd.DataFrame containing an array of Sources and
170
219
  return one-row per source.
171
220
 
172
- Parameters:
221
+ Parameters
222
+ ----------
173
223
  source_table: pd.DataFrame
174
224
  a table containing an array of Sources
175
- source_var: str
176
- variable containing Sources
225
+ verbose: bool
226
+ print progress
177
227
 
178
- Returns:
228
+ Returns
229
+ -------
179
230
  pd.DataFrame containing the index of source_table but expanded
180
231
  to include one row per source
181
232
 
182
233
  """
183
234
 
184
235
  sources = list()
236
+
237
+ table_type = sbml_dfs_utils.infer_entity_type(source_table)
238
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
239
+ if SCHEMA_DEFS.SOURCE not in source_table_schema.keys():
240
+ raise ValueError(f"{table_type} does not have a source attribute")
241
+
242
+ source_var = source_table_schema[SCHEMA_DEFS.SOURCE]
185
243
  source_table_index = source_table.index.to_frame().reset_index(drop=True)
186
244
 
187
245
  for i in range(source_table.shape[0]):
@@ -216,53 +274,73 @@ def unnest_sources(
216
274
  return pd.concat(sources)
217
275
 
218
276
 
219
- def greedy_set_coverge_of_sources(
220
- source_df: pd.DataFrame, table_schema: dict
277
+ def source_set_coverage(
278
+ select_sources_df: pd.DataFrame,
279
+ source_total_counts: Optional[pd.Series] = None,
280
+ sbml_dfs: Optional[sbml_dfs_core.SBML_dfs] = None,
221
281
  ) -> pd.DataFrame:
222
282
  """
223
283
  Greedy Set Coverage of Sources
224
284
 
225
- Apply the greedy set coverge algorithm to find the minimal set of
226
- sources which cover all entries
285
+ Find the set of pathways covering `select_sources_df`. If `source_total_counts`
286
+ is provided, pathways will be selected iteratively based on statistical
287
+ enrichment; if it is not provided, the largest pathways
288
+ will be chosen iteratively.
227
289
 
228
- Parameters:
229
- source_df: pd.DataFrame
290
+ Parameters
291
+ ----------
292
+ select_sources_df: pd.DataFrame
230
293
  pd.Dataframe containing the index of source_table but expanded to
231
294
  include one row per source. As produced by source.unnest_sources()
232
-
233
- Returns:
295
+ source_total_counts: pd.Series
296
+ pd.Series containing the total counts of each source. As produced by
297
+ source.get_source_total_counts()
298
+ sbml_dfs: sbml_dfs_core.SBML_dfs
299
+ if `source_total_counts` is provided then `sbml_dfs` must be provided
300
+ to calculate the total number of entities in the table.
301
+
302
+ Returns
303
+ -------
234
304
  minimial_sources: [str]
235
305
  A list of pathway_ids of the minimal source set
236
306
 
237
307
  """
238
308
 
309
+ table_type = sbml_dfs_utils.infer_entity_type(select_sources_df)
310
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
311
+
312
+ if source_total_counts is not None:
313
+ if sbml_dfs is None:
314
+ raise ValueError(
315
+ "If `source_total_counts` is provided, `sbml_dfs` must be provided to calculate the total number of entities in the table."
316
+ )
317
+ n_total_entities = sbml_dfs.get_table(table_type).shape[0]
318
+
239
319
  # rollup pathways with identical membership
240
- deduplicated_sources = _deduplicate_source_df(source_df, table_schema)
320
+ deduplicated_sources = _deduplicate_source_df(select_sources_df)
241
321
 
242
322
  unaccounted_for_members = deduplicated_sources
243
323
  retained_pathway_ids = []
244
-
245
324
  while unaccounted_for_members.shape[0] != 0:
246
325
  # find the pathway with the most members
247
- pathway_members = unaccounted_for_members.groupby(SOURCE_SPEC.PATHWAY_ID).size()
248
- top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
326
+
327
+ if source_total_counts is None:
328
+ top_pathway = _select_top_pathway_by_size(unaccounted_for_members)
329
+ else:
330
+ top_pathway = _select_top_pathway_by_enrichment(
331
+ unaccounted_for_members, source_total_counts, n_total_entities, pk
332
+ )
333
+
334
+ if top_pathway is None:
335
+ break
336
+
249
337
  retained_pathway_ids.append(top_pathway)
250
338
 
251
339
  # remove all members associated with the top pathway
252
- members_captured = (
253
- unaccounted_for_members[
254
- unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
255
- ]
256
- .index.get_level_values(table_schema["pk"])
257
- .tolist()
340
+ unaccounted_for_members = _update_unaccounted_for_members(
341
+ top_pathway, unaccounted_for_members
258
342
  )
259
343
 
260
- unaccounted_for_members = unaccounted_for_members[
261
- ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
262
- members_captured
263
- )
264
- ]
265
-
266
344
  minimial_sources = deduplicated_sources[
267
345
  deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
268
346
  ].sort_index()
@@ -270,9 +348,39 @@ def greedy_set_coverge_of_sources(
270
348
  return minimial_sources
271
349
 
272
350
 
273
- def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
351
+ def get_source_total_counts(
352
+ sbml_dfs: sbml_dfs_core.SBML_dfs, entity_type: str
353
+ ) -> pd.Series:
354
+ """
355
+ Get the total counts of each source.
356
+
357
+ Parameters
358
+ ----------
359
+ sbml_dfs: sbml_dfs_core.SBML_dfs
360
+ sbml_dfs object containing the table to get the total counts of
361
+ entity_type: str
362
+ the type of entity to get the total counts of
363
+
364
+ Returns
365
+ -------
366
+ source_total_counts: pd.Series
367
+ pd.Series containing the total counts of each source.
368
+ """
369
+
370
+ all_sources_table = unnest_sources(sbml_dfs.get_table(entity_type))
371
+ source_total_counts = all_sources_table.value_counts(SOURCE_SPEC.PATHWAY_ID).rename(
372
+ "total_counts"
373
+ )
374
+
375
+ return source_total_counts
376
+
377
+
378
+ def _deduplicate_source_df(source_df: pd.DataFrame) -> pd.DataFrame:
274
379
  """Combine entries in a source table when multiple models have the same members."""
275
380
 
381
+ table_type = sbml_dfs_utils.infer_entity_type(source_df)
382
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
383
+
276
384
  # drop entries which are missing required attributes and throw an error if none are left
277
385
  REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
278
386
  indexed_sources = (
@@ -296,7 +404,11 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da
296
404
  {
297
405
  SOURCE_SPEC.PATHWAY_ID: p,
298
406
  "membership_string": "_".join(
299
- set(indexed_sources.loc[[p]][table_schema["pk"]].tolist())
407
+ set(
408
+ indexed_sources.loc[[p]][
409
+ source_table_schema[SCHEMA_DEFS.PK]
410
+ ].tolist()
411
+ )
300
412
  ),
301
413
  }
302
414
  for p in pathways
@@ -320,16 +432,16 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da
320
432
 
321
433
  merged_sources = pd.concat(
322
434
  [
323
- _collapse_by_membership_string(s, membership_categories, table_schema) # type: ignore
435
+ _collapse_by_membership_string(s, membership_categories, source_table_schema) # type: ignore
324
436
  for s in category_index.tolist()
325
437
  ]
326
438
  )
327
439
  merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
328
- table_schema["pk"]
440
+ source_table_schema[SCHEMA_DEFS.PK]
329
441
  ).cumcount()
330
442
 
331
443
  return merged_sources.set_index(
332
- [table_schema["pk"], SOURCE_SPEC.INDEX_NAME]
444
+ [source_table_schema[SCHEMA_DEFS.PK], SOURCE_SPEC.INDEX_NAME]
333
445
  ).sort_index()
334
446
 
335
447
 
@@ -345,7 +457,10 @@ def _collapse_by_membership_string(
345
457
  return pd.DataFrame(
346
458
  [
347
459
  pd.concat(
348
- [pd.Series({table_schema["pk"]: ms}), collapsed_source_membership]
460
+ [
461
+ pd.Series({table_schema[SCHEMA_DEFS.PK]: ms}),
462
+ collapsed_source_membership,
463
+ ]
349
464
  )
350
465
  for ms in membership_string.split("_")
351
466
  ]
@@ -398,3 +513,91 @@ def _safe_source_merge(member_Sources: Source | list) -> Source:
398
513
  return merge_sources(member_Sources.tolist())
399
514
  else:
400
515
  raise TypeError("Expecting source.Source or pd.Series")
516
+
517
+
518
+ def _select_top_pathway_by_size(unaccounted_for_members: pd.DataFrame) -> str:
519
+
520
+ pathway_members = unaccounted_for_members.value_counts(SOURCE_SPEC.PATHWAY_ID)
521
+ top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
522
+
523
+ return top_pathway
524
+
525
+
526
+ def _select_top_pathway_by_enrichment(
527
+ unaccounted_for_members: pd.DataFrame,
528
+ source_total_counts: pd.Series,
529
+ n_total_entities: int,
530
+ table_pk: str,
531
+ min_pw_size: int = 5,
532
+ ) -> str:
533
+
534
+ n_observed_entities = len(
535
+ unaccounted_for_members.index.get_level_values(table_pk).unique()
536
+ )
537
+ pathway_members = unaccounted_for_members.value_counts(
538
+ SOURCE_SPEC.PATHWAY_ID
539
+ ).rename("observed_members")
540
+
541
+ pathway_members = pathway_members.loc[pathway_members >= min_pw_size]
542
+ if pathway_members.shape[0] == 0:
543
+ return None
544
+
545
+ wide_contingency_table = (
546
+ pathway_members.to_frame()
547
+ .join(source_total_counts)
548
+ .assign(
549
+ missing_members=lambda x: x["total_counts"] - x["observed_members"],
550
+ observed_nonmembers=lambda x: n_observed_entities - x["observed_members"],
551
+ nonobserved_nonmembers=lambda x: n_total_entities
552
+ - x["observed_nonmembers"]
553
+ - x["missing_members"]
554
+ - x["observed_members"],
555
+ )
556
+ .drop(columns=["total_counts"])
557
+ )
558
+
559
+ # calculate enrichments using a fast vectorized normal approximation
560
+ odds_ratios, _ = hypothesis_testing.fisher_exact_vectorized(
561
+ wide_contingency_table["observed_members"],
562
+ wide_contingency_table["missing_members"],
563
+ wide_contingency_table["observed_nonmembers"],
564
+ wide_contingency_table["nonobserved_nonmembers"],
565
+ )
566
+
567
+ return pathway_members.index[np.argmax(odds_ratios)]
568
+
569
+
570
+ def _update_unaccounted_for_members(
571
+ top_pathway, unaccounted_for_members
572
+ ) -> pd.DataFrame:
573
+ """
574
+ Update the unaccounted for members dataframe by removing the members
575
+ associated with the top pathway.
576
+
577
+ Parameters
578
+ ----------
579
+ top_pathway: str
580
+ the pathway to remove from the unaccounted for members
581
+ unaccounted_for_members: pd.DataFrame
582
+ the dataframe of unaccounted for members
583
+
584
+ Returns
585
+ -------
586
+ unaccounted_for_members: pd.DataFrame
587
+ the dataframe of unaccounted for members with the top pathway removed
588
+ """
589
+
590
+ table_type = sbml_dfs_utils.infer_entity_type(unaccounted_for_members)
591
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
592
+
593
+ members_captured = (
594
+ unaccounted_for_members[
595
+ unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
596
+ ]
597
+ .index.get_level_values(pk)
598
+ .tolist()
599
+ )
600
+
601
+ return unaccounted_for_members[
602
+ ~unaccounted_for_members.index.get_level_values(pk).isin(members_captured)
603
+ ]
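A usage sketch of the reworked set-cover API (sbml_dfs is assumed to be an existing SBML_dfs object; the calls mirror the updated tests): coverage can be computed by pathway size alone, or by enrichment when per-pathway totals are supplied.

    from napistu import source

    source_df = source.unnest_sources(sbml_dfs.reactions)
    by_size = source.source_set_coverage(source_df)

    source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")
    by_enrichment = source.source_set_coverage(
        source_df, source_total_counts=source_total_counts, sbml_dfs=sbml_dfs
    )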
napistu/statistics/hypothesis_testing.py ADDED
@@ -0,0 +1,66 @@
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ from scipy.stats import norm
5
+
6
+
7
+ def fisher_exact_vectorized(
8
+ observed_members: Union[list[int], np.ndarray],
9
+ missing_members: Union[list[int], np.ndarray],
10
+ observed_nonmembers: Union[list[int], np.ndarray],
11
+ nonobserved_nonmembers: Union[list[int], np.ndarray],
12
+ ) -> tuple[np.ndarray, np.ndarray]:
13
+ """
14
+ Fast vectorized one-tailed Fisher exact test using normal approximation.
15
+
16
+ Parameters:
17
+ -----------
18
+ observed_members, missing_members, observed_nonmembers, nonobserved_nonmembers : array-like
19
+ The four cells of the 2x2 contingency tables (must be non-negative)
20
+
21
+ Returns:
22
+ --------
23
+ odds_ratios : numpy array
24
+ Odds ratios for each test
25
+ p_values : numpy array
26
+ One-tailed p-values (tests for enrichment)
27
+ """
28
+ # Convert to numpy arrays
29
+ a = np.array(observed_members, dtype=float)
30
+ b = np.array(missing_members, dtype=float)
31
+ c = np.array(observed_nonmembers, dtype=float)
32
+ d = np.array(nonobserved_nonmembers, dtype=float)
33
+
34
+ # Check for negative values and raise error
35
+ if np.any((a < 0) | (b < 0) | (c < 0) | (d < 0)):
36
+ raise ValueError("All contingency table values must be non-negative")
37
+
38
+ # Calculate odds ratios
39
+ odds_ratios = np.divide(
40
+ a * d, b * c, out=np.full_like(a, np.inf, dtype=float), where=(b * c) != 0
41
+ )
42
+
43
+ # Normal approximation to hypergeometric distribution
44
+ n = a + b + c + d
45
+
46
+ # Avoid division by zero in expected value calculation
47
+ expected_a = np.divide(
48
+ (a + b) * (a + c), n, out=np.zeros_like(n, dtype=float), where=n != 0
49
+ )
50
+
51
+ # Variance calculation with protection against division by zero
52
+ var_a = np.divide(
53
+ (a + b) * (c + d) * (a + c) * (b + d),
54
+ n * n * (n - 1),
55
+ out=np.ones_like(n, dtype=float), # Default to 1 to avoid sqrt(0)
56
+ where=(n > 1),
57
+ )
58
+ var_a = np.maximum(var_a, 1e-10) # Ensure positive variance
59
+
60
+ # Continuity correction and z-score
61
+ z = (a - expected_a - 0.5) / np.sqrt(var_a)
62
+
63
+ # One-tailed p-value (upper tail for enrichment)
64
+ p_values = norm.sf(z) # 1 - norm.cdf(z)
65
+
66
+ return odds_ratios, p_values
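A quick worked example (mirroring the new tests): for the 2x2 table [[1, 9], [11, 3]] the odds ratio is (1*3)/(9*11) ≈ 0.030, and the one-tailed p-value comes from the normal approximation z = (a - E[a] - 0.5) / sqrt(Var[a]) with the hypergeometric mean and variance.

    from napistu.statistics import hypothesis_testing

    odds, p = hypothesis_testing.fisher_exact_vectorized([1], [9], [11], [3])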
napistu/utils.py CHANGED
@@ -14,7 +14,7 @@ import zipfile
14
14
  from contextlib import closing
15
15
  from itertools import starmap
16
16
  from textwrap import fill
17
- from typing import Any, List, Optional, Union
17
+ from typing import Any, Dict, Optional, List, Union
18
18
  from urllib.parse import urlparse
19
19
  from pathlib import Path
20
20
  from requests.adapters import HTTPAdapter
@@ -1131,6 +1131,28 @@ def safe_fill(x: str, fill_width: int = 15) -> str:
1131
1131
  return fill(x, fill_width)
1132
1132
 
1133
1133
 
1134
+ def match_regex_dict(s: str, regex_dict: Dict[str, Any]) -> Optional[Any]:
1135
+ """
1136
+ Apply each regex in regex_dict to the string s. If a regex matches, return its value.
1137
+ If no regex matches, return None.
1138
+
1139
+ Parameters
1140
+ ----------
1141
+ s : str
1142
+ The string to test.
1143
+ regex_dict : dict
1144
+ Dictionary where keys are regex patterns (str), and values are the values to return.
1145
+
1146
+ Returns
1147
+ -------
1148
+ The value associated with the first matching regex, or None if no match.
1149
+ """
1150
+ for pattern, value in regex_dict.items():
1151
+ if re.search(pattern, s):
1152
+ return value
1153
+ return None
1154
+
1155
+
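A minimal sketch of the new helper (the rule dictionary below is illustrative): patterns are tried in insertion order and the first re.search hit wins.

    from napistu import utils

    rules = {"catalyzed by": "catalyst", "phosphorylat": "modifier"}
    utils.match_regex_dict("A is catalyzed by B", rules)   # -> "catalyst"
    utils.match_regex_dict("A binds B", rules)             # -> None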
1134
1156
  def _add_nameness_score_wrapper(df, name_var, table_schema):
1135
1157
  """Call _add_nameness_score with default value."""
1136
1158
 
napistu-0.4.2.dist-info/METADATA → napistu-0.4.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: napistu
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Connecting high-dimensional data to curated pathways
5
5
  Home-page: https://github.com/napistu/napistu-py
6
6
  Author: Sean Hackett
napistu-0.4.2.dist-info/RECORD → napistu-0.4.3.dist-info/RECORD RENAMED
@@ -1,13 +1,13 @@
1
1
  napistu/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
2
2
  napistu/__main__.py,sha256=xwlbh_0Ig3a-yG6BIJRiDPSN9R2HnX2pEBvlodlO6h4,29015
3
- napistu/consensus.py,sha256=xWXiqIM6ot-SSPJZXTrVpohbINSCkZXBtRi-5REfk_g,69897
3
+ napistu/consensus.py,sha256=SDw58vkDivzy5AiOQUnf5vUbFxmSrMGMMmptDMZhk0E,69807
4
4
  napistu/constants.py,sha256=8sp1l0cxu2rsnCrWBEEwhcBKvDtc4u0D0f_72zILLW0,13427
5
5
  napistu/identifiers.py,sha256=e2-nTVzr5AINa0y1ER9218bKXyF2kAeJ9At22S4Z00o,33914
6
6
  napistu/indices.py,sha256=Zjg3gE0JQ3T879lCPazYg-WXVE6hvcAr713ZKpJ32rk,9830
7
7
  napistu/sbml_dfs_core.py,sha256=s0OyoHs-AjOcbZu1d3KNkW_PI7Rxbhu5ZLpfQeO4iY8,72639
8
- napistu/sbml_dfs_utils.py,sha256=w5dFcJFDKnKDK9jxPOCuCW8IccxdXmyNmP9vCUhVdf8,46184
9
- napistu/source.py,sha256=UGpN70bqbC9gnKmM0ivSdQYim9hfzgABeXoQKzRr9oU,13646
10
- napistu/utils.py,sha256=PEAsLn7VGN8JlNJQcAMYpjF1gr2mWmb5IqBsypP9hi0,35768
8
+ napistu/sbml_dfs_utils.py,sha256=SOy1Ii2hDFOfQa7pFAJS9EfAmfBVD_sHvDJBVmCN_p8,46456
9
+ napistu/source.py,sha256=iDDKpN-4k_W_tyxEjqe_z-yPJv7uoFRRBhkiBtOH5C8,20416
10
+ napistu/utils.py,sha256=p2sJxTklmV30XS6hanJRjcdfgeaZpkULuMyQX3BPP0c,36404
11
11
  napistu/context/__init__.py,sha256=LQBEqipcHKK0E5UlDEg1ct-ymCs93IlUrUaH8BCevf0,242
12
12
  napistu/context/discretize.py,sha256=Qq7zg46F_I-PvQIT2_pEDQV7YEtUQCxKoRvT5Gu9QsE,15052
13
13
  napistu/context/filtering.py,sha256=l1oq-43ysSGqU9VmhTOO_pYT4DSMf20yxvktPC1MI0I,13696
@@ -17,13 +17,14 @@ napistu/gcs/downloads.py,sha256=SvGv9WYr_Vt3guzyz1QiAuBndeKPTBtWSFLj1-QbLf4,6348
17
17
  napistu/gcs/utils.py,sha256=eLSsvewWJdCguyj2k0ozUGP5BTemaE1PZg41Z3aY5kM,571
18
18
  napistu/ingestion/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
19
19
  napistu/ingestion/bigg.py,sha256=f65--8ARe248eYCUJpFMF284Wz53sLyFyBuwelxHmJA,4340
20
- napistu/ingestion/constants.py,sha256=9UP47VImZ11q0kz17N3EJg2155USqLewwNWyKpA-cbA,8089
20
+ napistu/ingestion/constants.py,sha256=jo3v8Z7Y_tNNhTmEcokVOh1HBJFAXc-Z38S4mG58qfo,10059
21
21
  napistu/ingestion/gtex.py,sha256=X0hSC1yrpf4xSJWFhpeNcnHwJzKDII2MvjfUqYA0JN8,3720
22
22
  napistu/ingestion/hpa.py,sha256=R27ExrryKQ4Crxv9ATXmBJCa-yd01TMOrDjkeBhIQac,5054
23
23
  napistu/ingestion/identifiers_etl.py,sha256=6ppDUA6lEZurdmVbiFLOUzphYbr-hndMhtqsQnq_yAc,5009
24
24
  napistu/ingestion/napistu_edgelist.py,sha256=4RLXsoIk_-Atu-Nqme_t1JpEpBET26VIY2Y_Hcd3sMw,3580
25
25
  napistu/ingestion/obo.py,sha256=AQkIPWbjA464Lma0tx91JucWkIwLjC7Jgv5VHGRTDkE,9601
26
26
  napistu/ingestion/psi_mi.py,sha256=5eJjm7XWogL9oTyGqR52kntHClLwLsTePKqCvUGyi-w,10111
27
+ napistu/ingestion/reactom_fi.py,sha256=hKdOY2wNtcNk6WlnHnNalryiXv6mtcWUiBW9isXPB0Y,6991
27
28
  napistu/ingestion/reactome.py,sha256=Hn9X-vDp4o_HK-OtaQvel3vJeZ8_TC1-4N2rruK9Oks,7099
28
29
  napistu/ingestion/sbml.py,sha256=l8Z98yWuOIRGns8G4UNnoQz7v_xmukZb_IZ_5ye34Ko,25296
29
30
  napistu/ingestion/string.py,sha256=go1WGTkoLJejX7GQWf9bFeInFGAw4jNSpS2B_Zr5f_s,11364
@@ -66,9 +67,9 @@ napistu/network/net_create.py,sha256=66kV_xoWnu4BVLaJZ1TAC7wBSsjPDqjoAXH-X9ShV3s
66
67
  napistu/network/net_create_utils.py,sha256=zajwaz2xAij_9fEnD77SgBw_EnNAnJ8jBCmmK2rk_bA,24672
67
68
  napistu/network/net_propagation.py,sha256=Il5nDOWh3nLz8gRhDFHGp2LxcvJ9C1twiSZjDeiZMUo,23490
68
69
  napistu/network/ng_core.py,sha256=dGnTUKR4WtnvaYMyIHqqF55FY4mJSa7wjA2LZ4cVB6U,11720
69
- napistu/network/ng_utils.py,sha256=c1tHXz_JcH01D5KovNQmRLTEVxpCkCe36otULq-liz8,15579
70
+ napistu/network/ng_utils.py,sha256=ahSm-8M2pV662V7MMVcGaoguBM55_y-F7LDmZSVp9ag,15951
70
71
  napistu/network/paths.py,sha256=r6LVKVvX7i3ctBA5r-xvHfpH5Zsd0VDHUCtin2iag20,17453
71
- napistu/network/precompute.py,sha256=ibL0ByY7Wp5kEfIG3LUDpQKdvAeQX0DNkT_46g2YrGc,8367
72
+ napistu/network/precompute.py,sha256=ARU2tktWnxFISaHAY8chpkg8pusZPv7TT5jSIB9eFF0,10081
72
73
  napistu/ontologies/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
73
74
  napistu/ontologies/constants.py,sha256=GyOFvezSxDK1VigATcruTKtNhjcYaid1ggulEf_HEtQ,4345
74
75
  napistu/ontologies/dogma.py,sha256=VVj6NKBgNym4SdOSu8g22OohALj7cbObhIJmdY2Sfy0,8860
@@ -84,8 +85,9 @@ napistu/scverse/__init__.py,sha256=Lgxr3iMQAkTzXE9BNz93CndNP5djzerLvmHM-D0PU3I,3
84
85
  napistu/scverse/constants.py,sha256=0iAkhyJUIeFGHdLLU3fCaEU1O3Oix4qAsxr3CxGTjVs,653
85
86
  napistu/scverse/loading.py,sha256=jqiE71XB-wdV50GyZrauFNY0Lai4bX9Fm2Gv80VR8t8,27016
86
87
  napistu/statistics/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
88
+ napistu/statistics/hypothesis_testing.py,sha256=k0mBFAMF0XHVcKwS26aPnEbq_FIUVwXU1gZ6cKfFbCk,2190
87
89
  napistu/statistics/quantiles.py,sha256=1-LnmVzC2CQWxCKUh0yi6YfKrbsZM1-kkD7nu2-aS5s,3042
88
- napistu-0.4.2.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
90
+ napistu-0.4.3.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
89
91
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
92
  tests/conftest.py,sha256=t-GHb0MvSsC-MyhkFpOy2K3t5fi7eaig_Rc2xEQC-t8,9678
91
93
  tests/test_consensus.py,sha256=Hzfrgp4SpkRDnEMVMD3f0UInSycndB8kKzC4wDDvRas,15076
@@ -114,7 +116,7 @@ tests/test_network_net_propagation.py,sha256=kZeDHD93iMrLVvxO4OyfRH5_vgsYeQyC40O
114
116
  tests/test_network_ng_core.py,sha256=w-iNBTtenennJhaLFauk952pEsk7W0-Fa8lPvIRqHyY,628
115
117
  tests/test_network_ng_utils.py,sha256=QVVuRnvCRfTSIlGdwQTIF9lr0wOwoc5gGeXAUY_AdgE,713
116
118
  tests/test_network_paths.py,sha256=TWZnxY5bF3m6gahcxcYJGrBIawh2-_vUcec1LyPmXV8,1686
117
- tests/test_network_precompute.py,sha256=zwJrKNC3s8rIrsyAQfQMYxbl8HZXUr7u09nMJ_K8jiU,9005
119
+ tests/test_network_precompute.py,sha256=IPr1KhtxBD0fXx_2TvZqnevrD-Iig35otb8yloRFpRc,10014
118
120
  tests/test_ontologies_genodexito.py,sha256=6fINyUiubHZqu7qxye09DQfJXw28ZMAJc3clPb-cCoY,2298
119
121
  tests/test_ontologies_id_tables.py,sha256=CpwpbmQvTc1BaVd6jbDKHAVE2etwN0vx93nC8jpnMlE,7265
120
122
  tests/test_ontologies_mygene.py,sha256=VkdRcKIWmcG6V-2dpfvsBiOJN5dO-j0RqZNxtJRcyBU,1583
@@ -124,18 +126,18 @@ tests/test_rpy2_callr.py,sha256=V4a-QH5krgYOQRgqzksMzIkGAFjBqKOAqgprxrH6bE0,2904
124
126
  tests/test_rpy2_init.py,sha256=T3gnxC1O7XNvYM2P4018ikpPPAy-kwQLm7Erj0RfA-4,5895
125
127
  tests/test_sbml.py,sha256=f25zj1NogYrmLluvBDboLameTuCiQ309433Qn3iPvhg,1483
126
128
  tests/test_sbml_dfs_core.py,sha256=nnLPpZTVtCznOBohk7CX67x6sMqktJWt-sZMWQKoaDs,26521
127
- tests/test_sbml_dfs_utils.py,sha256=gWIhzUEtQlOR9c1TiCyhlSAELmWnBSncn6vCEqH5hl0,11029
129
+ tests/test_sbml_dfs_utils.py,sha256=ZD9x2B81fsfYEjAV9wphHOR7ywjNcfvfw1LGNv4PxUA,11471
128
130
  tests/test_sbo.py,sha256=x_PENFaXYsrZIzOZu9cj_Wrej7i7SNGxgBYYvcigLs0,308
129
131
  tests/test_scverse_loading.py,sha256=bnU1lQSYYWhOAs0IIBoi4ZohqPokDQJ0n_rtkAfEyMU,29948
130
- tests/test_set_coverage.py,sha256=J-6m6LuOjcQa9pxRuWglSfJk4Ltm7kt_eOrn_Q-7P6Q,1604
131
- tests/test_source.py,sha256=hT0IlpexR5zP0OhWl5BBaho9d1aCYQlFZLwRIRRnw_Y,1969
132
+ tests/test_source.py,sha256=iV-Yyu8flhIGWF17SCL8msG2bjqwb9w2IZ694b0iZ-o,2985
133
+ tests/test_statistics_hypothesis_testing.py,sha256=qD-oS9zo5JlH-jdtiOrWAKI4nKFuZvvh6361_pFSpIs,2259
132
134
  tests/test_statistics_quantiles.py,sha256=yNDeqwgbP-1Rx3C_dLX_wnwT_Lr-iJWClmeKmElqmTE,4984
133
135
  tests/test_uncompartmentalize.py,sha256=nAk5kfAVLU9a2VWe2x2HYVcKqj-EnwmwddERIPRax8c,1289
134
136
  tests/test_utils.py,sha256=qPSpV-Q9b6vmdycgaDmQqtcvzKnAVnN9j5xJ9x-T6bg,23959
135
137
  tests/utils.py,sha256=SoWQ_5roJteFGcMaOeEiQ5ucwq3Z2Fa3AAs9iXHTsJY,749
136
138
  tests/test_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
- napistu-0.4.2.dist-info/METADATA,sha256=6P_9Mmno6pVu4Me-3QdcMtiGOhCcajTqm5LP_Hns4lI,4078
138
- napistu-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
139
- napistu-0.4.2.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
140
- napistu-0.4.2.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
141
- napistu-0.4.2.dist-info/RECORD,,
139
+ napistu-0.4.3.dist-info/METADATA,sha256=gV0a41vyQ52Ja15QyLSPGfeIJPj6oQRTC00HsxJjG88,4078
140
+ napistu-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
141
+ napistu-0.4.3.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
142
+ napistu-0.4.3.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
143
+ napistu-0.4.3.dist-info/RECORD,,
tests/test_network_precompute.py CHANGED
@@ -276,3 +276,33 @@ def test_precomputed_distances_serialization():
276
276
  # Clean up the temporary file
277
277
  if os.path.exists(temp_path):
278
278
  os.remove(temp_path)
279
+
280
+
281
+ def test_filter_precomputed_distances_top_n_subset():
282
+ # Use a small top_n for a quick test
283
+ top_n = 5
284
+ filtered = precompute.filter_precomputed_distances_top_n(
285
+ precomputed_distances, top_n=top_n
286
+ )
287
+ # Check that the filtered DataFrame is a subset of the original
288
+ merged = filtered.merge(
289
+ precomputed_distances,
290
+ on=[
291
+ precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
292
+ precompute.NAPISTU_EDGELIST.SC_ID_DEST,
293
+ ],
294
+ how="left",
295
+ indicator=True,
296
+ )
297
+ assert (
298
+ merged["_merge"] == "both"
299
+ ).all(), "Filtered rows must be present in the original DataFrame"
300
+ # Check that columns are preserved
301
+ assert set(
302
+ [
303
+ precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
304
+ precompute.NAPISTU_EDGELIST.SC_ID_DEST,
305
+ ]
306
+ ).issubset(filtered.columns)
307
+ # Optionally, check that the number of rows is less than or equal to the input
308
+ assert filtered.shape[0] <= precomputed_distances.shape[0]
tests/test_sbml_dfs_utils.py CHANGED
@@ -334,3 +334,16 @@ def test_infer_entity_type_errors():
334
334
  ) # Two primary keys
335
335
  with pytest.raises(ValueError):
336
336
  sbml_dfs_utils.infer_entity_type(df)
337
+
338
+
339
+ def test_infer_entity_type_multindex_reactions():
340
+ # DataFrame with MultiIndex (r_id, foo), should infer as reactions
341
+ import pandas as pd
342
+ from napistu.constants import SBML_DFS
343
+
344
+ df = pd.DataFrame({"some_col": [1, 2]})
345
+ df.index = pd.MultiIndex.from_tuples(
346
+ [("rxn1", "a"), ("rxn2", "b")], names=[SBML_DFS.R_ID, "foo"]
347
+ )
348
+ result = sbml_dfs_utils.infer_entity_type(df)
349
+ assert result == SBML_DFS.REACTIONS
tests/test_source.py CHANGED
@@ -5,6 +5,8 @@ import os
5
5
  import pandas as pd
6
6
  from napistu import indices
7
7
  from napistu import source
8
+ from napistu.network import ng_utils
9
+ from napistu.constants import SBML_DFS
8
10
 
9
11
  test_path = os.path.abspath(os.path.join(__file__, os.pardir))
10
12
  test_data = os.path.join(test_path, "test_data")
@@ -58,10 +60,40 @@ def test_source_w_pwindex():
58
60
  assert source_obj.source.shape == (2, 8)
59
61
 
60
62
 
61
- ################################################
62
- # __main__
63
- ################################################
63
+ def test_get_minimal_source_edges(sbml_dfs_metabolism):
64
+ vertices = sbml_dfs_metabolism.reactions.reset_index().rename(
65
+ columns={SBML_DFS.R_ID: "node"}
66
+ )
67
+
68
+ minimal_source_edges = ng_utils.get_minimal_sources_edges(
69
+ vertices, sbml_dfs_metabolism
70
+ )
71
+ # print(minimal_source_edges.shape)
72
+ assert minimal_source_edges.shape == (87, 3)
73
+
74
+
75
+ def test_source_set_coverage(sbml_dfs_metabolism):
76
+
77
+ source_df = source.unnest_sources(sbml_dfs_metabolism.reactions)
78
+
79
+ # print(source_df.shape)
80
+ assert source_df.shape == (111, 7)
81
+
82
+ set_coverage = source.source_set_coverage(source_df)
83
+ # print(set_coverage.shape)
84
+ assert set_coverage.shape == (87, 6)
85
+
86
+
87
+ def test_source_set_coverage_enrichment(sbml_dfs_metabolism):
88
+
89
+ source_total_counts = source.get_source_total_counts(
90
+ sbml_dfs_metabolism, "reactions"
91
+ )
92
+
93
+ source_df = source.unnest_sources(sbml_dfs_metabolism.reactions).head(40)
94
+
95
+ set_coverage = source.source_set_coverage(
96
+ source_df, source_total_counts=source_total_counts, sbml_dfs=sbml_dfs_metabolism
97
+ )
64
98
 
65
- if __name__ == "__main__":
66
- test_source()
67
- test_source_w_pwindex()
99
+ assert set_coverage.shape == (30, 6)
tests/test_statistics_hypothesis_testing.py ADDED
@@ -0,0 +1,62 @@
1
+ import numpy as np
2
+ from scipy.stats import fisher_exact
3
+
4
+ from napistu.statistics import hypothesis_testing
5
+
6
+
7
+ def test_fisher_exact_vectorized_basic_and_vectorized():
8
+
9
+ # Classic Fisher's test example: [[1, 9], [11, 3]]
10
+ # a=1, b=9, c=11, d=3
11
+ odds, p = hypothesis_testing.fisher_exact_vectorized([1], [9], [11], [3])
12
+ # Odds ratio: (1*3)/(9*11) = 3/99 = 0.0303...
13
+ assert np.allclose(odds, [3 / 99])
14
+ assert p.shape == (1,)
15
+ assert (p >= 0).all() and (p <= 1).all()
16
+
17
+ # Vectorized: two tables
18
+ odds, p = hypothesis_testing.fisher_exact_vectorized(
19
+ [1, 2], [9, 8], [11, 10], [3, 4]
20
+ )
21
+ assert odds.shape == (2,)
22
+ assert p.shape == (2,)
23
+ # Check that odds ratios are correct
24
+ expected_odds = np.array([(1 * 3) / (9 * 11), (2 * 4) / (8 * 10)])
25
+ assert np.allclose(odds, expected_odds)
26
+ # P-values should be between 0 and 1
27
+ assert (p >= 0).all() and (p <= 1).all()
28
+
29
+
30
+ def test_fisher_exact_vectorized_vs_scipy():
31
+
32
+ # Define several 2x2 tables
33
+ tables = [
34
+ ([1], [9], [11], [3]),
35
+ ([5], [2], [8], [7]),
36
+ ([10], [10], [10], [10]),
37
+ ([0], [5], [5], [10]),
38
+ ([3], [7], [2], [8]),
39
+ ]
40
+ for a, b, c, d in tables:
41
+ odds_vec, p_vec = hypothesis_testing.fisher_exact_vectorized(a, b, c, d)
42
+ # Build the table for scipy
43
+ table = np.array([[a[0], b[0]], [c[0], d[0]]])
44
+ odds_scipy, p_scipy = fisher_exact(table, alternative="greater")
45
+ # Odds ratios should be nearly identical
46
+ assert np.allclose(odds_vec, [odds_scipy], rtol=1e-6, atol=1e-8)
47
+ # P-values should be close (normal approx vs exact)
48
+ assert np.allclose(
49
+ p_vec, [p_scipy], rtol=0.15, atol=1e-3
50
+ ) # allow some tolerance
51
+
52
+ # Also test vectorized input
53
+ a = [1, 5, 10, 0, 3]
54
+ b = [9, 2, 10, 5, 7]
55
+ c = [11, 8, 10, 5, 2]
56
+ d = [3, 7, 10, 10, 8]
57
+ odds_vec, p_vec = hypothesis_testing.fisher_exact_vectorized(a, b, c, d)
58
+ for i in range(len(a)):
59
+ table = np.array([[a[i], b[i]], [c[i], d[i]]])
60
+ odds_scipy, p_scipy = fisher_exact(table, alternative="greater")
61
+ assert np.allclose(odds_vec[i], odds_scipy, rtol=1e-6, atol=1e-8)
62
+ assert np.allclose(p_vec[i], p_scipy, rtol=0.15, atol=1e-3)
tests/test_set_coverage.py DELETED
@@ -1,50 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from napistu import source
4
- from napistu.network import ng_utils
5
-
6
-
7
- def test_get_minimal_source_edges(sbml_dfs_metabolism):
8
- vertices = sbml_dfs_metabolism.reactions.reset_index().rename(
9
- columns={"r_id": "node"}
10
- )
11
-
12
- minimal_source_edges = ng_utils.get_minimal_sources_edges(
13
- vertices, sbml_dfs_metabolism
14
- )
15
- # print(minimal_source_edges.shape)
16
- assert minimal_source_edges.shape == (87, 3)
17
-
18
-
19
- def test_greedy_set_coverge_of_sources(sbml_dfs_metabolism):
20
- table_schema = sbml_dfs_metabolism.schema["reactions"]
21
-
22
- source_df = source.unnest_sources(
23
- sbml_dfs_metabolism.reactions, source_var="r_Source"
24
- )
25
- # print(source_df.shape)
26
- assert source_df.shape == (111, 7)
27
-
28
- set_coverage = source.greedy_set_coverge_of_sources(source_df, table_schema)
29
- # print(set_coverage.shape)
30
- assert set_coverage.shape == (87, 6)
31
-
32
-
33
- ################################################
34
- # __main__
35
- ################################################
36
-
37
- if __name__ == "__main__":
38
- import os
39
- from napistu import indices
40
- from napistu import consensus
41
-
42
- test_path = os.path.abspath(os.path.join(__file__, os.pardir))
43
- test_data = os.path.join(test_path, "test_data")
44
-
45
- pw_index = indices.PWIndex(os.path.join(test_data, "pw_index_metabolism.tsv"))
46
- sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index)
47
- sbml_dfs_metabolism = consensus.construct_consensus_model(sbml_dfs_dict, pw_index)
48
-
49
- test_get_minimal_source_edges(sbml_dfs_metabolism)
50
- test_greedy_set_coverge_of_sources(sbml_dfs_metabolism)