napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -145
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +47 -65
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +156 -19
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev7.dist-info/RECORD +0 -98
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/context/filtering.py ADDED
@@ -0,0 +1,387 @@
+ import copy
+ import logging
+ from typing import Union, List, Optional
+
+ import pandas as pd
+
+ from napistu import sbml_dfs_core
+ from napistu import utils
+ from napistu.constants import SBML_DFS
+
+ logger = logging.getLogger(__name__)
+
+
+ def filter_species_by_attribute(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     species_data_table: str,
+     attribute_name: str,
+     attribute_value: Union[int, bool, str, List[str]],
+     negate: bool = False,
+     inplace: bool = True,
+ ) -> Optional[sbml_dfs_core.SBML_dfs]:
+     """
+     Filter species in the SBML_dfs based on an attribute value.
+
+     Parameters
+     ----------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The SBML_dfs object to filter.
+     species_data_table : str
+         The name of the species data table to filter.
+     attribute_name : str
+         The name of the attribute to filter on.
+     attribute_value : Union[int, bool, str, List[str]]
+         The value of the attribute to filter on. Can be a single value or a list of values.
+     negate : bool, optional
+         Whether to negate the filter, by default False.
+         If True, keeps species with the attribute defined that do NOT match the attribute value.
+     inplace : bool, optional
+         Whether to filter the SBML_dfs in place, by default True.
+         If False, returns a new SBML_dfs object with the filtered species.
+
+     Returns
+     -------
+     Optional[sbml_dfs_core.SBML_dfs]
+         If inplace=True, returns None.
+         If inplace=False, returns a new SBML_dfs object with the filtered species.
+
+     Raises
+     ------
+     ValueError
+         If species_data_table is not found in sbml_dfs.species_data
+         If attribute_name is not found in the species data table columns
+     """
+
+     # If not inplace, make a copy
+     if not inplace:
+         sbml_dfs = copy.deepcopy(sbml_dfs)
+
+     # Get the species data
+     species_data = sbml_dfs.select_species_data(species_data_table)
+
+     # Find species that match the filter criteria (including negation)
+     species_to_remove = find_species_with_attribute(
+         species_data, attribute_name, attribute_value, negate=negate
+     )
+
+     if isinstance(attribute_value, list):
+         filter_str = (
+             f"{attribute_name} in {attribute_value}"
+             if not negate
+             else f"{attribute_name} not in {attribute_value}"
+         )
+     else:
+         filter_str = (
+             f"{attribute_name}={attribute_value}"
+             if not negate
+             else f"{attribute_name}!={attribute_value}"
+         )
+     logger.info(
+         f"Removing {len(species_to_remove)} species from {species_data_table} table with filter {filter_str}"
+     )
+
+     sbml_dfs._remove_species(species_to_remove)
+
+     return None if inplace else sbml_dfs
+
+
+ def filter_reactions_with_disconnected_cspecies(
+     sbml_dfs: sbml_dfs_core.SBML_dfs, species_data_table: str, inplace: bool = False
+ ) -> Optional[sbml_dfs_core.SBML_dfs]:
+     """
+     Remove reactions from the SBML_dfs object whose defining compartmentalized species (cspecies) are disconnected
+     according to a co-occurrence matrix derived from a species data table.
+
+     This function identifies reactions where any pair of defining cspecies do not co-occur (i.e., are disconnected)
+     in the provided species data table, and removes those reactions from the model. The operation can be performed
+     in-place or on a copy of the SBML_dfs object.
+
+     Parameters
+     ----------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The SBML_dfs object to filter reactions from.
+     species_data_table : str
+         The name of the species data table to use for co-occurrence calculation.
+     inplace : bool, optional
+         If True, modifies the input SBML_dfs object in-place and returns None. If False (default),
+         returns a new SBML_dfs object with the filtered reactions.
+
+     Returns
+     -------
+     Optional[sbml_dfs_core.SBML_dfs]
+         If inplace=True, returns None. If inplace=False, returns a new SBML_dfs object with filtered reactions.
+
+     Warns
+     -----
+     UserWarning
+         If no reactions are pruned based on non-cooccurrence.
+
+     Examples
+     --------
+     >>> filtered_sbml_dfs = filter_reactions_with_disconnected_cspecies(sbml_dfs, "test_data", inplace=False)
+     >>> # To modify in-place:
+     >>> filter_reactions_with_disconnected_cspecies(sbml_dfs, "test_data", inplace=True)
+     """
+
+     if not inplace:
+         sbml_dfs = copy.deepcopy(sbml_dfs)
+
+     # find how many conditions a pair of species cooccur in
+     cooccurence_edgelist = _create_cooccurence_edgelist(sbml_dfs, species_data_table)
+
+     reactions_to_remove = _find_reactions_with_disconnected_cspecies(
+         cooccurence_edgelist, sbml_dfs
+     )
+
+     if len(reactions_to_remove) == 0:
+         logger.warning("No reactions will be pruned based on non-cooccurrence.")
+     else:
+         logger.info(
+             f"Pruning {len(reactions_to_remove)} reactions based on non-cooccurrence."
+         )
+         sbml_dfs.remove_reactions(reactions_to_remove)
+
+     return None if inplace else sbml_dfs
+
+
+ def find_species_with_attribute(
+     species_data: pd.DataFrame,
+     attribute_name: str,
+     attribute_value: Union[int, bool, str, List[str]],
+     negate: bool = False,
+ ) -> List[str]:
+     """
+     Find species that match the given attribute filter criteria.
+
+     Parameters
+     ----------
+     species_data : pd.DataFrame
+         The species data table to filter.
+     attribute_name : str
+         The name of the attribute to filter on.
+     attribute_value : Union[int, bool, str, List[str]]
+         The value of the attribute to filter on. Can be a single value or a list of values.
+     negate : bool, optional
+         Whether to negate the filter, by default False.
+         If True, returns species that do NOT match the attribute value.
+
+     Returns
+     -------
+     List[str]
+         List of species IDs that match the filter criteria.
+
+     Raises
+     ------
+     ValueError
+         If attribute_name is not found in the species data table columns
+     """
+     # Check if attribute_name exists in species_data columns
+     if attribute_name not in species_data.columns:
+         raise ValueError(
+             f"attribute_name {attribute_name} not found in species_data.columns. "
+             f"Available attributes: {species_data.columns}"
+         )
+
+     # First, get the mask for defined values (not NA)
+     defined_mask = species_data[attribute_name].notna()
+
+     # Then, get the mask for matching values
+     if isinstance(attribute_value, list):
+         match_mask = species_data[attribute_name].isin(attribute_value)
+     else:
+         match_mask = species_data[attribute_name] == attribute_value
+
+     # Apply negation if requested and combine with defined mask
+     if negate:
+         # When negating, we only want to consider rows where the attribute is defined
+         final_mask = defined_mask & ~match_mask
+     else:
+         final_mask = defined_mask & match_mask
+
+     # Return species that match our criteria
+     return species_data[final_mask].index.tolist()
+
+
+ def _find_reactions_with_disconnected_cspecies(
+     coccurrence_edgelist: pd.DataFrame,
+     sbml_dfs: Optional[sbml_dfs_core.SBML_dfs],
+     cooccurence_threshold: int = 0,  # noqa
+ ) -> set:
+     """
+     Find reactions with disconnected cspecies.
+
+     This function finds reactions with disconnected cspecies based on the cooccurrence matrix.
+     Only cspecies which are DEFINING are considered because these are AND rules for reaction operability.
+     It returns the set of reaction ids with disconnected cspecies.
+
+     Parameters
+     ----------
+     coccurrence_edgelist : pd.DataFrame
+         The cooccurrence edgelist.
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The SBML_dfs object.
+     cooccurence_threshold : int
+         The threshold for cooccurrence. Values equal to or below this threshold are considered disconnected.
+
+     Returns
+     -------
+     set
+         The set of reaction ids with disconnected cspecies.
+
+     """
+
+     utils.match_pd_vars(
+         coccurrence_edgelist, {"s_id_1", "s_id_2", "cooccurence"}
+     ).assert_present()
+     sbml_dfs._validate_table(SBML_DFS.REACTION_SPECIES)
+     sbml_dfs._validate_table(SBML_DFS.COMPARTMENTALIZED_SPECIES)
+
+     reaction_species = sbml_dfs_core.add_sbo_role(sbml_dfs.reaction_species)
+
+     logger.info(
+         "Finding disconnected pairs of cspecies based on the zero values in the coccurrence_edgelist"
+     )
+
+     # map to cspecies
+     disconnected_cspecies = (
+         coccurrence_edgelist.query("cooccurence <= @cooccurence_threshold")
+         .merge(
+             sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
+             .reset_index(drop=False)
+             .rename(columns={SBML_DFS.S_ID: "s_id_1", SBML_DFS.SC_ID: "sc_id_1"}),
+             how="left",
+         )
+         .merge(
+             sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
+             .reset_index(drop=False)
+             .rename(columns={SBML_DFS.S_ID: "s_id_2", SBML_DFS.SC_ID: "sc_id_2"}),
+             how="left",
+         )
+     )
+
+     # remove defining attributes which don't occur since these are AND rules
+     # ignore required attributes since these are OR rules and do not require cooccurrence
+
+     defining_reaction_species = reaction_species.query("sbo_role == 'DEFINING'")[
+         [SBML_DFS.R_ID, SBML_DFS.SC_ID]
+     ].drop_duplicates()
+
+     logger.info(
+         "Finding reactions with disconnected cspecies based on the cooccurrence matrix"
+     )
+     # since any 2 pairs of cspecies being missing together would stop a reaction from operating, we can convert reaction_species to an edgelist by self-joining on reaction id
+     invalid_defining_non_cooccurring = (
+         (
+             defining_reaction_species.rename(columns={SBML_DFS.SC_ID: "sc_id_1"}).merge(
+                 defining_reaction_species.rename(columns={SBML_DFS.SC_ID: "sc_id_2"}),
+                 on=SBML_DFS.R_ID,
+                 how="left",
+             )
+         )
+         .query("sc_id_1 != sc_id_2")
+         .merge(disconnected_cspecies, on=["sc_id_1", "sc_id_2"], how="inner")
+     )
+
+     invalid_defining_non_cooccurring_reactions = set(
+         invalid_defining_non_cooccurring[SBML_DFS.R_ID].unique()
+     )
+
+     return invalid_defining_non_cooccurring_reactions
+
+
+ def _create_cooccurence_edgelist(
+     sbml_dfs: sbml_dfs_core.SBML_dfs, species_data_table: str
+ ):
+     """
+     Create a co-occurrence edgelist for species based on a binary species data table.
+
+     This function computes a co-occurrence matrix for all pairs of species in the given data table,
+     where each entry represents the number of conditions in which both species are present (i.e., have value 1).
+     The result is returned as an edgelist DataFrame with columns 's_id_1', 's_id_2', and 'cooccurence'.
+
+     Parameters
+     ----------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The SBML_dfs object containing the species data table.
+     species_data_table : str
+         The name of the species data table to use for co-occurrence calculation. The table must contain only binary or boolean columns.
+
+     Returns
+     -------
+     pd.DataFrame
+         Edgelist DataFrame with columns ['s_id_1', 's_id_2', 'cooccurence'], where each row gives the number of conditions in which the two species co-occur.
+
+     Raises
+     ------
+     ValueError
+         If no binary or boolean columns are found in the species data table.
+     """
+     species_data = sbml_dfs.select_species_data(species_data_table)
+
+     # select all binary columns (results in {0, 1})
+     # convert to numpy ndarray
+     binary_matrix = _binarize_species_data(species_data).to_numpy()
+
+     # X @ X.T: entry (i, j) counts the conditions in which species i and j are both present
+     cooccurrence_matrix = binary_matrix @ binary_matrix.T
+
+     # convert the co-occurrence matrix to an edgelist
+     cooccurence_edgelist = utils.matrix_to_edgelist(
+         cooccurrence_matrix,
+         row_labels=species_data.index.tolist(),
+         col_labels=species_data.index.tolist(),
+     ).rename(columns={"row": "s_id_1", "column": "s_id_2", "value": "cooccurence"})
+
+     return cooccurence_edgelist
+
+
+ def _binarize_species_data(species_data: pd.DataFrame) -> pd.DataFrame:
+     """
+     Convert all boolean or binary columns in a species data table to a DataFrame of binary (0/1) values.
+
+     This function selects columns of dtype 'bool' or integer columns containing only 0 and 1, and converts them to a DataFrame of binary values (0/1).
+     Columns that are not boolean or binary are ignored. If no such columns are found, a ValueError is raised.
+
+     Parameters
+     ----------
+     species_data : pd.DataFrame
+         The species data table to binarize.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing only the binarized columns (0/1 values) from the input.
+
+     Raises
+     ------
+     ValueError
+         If no binary or boolean columns are found in the input DataFrame.
+
+     Warns
+     -----
+     UserWarning
+         If some columns in the input were not binarized and left out of the output.
+     """
+     binary_series = []
+     for c in species_data.columns:
+         if species_data[c].dtype == "bool":
+             binary_series.append(species_data[c].astype(int))
+         elif species_data[c].dtype == "int64":
+             if species_data[c].isin([0, 1]).all():
+                 binary_series.append(species_data[c])
+             else:
+                 continue
+         else:
+             continue
+
+     if len(binary_series) == 0:
+         raise ValueError("No binary or boolean columns found")
+
+     binary_df = pd.concat(binary_series, axis=1)
+
+     if len(binary_df.columns) != len(species_data.columns):
+         left_out = set(species_data.columns) - set(binary_df.columns)
+         logger.warning(f"Some columns were not binarized: {', '.join(left_out)}")
+
+     return binary_df
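
Taken together, the new `napistu.context.filtering` module binarizes a species data table, counts pairwise co-occurrence, and prunes reactions whose DEFINING cspecies never co-occur. The sketch below replays that arithmetic on a toy table and then outlines the intended top-level calls; the `sbml_dfs` object and the `"expression"` table name are assumptions for illustration, not values taken from this diff.

```python
import pandas as pd

from napistu.context import filtering

# toy species-by-condition table: rows are species, columns are conditions
toy = pd.DataFrame(
    {"liver": [1, 1, 0], "brain": [0, 1, 1]},
    index=["s1", "s2", "s3"],
)

# same arithmetic as _create_cooccurence_edgelist: X @ X.T counts, for each
# pair of species, the number of conditions in which both are present
X = filtering._binarize_species_data(toy).to_numpy()
print(X @ X.T)  # s1 and s3 never co-occur, so their off-diagonal entry is 0

# intended top-level usage (assumes an SBML_dfs with an attached species data
# table named "expression" and a boolean "detected" column, both hypothetical):
# trimmed = filtering.filter_species_by_attribute(
#     sbml_dfs, "expression",
#     attribute_name="detected", attribute_value=False,
#     inplace=False,
# )
# trimmed = filtering.filter_reactions_with_disconnected_cspecies(
#     trimmed, "expression", inplace=False
# )
```

The commented calls mirror the docstring semantics above: species matching the attribute filter are removed, and with `inplace=False` a filtered copy is returned instead of mutating the input model.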
napistu/gcs/__init__.py CHANGED
@@ -4,7 +4,7 @@ from importlib.metadata import PackageNotFoundError
  from importlib.metadata import version
 
  try:
-     __version__ = version("calicolabs-cpr")
+     __version__ = version("napistu")
  except PackageNotFoundError:
      # package is not installed
      pass
napistu/identifiers.py CHANGED
@@ -22,6 +22,8 @@ from napistu.constants import ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY
  from napistu.constants import ENSEMBL_SPECIES_FROM_CODE
  from napistu.constants import ENSEMBL_SPECIES_TO_CODE
  from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
+ from napistu.constants import SBML_DFS_SCHEMA
+ from napistu.constants import IDENTIFIERS_REQUIRED_VARS
 
  logger = logging.getLogger(__name__)
 
@@ -172,6 +174,61 @@ def merge_identifiers(identifier_series: pd.Series) -> Identifiers:
      return Identifiers(merged_ids)
 
 
+ def df_to_identifiers(df: pd.DataFrame, entity_type: str) -> pd.Series:
+     """
+     Convert a DataFrame of identifier information to a Series of Identifiers objects.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame containing identifier information with required columns:
+         ontology, identifier, url, bqb
+     entity_type : str
+         The entity type (e.g., "species") whose schema defines the primary key
+         used to index the output Series
+
+     Returns
+     -------
+     pd.Series
+         Series indexed by the entity's primary key containing Identifiers objects
+     """
+
+     if entity_type not in SBML_DFS_SCHEMA.SCHEMA:
+         raise ValueError(f"Invalid entity type: {entity_type}")
+
+     table_schema = SBML_DFS_SCHEMA.SCHEMA[entity_type]
+     if "id" not in table_schema:
+         raise ValueError(f"The entity type {entity_type} does not have an id column")
+
+     table_pk_var = table_schema["pk"]
+     expected_columns = set([table_pk_var]) | IDENTIFIERS_REQUIRED_VARS
+     missing_columns = expected_columns - set(df.columns)
+     if missing_columns:
+         raise ValueError(
+             f"The DataFrame does not contain the required columns: {missing_columns}"
+         )
+
+     # Process identifiers to remove duplicates
+     indexed_df = (
+         df
+         # remove duplicated identifiers
+         .groupby([table_pk_var, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER])
+         .first()
+         .reset_index()
+         .set_index(table_pk_var)
+     )
+
+     # create a dictionary of new Identifiers objects
+     expanded_identifiers_dict = {
+         i: _expand_identifiers_new_entries(i, indexed_df)
+         for i in indexed_df.index.unique()
+     }
+
+     output = pd.Series(expanded_identifiers_dict).rename(table_schema["id"])
+     output.index.name = table_pk_var
+
+     return output
+
+
  def format_uri(uri: str, biological_qualifier_type: str | None = None) -> Identifiers:
      """
      Convert a RDF URI into an Identifier object
@@ -255,11 +312,8 @@ def format_uri_url(uri: str) -> dict:
          or re.search("ENS[A-Z]{3}[GTP]", split_path[-1])
      ):
          # format ensembl IDs which lack gene/transview
-         identifier, implied_ontology, _ = parse_ensembl_id(split_path[-1])
-         if implied_ontology != ontology:
-             raise ValueError(
-                 f"Implied ontology mismatch: expected {ontology}, got {implied_ontology}"
-             )
+         identifier, ontology, _ = parse_ensembl_id(split_path[-1])
+
      elif netloc == "www.mirbase.org" or netloc == "mirbase.org":
          ontology = "mirbase"
          if re.search("MI[0-9]+", split_path[-1]):
@@ -566,16 +620,6 @@ def create_uri_url(ontology: str, identifier: str, strict: bool = True) -> str:
 
      """
 
-     # check input types
-     if not isinstance(ontology, str):
-         raise TypeError(f"ontology was an {type(ontology).__name__} and must be a str")
-     if not isinstance(identifier, str):
-         raise TypeError(
-             f"identifier was an {type(identifier).__name__} and must be a str"
-         )
-     if not isinstance(strict, bool):
-         raise TypeError(f"strict was an {type(strict).__name__} and must be a bool")
-
      # default to no id_regex
      id_regex = None
 
@@ -893,6 +937,21 @@ def _validate_assets_sbml_ids(
      return None
 
 
+ def _expand_identifiers_new_entries(
+     sysid: str, expanded_identifiers_df: pd.DataFrame
+ ) -> Identifiers:
+     """Create an identifiers object from an index entry in a dataframe"""
+     entry = expanded_identifiers_df.loc[sysid]
+
+     if type(entry) is pd.Series:
+         sysis_id_list = [entry.to_dict()]
+     else:
+         # multiple annotations
+         sysis_id_list = list(entry.reset_index(drop=True).T.to_dict().values())
+
+     return Identifiers(sysis_id_list)
+
+
  class _IdentifierValidator(BaseModel):
      ontology: str
      identifier: str
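
To illustrate the new `df_to_identifiers` helper, here is a minimal, hedged sketch: a long-format identifier table is rolled up into one `Identifiers` object per entity. The `s_id` column name follows the schema conventions visible elsewhere in this diff, while the ontology, identifier, and bqb values are invented placeholders.

```python
import pandas as pd

from napistu import identifiers

# hypothetical long-format table: one row per (entity, identifier) pair; the
# required columns are the entity's primary key plus ontology/identifier/url/bqb
ids_df = pd.DataFrame(
    {
        "s_id": ["S00001", "S00001", "S00002"],  # assumed species pk column
        "ontology": ["uniprot", "ensembl_gene", "uniprot"],
        "identifier": ["P12345", "ENSG00000139618", "Q67890"],
        "url": ["", "", ""],
        "bqb": ["BQB_IS", "BQB_IS", "BQB_IS"],  # assumed qualifier spelling
    }
)

# returns a pd.Series indexed by s_id whose values are Identifiers objects;
# duplicate (s_id, ontology, identifier) rows are collapsed before conversion
species_identifiers = identifiers.df_to_identifiers(ids_df, entity_type="species")
```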
napistu/indices.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
  import copy
  import os
  import re
+ import datetime
  from os import PathLike
  from typing import Iterable
 
@@ -14,6 +15,73 @@ from napistu.constants import EXPECTED_PW_INDEX_COLUMNS
  from napistu.constants import SOURCE_SPEC
 
 
+ def create_pathway_index_df(
+     model_keys: dict[str, str],
+     model_urls: dict[str, str],
+     model_species: dict[str, str],
+     base_path: str,
+     source_name: str,
+     file_extension: str = ".sbml",
+ ) -> pd.DataFrame:
+     """Create a pathway index DataFrame from model definitions.
+
+     Parameters
+     ----------
+     model_keys : dict[str, str]
+         Mapping of species to model keys/IDs
+     model_urls : dict[str, str]
+         Mapping of species to model URLs
+     model_species : dict[str, str]
+         Mapping of species to their full names
+     base_path : str
+         Base path where models will be stored
+     source_name : str
+         Name of the source (e.g. "BiGG")
+     file_extension : str, optional
+         File extension for model files, by default ".sbml"
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing pathway index information with columns:
+         - url: URL to download the model from
+         - species: Species name
+         - sbml_path: Full path where model will be stored
+         - file: Basename of the model file
+         - date: Current date in YYYYMMDD format
+         - pathway_id: Unique identifier for the pathway
+         - name: Display name for the pathway
+         - source: Source database name
+
+     Notes
+     -----
+     The function creates a standardized pathway index DataFrame that can be used
+     across different model sources. It handles file paths and metadata consistently.
+     """
+     models = {
+         model_keys[species]: {
+             "url": model_urls[species],
+             "species": model_species[species],
+         }
+         for species in model_keys.keys()
+     }
+
+     models_df = pd.DataFrame(models).T
+     models_df["sbml_path"] = [
+         os.path.join(base_path, k) + file_extension for k in models_df.index.tolist()
+     ]
+     models_df["file"] = [os.path.basename(x) for x in models_df["sbml_path"]]
+
+     # add other attributes which will be used in the pw_index
+     models_df["date"] = datetime.date.today().strftime("%Y%m%d")
+     models_df.index = models_df.index.rename("pathway_id")
+     models_df = models_df.reset_index()
+     models_df["name"] = models_df["pathway_id"]
+     models_df = models_df.assign(source=source_name)
+
+     return models_df
+
+
  class PWIndex:
      """
      Pathway Index
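
For context, `create_pathway_index_df` generalizes the model-listing logic that the slimmed-down `bigg.py` ingestion can now delegate here. A minimal sketch follows, assuming placeholder model keys and URLs (the BiGG model names and paths below are illustrative, not values shipped in this release):

```python
from napistu import indices

# placeholder inputs; a real ingestion module would supply its own mappings
model_keys = {"human": "Recon3D", "mouse": "iMM1415"}
model_urls = {
    "human": "http://bigg.ucsd.edu/static/models/Recon3D.xml",
    "mouse": "http://bigg.ucsd.edu/static/models/iMM1415.xml",
}
model_species = {"human": "Homo sapiens", "mouse": "Mus musculus"}

pw_index_df = indices.create_pathway_index_df(
    model_keys=model_keys,
    model_urls=model_urls,
    model_species=model_species,
    base_path="/tmp/bigg",
    source_name="BiGG",
    file_extension=".xml",
)
# columns: url, species, sbml_path, file, date, pathway_id, name, source
```

The resulting frame presumably feeds the `PWIndex` class that follows, though that handoff is outside this hunk.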