napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the changes between package versions as published to a supported public registry. It is provided for informational purposes only.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/context/filtering.py ADDED
@@ -0,0 +1,387 @@
+import copy
+import logging
+from typing import Union, List, Optional
+
+import pandas as pd
+
+from napistu import sbml_dfs_core
+from napistu import utils
+from napistu.constants import SBML_DFS
+
+logger = logging.getLogger(__name__)
+
+
+def filter_species_by_attribute(
+    sbml_dfs: sbml_dfs_core.SBML_dfs,
+    species_data_table: str,
+    attribute_name: str,
+    attribute_value: Union[int, bool, str, List[str]],
+    negate: bool = False,
+    inplace: bool = True,
+) -> Optional[sbml_dfs_core.SBML_dfs]:
+    """
+    Filter species in the SBML_dfs based on an attribute value.
+
+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The SBML_dfs object to filter.
+    species_data_table : str
+        The name of the species data table to filter.
+    attribute_name : str
+        The name of the attribute to filter on.
+    attribute_value : Union[int, bool, str, List[str]]
+        The value of the attribute to filter on. Can be a single value or a list of values.
+    negate : bool, optional
+        Whether to negate the filter, by default False.
+        If True, keeps species with the attribute defined that do NOT match the attribute value.
+    inplace : bool, optional
+        Whether to filter the SBML_dfs in place, by default True.
+        If False, returns a new SBML_dfs object with the filtered species.
+
+    Returns
+    -------
+    Optional[sbml_dfs_core.SBML_dfs]
+        If inplace=True, returns None.
+        If inplace=False, returns a new SBML_dfs object with the filtered species.
+
+    Raises
+    ------
+    ValueError
+        If species_data_table is not found in sbml_dfs.species_data.
+        If attribute_name is not found in the species data table columns.
+    """
+
+    # if not inplace, work on a copy so the caller's object is untouched
+    if not inplace:
+        sbml_dfs = copy.deepcopy(sbml_dfs)
+
+    # get the species data
+    species_data = sbml_dfs.select_species_data(species_data_table)
+
+    # find species that match the filter criteria (including negation)
+    species_to_remove = find_species_with_attribute(
+        species_data, attribute_name, attribute_value, negate=negate
+    )
+
+    if isinstance(attribute_value, list):
+        filter_str = (
+            f"{attribute_name} in {attribute_value}"
+            if not negate
+            else f"{attribute_name} not in {attribute_value}"
+        )
+    else:
+        filter_str = (
+            f"{attribute_name}={attribute_value}"
+            if not negate
+            else f"{attribute_name}!={attribute_value}"
+        )
+    logger.info(
+        f"Removing {len(species_to_remove)} species from {species_data_table} table with filter {filter_str}"
+    )
+
+    sbml_dfs._remove_species(species_to_remove)
+
+    return None if inplace else sbml_dfs
+
+
+def filter_reactions_with_disconnected_cspecies(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, species_data_table: str, inplace: bool = False
+) -> Optional[sbml_dfs_core.SBML_dfs]:
+    """
+    Remove reactions from the SBML_dfs object whose defining compartmentalized species (cspecies)
+    are disconnected according to a co-occurrence matrix derived from a species data table.
+
+    This function identifies reactions where any pair of defining cspecies do not co-occur
+    (i.e., are disconnected) in the provided species data table, and removes those reactions
+    from the model. The operation can be performed in-place or on a copy of the SBML_dfs object.
+
+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The SBML_dfs object to filter reactions from.
+    species_data_table : str
+        The name of the species data table to use for co-occurrence calculation.
+    inplace : bool, optional
+        If True, modifies the input SBML_dfs object in-place and returns None. If False (default),
+        returns a new SBML_dfs object with the filtered reactions.
+
+    Returns
+    -------
+    Optional[sbml_dfs_core.SBML_dfs]
+        If inplace=True, returns None. If inplace=False, returns a new SBML_dfs object with filtered reactions.
+
+    Warns
+    -----
+    UserWarning
+        If no reactions are pruned based on non-cooccurrence.
+
+    Examples
+    --------
+    >>> filtered_sbml_dfs = filter_reactions_with_disconnected_cspecies(sbml_dfs, "test_data", inplace=False)
+    >>> # To modify in-place:
+    >>> filter_reactions_with_disconnected_cspecies(sbml_dfs, "test_data", inplace=True)
+    """
+
+    # if not inplace, work on a copy so the caller's object is untouched
+    # (matching the docstring and the behavior of filter_species_by_attribute)
+    if not inplace:
+        sbml_dfs = copy.deepcopy(sbml_dfs)
+
+    # find how many conditions a pair of species co-occur in
+    cooccurence_edgelist = _create_cooccurence_edgelist(sbml_dfs, species_data_table)
+
+    reactions_to_remove = _find_reactions_with_disconnected_cspecies(
+        cooccurence_edgelist, sbml_dfs
+    )
+
+    if len(reactions_to_remove) == 0:
+        logger.warning("No reactions will be pruned based on non-cooccurrence.")
+    else:
+        logger.info(
+            f"Pruning {len(reactions_to_remove)} reactions based on non-cooccurrence."
+        )
+        sbml_dfs.remove_reactions(reactions_to_remove)
+
+    return None if inplace else sbml_dfs
+
+
+def find_species_with_attribute(
+    species_data: pd.DataFrame,
+    attribute_name: str,
+    attribute_value: Union[int, bool, str, List[str]],
+    negate: bool = False,
+) -> List[str]:
+    """
+    Find species that match the given attribute filter criteria.
+
+    Parameters
+    ----------
+    species_data : pd.DataFrame
+        The species data table to filter.
+    attribute_name : str
+        The name of the attribute to filter on.
+    attribute_value : Union[int, bool, str, List[str]]
+        The value of the attribute to filter on. Can be a single value or a list of values.
+    negate : bool, optional
+        Whether to negate the filter, by default False.
+        If True, returns species that do NOT match the attribute value.
+
+    Returns
+    -------
+    List[str]
+        List of species IDs that match the filter criteria.
+
+    Raises
+    ------
+    ValueError
+        If attribute_name is not found in the species data table columns.
+    """
+    # check if attribute_name exists in species_data columns
+    if attribute_name not in species_data.columns:
+        raise ValueError(
+            f"attribute_name {attribute_name} not found in species_data.columns. "
+            f"Available attributes: {species_data.columns}"
+        )
+
+    # first, get the mask for defined values (not NA)
+    defined_mask = species_data[attribute_name].notna()
+
+    # then, get the mask for matching values
+    if isinstance(attribute_value, list):
+        match_mask = species_data[attribute_name].isin(attribute_value)
+    else:
+        match_mask = species_data[attribute_name] == attribute_value
+
+    # apply negation if requested and combine with the defined mask
+    if negate:
+        # when negating, only consider rows where the attribute is defined
+        final_mask = defined_mask & ~match_mask
+    else:
+        final_mask = defined_mask & match_mask
+
+    # return species that match our criteria
+    return species_data[final_mask].index.tolist()
+
+
+def _find_reactions_with_disconnected_cspecies(
+    coccurrence_edgelist: pd.DataFrame,
+    sbml_dfs: sbml_dfs_core.SBML_dfs,
+    cooccurence_threshold: int = 0,  # noqa
+) -> set:
+    """
+    Find reactions with disconnected cspecies.
+
+    This function finds reactions with disconnected cspecies based on the co-occurrence matrix.
+    Only cspecies which are DEFINING are considered because these are AND rules for reaction operability.
+    It returns the set of reaction IDs with disconnected cspecies.
+
+    Parameters
+    ----------
+    coccurrence_edgelist : pd.DataFrame
+        The co-occurrence edgelist.
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The SBML_dfs object.
+    cooccurence_threshold : int
+        The threshold for co-occurrence. Values equal to or below this threshold are considered disconnected.
+
+    Returns
+    -------
+    set
+        The set of reaction IDs with disconnected cspecies.
+    """
+
+    utils.match_pd_vars(
+        coccurrence_edgelist, {"s_id_1", "s_id_2", "cooccurence"}
+    ).assert_present()
+    sbml_dfs._validate_table(SBML_DFS.REACTION_SPECIES)
+    sbml_dfs._validate_table(SBML_DFS.COMPARTMENTALIZED_SPECIES)
+
+    reaction_species = sbml_dfs_core.add_sbo_role(sbml_dfs.reaction_species)
+
+    logger.info(
+        "Finding disconnected pairs of cspecies based on the zero values in the coccurrence_edgelist"
+    )
+
+    # map species pairs to cspecies pairs
+    disconnected_cspecies = (
+        coccurrence_edgelist.query("cooccurence <= @cooccurence_threshold")
+        .merge(
+            sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
+            .reset_index(drop=False)
+            .rename(columns={SBML_DFS.S_ID: "s_id_1", SBML_DFS.SC_ID: "sc_id_1"}),
+            how="left",
+        )
+        .merge(
+            sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
+            .reset_index(drop=False)
+            .rename(columns={SBML_DFS.S_ID: "s_id_2", SBML_DFS.SC_ID: "sc_id_2"}),
+            how="left",
+        )
+    )
+
+    # remove defining attributes which don't co-occur since these are AND rules;
+    # ignore required attributes since these are OR rules and do not require co-occurrence
+    defining_reaction_species = reaction_species.query("sbo_role == 'DEFINING'")[
+        [SBML_DFS.R_ID, SBML_DFS.SC_ID]
+    ].drop_duplicates()
+
+    logger.info(
+        "Finding reactions with disconnected cspecies based on the cooccurrence matrix"
+    )
+    # since any pair of cspecies being missing together would stop a reaction from operating,
+    # convert reaction_species to an edgelist by self-joining on reaction ID
+    invalid_defining_non_cooccurring = (
+        defining_reaction_species.rename(columns={SBML_DFS.SC_ID: "sc_id_1"})
+        .merge(
+            defining_reaction_species.rename(columns={SBML_DFS.SC_ID: "sc_id_2"}),
+            on=SBML_DFS.R_ID,
+            how="left",
+        )
+        .query("sc_id_1 != sc_id_2")
+        .merge(disconnected_cspecies, on=["sc_id_1", "sc_id_2"], how="inner")
+    )
+
+    invalid_defining_non_cooccurring_reactions = set(
+        invalid_defining_non_cooccurring[SBML_DFS.R_ID].unique()
+    )
+
+    return invalid_defining_non_cooccurring_reactions
+
+
+def _create_cooccurence_edgelist(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, species_data_table: str
+) -> pd.DataFrame:
+    """
+    Create a co-occurrence edgelist for species based on a binary species data table.
+
+    This function computes a co-occurrence matrix for all pairs of species in the given data table,
+    where each entry represents the number of conditions in which both species are present (i.e., have value 1).
+    The result is returned as an edgelist DataFrame with columns 's_id_1', 's_id_2', and 'cooccurence'.
+
+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The SBML_dfs object containing the species data table.
+    species_data_table : str
+        The name of the species data table to use for co-occurrence calculation. The table must contain only binary or boolean columns.
+
+    Returns
+    -------
+    pd.DataFrame
+        Edgelist DataFrame with columns ['s_id_1', 's_id_2', 'cooccurence'], where each row gives the number of conditions in which the two species co-occur.
+
+    Raises
+    ------
+    ValueError
+        If no binary or boolean columns are found in the species data table.
+    """
+    species_data = sbml_dfs.select_species_data(species_data_table)
+
+    # select all binary columns (values in {0, 1})
+    # and convert to a numpy ndarray
+    binary_matrix = _binarize_species_data(species_data).to_numpy()
+
+    # x @ x.T counts, for each pair of species, the conditions in which both are present
+    cooccurrence_matrix = binary_matrix @ binary_matrix.T
+
+    # convert the co-occurrence matrix to an edgelist
+    cooccurence_edgelist = utils.matrix_to_edgelist(
+        cooccurrence_matrix,
+        row_labels=species_data.index.tolist(),
+        col_labels=species_data.index.tolist(),
+    ).rename(columns={"row": "s_id_1", "column": "s_id_2", "value": "cooccurence"})
+
+    return cooccurence_edgelist
+
+
+def _binarize_species_data(species_data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Convert all boolean or binary columns in a species data table to a DataFrame of binary (0/1) values.
+
+    This function selects columns of dtype 'bool' or integer columns containing only 0 and 1, and converts them to a DataFrame of binary values (0/1).
+    Columns that are not boolean or binary are ignored. If no such columns are found, a ValueError is raised.
+
+    Parameters
+    ----------
+    species_data : pd.DataFrame
+        The species data table to binarize.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing only the binarized columns (0/1 values) from the input.
+
+    Raises
+    ------
+    ValueError
+        If no binary or boolean columns are found in the input DataFrame.
+
+    Warns
+    -----
+    UserWarning
+        If some columns in the input were not binarized and left out of the output.
+    """
+    binary_series = []
+    for c in species_data.columns:
+        if species_data[c].dtype == "bool":
+            binary_series.append(species_data[c].astype(int))
+        elif species_data[c].dtype == "int64":
+            if species_data[c].isin([0, 1]).all():
+                binary_series.append(species_data[c])
+            else:
+                continue
+        else:
+            continue
+
+    if len(binary_series) == 0:
+        raise ValueError("No binary or boolean columns found")
+
+    binary_df = pd.concat(binary_series, axis=1)
+
+    if len(binary_df.columns) != len(species_data.columns):
+        left_out = set(species_data.columns) - set(binary_df.columns)
+        logger.warning(f"Some columns were not binarized: {', '.join(left_out)}")
+
+    return binary_df
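For orientation, here is a minimal usage sketch of the new context filtering API. The model object and the "expression" table name are hypothetical (an SBML_dfs with a binary species data table, e.g. 1 = present in a condition, is assumed to exist already); only the functions shown come from this diff.

    from napistu.context import filtering

    # keep a copy of the model without species flagged as absent in liver
    # (attribute and table names are illustrative)
    filtered = filtering.filter_species_by_attribute(
        sbml_dfs,
        species_data_table="expression",
        attribute_name="liver",
        attribute_value=0,
        inplace=False,
    )

    # drop reactions whose DEFINING cspecies never co-occur across conditions
    pruned = filtering.filter_reactions_with_disconnected_cspecies(
        sbml_dfs, species_data_table="expression", inplace=False
    )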
napistu/gcs/__init__.py CHANGED

napistu/identifiers.py CHANGED
@@ -22,6 +22,8 @@ from napistu.constants import ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY
 from napistu.constants import ENSEMBL_SPECIES_FROM_CODE
 from napistu.constants import ENSEMBL_SPECIES_TO_CODE
 from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
+from napistu.constants import SBML_DFS_SCHEMA
+from napistu.constants import IDENTIFIERS_REQUIRED_VARS
 
 logger = logging.getLogger(__name__)
 
@@ -172,6 +174,61 @@ def merge_identifiers(identifier_series: pd.Series) -> Identifiers:
     return Identifiers(merged_ids)
 
 
+def df_to_identifiers(df: pd.DataFrame, entity_type: str) -> pd.Series:
+    """
+    Convert a DataFrame of identifier information to a Series of Identifiers objects.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing identifier information with required columns:
+        ontology, identifier, url, bqb
+    entity_type : str
+        The entity type (a key of SBML_DFS_SCHEMA.SCHEMA) whose schema defines
+        the primary key used to index the output Series
+
+    Returns
+    -------
+    pd.Series
+        Series indexed by the entity's primary key containing Identifiers objects
+    """
+
+    if entity_type not in SBML_DFS_SCHEMA.SCHEMA:
+        raise ValueError(f"Invalid entity type: {entity_type}")
+
+    table_schema = SBML_DFS_SCHEMA.SCHEMA[entity_type]
+    if "id" not in table_schema:
+        raise ValueError(f"The entity type {entity_type} does not have an id column")
+
+    table_pk_var = table_schema["pk"]
+    expected_columns = set([table_pk_var]) | IDENTIFIERS_REQUIRED_VARS
+    missing_columns = expected_columns - set(df.columns)
+    if missing_columns:
+        raise ValueError(
+            f"The DataFrame does not contain the required columns: {missing_columns}"
+        )
+
+    # process identifiers to remove duplicates
+    indexed_df = (
+        df
+        # remove duplicated identifiers
+        .groupby([table_pk_var, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER])
+        .first()
+        .reset_index()
+        .set_index(table_pk_var)
+    )
+
+    # create a dictionary of new Identifiers objects
+    expanded_identifiers_dict = {
+        i: _expand_identifiers_new_entries(i, indexed_df)
+        for i in indexed_df.index.unique()
+    }
+
+    output = pd.Series(expanded_identifiers_dict).rename(table_schema["id"])
+    output.index.name = table_pk_var
+
+    return output
+
+
 def format_uri(uri: str, biological_qualifier_type: str | None = None) -> Identifiers:
     """
     Convert a RDF URI into an Identifier object
@@ -255,11 +312,8 @@ def format_uri_url(uri: str) -> dict:
         or re.search("ENS[A-Z]{3}[GTP]", split_path[-1])
     ):
         # format ensembl IDs which lack gene/transview
-            identifier,
-
-        raise ValueError(
-            f"Implied ontology mismatch: expected {ontology}, got {implied_ontology}"
-        )
+        identifier, ontology, _ = parse_ensembl_id(split_path[-1])
+
     elif netloc == "www.mirbase.org" or netloc == "mirbase.org":
         ontology = "mirbase"
         if re.search("MI[0-9]+", split_path[-1]):
@@ -566,16 +620,6 @@ def create_uri_url(ontology: str, identifier: str, strict: bool = True) -> str:
 
     """
 
-    # check input types
-    if not isinstance(ontology, str):
-        raise TypeError(f"ontology was an {type(ontology).__name__} and must be a str")
-    if not isinstance(identifier, str):
-        raise TypeError(
-            f"identifier was an {type(identifier).__name__} and must be a str"
-        )
-    if not isinstance(strict, bool):
-        raise TypeError(f"strict was an {type(strict).__name__} and must be a bool")
-
     # default to no id_regex
    id_regex = None
 
@@ -893,6 +937,21 @@ def _validate_assets_sbml_ids(
     return None
 
 
+def _expand_identifiers_new_entries(
+    sysid: str, expanded_identifiers_df: pd.DataFrame
+) -> Identifiers:
+    """Create an identifiers object from an index entry in a dataframe"""
+    entry = expanded_identifiers_df.loc[sysid]
+
+    if type(entry) is pd.Series:
+        sysis_id_list = [entry.to_dict()]
+    else:
+        # multiple annotations
+        sysis_id_list = list(entry.reset_index(drop=True).T.to_dict().values())
+
+    return Identifiers(sysis_id_list)
+
+
 class _IdentifierValidator(BaseModel):
     ontology: str
     identifier: str
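A minimal sketch of the new df_to_identifiers helper. The DataFrame values are hypothetical, and "species" is assumed to be a valid SBML_DFS_SCHEMA entity whose required columns match IDENTIFIERS_REQUIRED_VARS (the docstring lists ontology, identifier, url, bqb).

    import pandas as pd
    from napistu import identifiers

    # long-format annotations keyed by the species primary key (s_id)
    annotations = pd.DataFrame(
        {
            "s_id": ["S00001", "S00001"],
            "ontology": ["ensembl_gene", "uniprot"],
            "identifier": ["ENSG00000139618", "P51587"],
            "url": [None, None],
            "bqb": ["BQB_IS", "BQB_IS"],
        }
    )

    # one Identifiers object per s_id, duplicates collapsed
    ids_series = identifiers.df_to_identifiers(annotations, entity_type="species")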
napistu/indices.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import copy
 import os
 import re
+import datetime
 from os import PathLike
 from typing import Iterable
 
@@ -14,6 +15,73 @@ from napistu.constants import EXPECTED_PW_INDEX_COLUMNS
 from napistu.constants import SOURCE_SPEC
 
 
+def create_pathway_index_df(
+    model_keys: dict[str, str],
+    model_urls: dict[str, str],
+    model_species: dict[str, str],
+    base_path: str,
+    source_name: str,
+    file_extension: str = ".sbml",
+) -> pd.DataFrame:
+    """Create a pathway index DataFrame from model definitions.
+
+    Parameters
+    ----------
+    model_keys : dict[str, str]
+        Mapping of species to model keys/IDs
+    model_urls : dict[str, str]
+        Mapping of species to model URLs
+    model_species : dict[str, str]
+        Mapping of species to their full names
+    base_path : str
+        Base path where models will be stored
+    source_name : str
+        Name of the source (e.g. "BiGG")
+    file_extension : str, optional
+        File extension for model files, by default ".sbml"
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing pathway index information with columns:
+        - url: URL to download the model from
+        - species: Species name
+        - sbml_path: Full path where the model will be stored
+        - file: Basename of the model file
+        - date: Current date in YYYYMMDD format
+        - pathway_id: Unique identifier for the pathway
+        - name: Display name for the pathway
+        - source: Source database name
+
+    Notes
+    -----
+    The function creates a standardized pathway index DataFrame that can be used
+    across different model sources. It handles file paths and metadata consistently.
+    """
+    models = {
+        model_keys[species]: {
+            "url": model_urls[species],
+            "species": model_species[species],
+        }
+        for species in model_keys.keys()
+    }
+
+    models_df = pd.DataFrame(models).T
+    models_df["sbml_path"] = [
+        os.path.join(base_path, k) + file_extension for k in models_df.index.tolist()
+    ]
+    models_df["file"] = [os.path.basename(x) for x in models_df["sbml_path"]]
+
+    # add other attributes which will be used in the pw_index
+    models_df["date"] = datetime.date.today().strftime("%Y%m%d")
+    models_df.index = models_df.index.rename("pathway_id")
+    models_df = models_df.reset_index()
+    models_df["name"] = models_df["pathway_id"]
+    models_df = models_df.assign(source=source_name)
+
+    return models_df
+
+
 class PWIndex:
     """
     Pathway Index
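A minimal sketch of the new create_pathway_index_df helper. The model keys, URLs, and paths below are illustrative placeholders, not values shipped with the package.

    from napistu import indices

    pw_index_df = indices.create_pathway_index_df(
        model_keys={"human": "Recon3D", "mouse": "iMM1415"},
        model_urls={
            "human": "http://bigg.ucsd.edu/static/models/Recon3D.xml",
            "mouse": "http://bigg.ucsd.edu/static/models/iMM1415.xml",
        },
        model_species={"human": "Homo sapiens", "mouse": "Mus musculus"},
        base_path="/tmp/bigg_models",
        source_name="BiGG",
        file_extension=".xml",
    )
    # pw_index_df now has one row per model with url, species, sbml_path,
    # file, date, pathway_id, name, and source columns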
napistu/ingestion/__init__.py CHANGED