napistu 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/constants.py +2 -0
- napistu/gcs/constants.py +11 -11
- napistu/ontologies/id_tables.py +282 -0
- napistu/sbml_dfs_core.py +53 -63
- napistu/sbml_dfs_utils.py +82 -18
- {napistu-0.4.0.dist-info → napistu-0.4.1.dist-info}/METADATA +6 -1
- {napistu-0.4.0.dist-info → napistu-0.4.1.dist-info}/RECORD +14 -12
- tests/test_ontologies_id_tables.py +198 -0
- tests/test_sbml_dfs_core.py +30 -19
- tests/test_sbml_dfs_utils.py +70 -0
- {napistu-0.4.0.dist-info → napistu-0.4.1.dist-info}/WHEEL +0 -0
- {napistu-0.4.0.dist-info → napistu-0.4.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.4.0.dist-info → napistu-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.4.0.dist-info → napistu-0.4.1.dist-info}/top_level.txt +0 -0
napistu/constants.py
CHANGED
@@ -402,12 +402,14 @@ ONTOLOGIES = SimpleNamespace(
|
|
402
402
|
ENSEMBL_PROTEIN_VERSION="ensembl_protein_version",
|
403
403
|
GENE_NAME="gene_name",
|
404
404
|
GO="go",
|
405
|
+
KEGG="kegg",
|
405
406
|
MIRBASE="mirbase",
|
406
407
|
NCBI_ENTREZ_GENE="ncbi_entrez_gene",
|
407
408
|
PHAROS="pharos",
|
408
409
|
REACTOME="reactome",
|
409
410
|
SYMBOL="symbol",
|
410
411
|
UNIPROT="uniprot",
|
412
|
+
WIKIPATHWAYS="wikipathways",
|
411
413
|
)
|
412
414
|
|
413
415
|
ONTOLOGIES_LIST = list(ONTOLOGIES.__dict__.values())
|
napistu/gcs/constants.py
CHANGED
@@ -5,17 +5,17 @@ from types import SimpleNamespace
|
|
5
5
|
|
6
6
|
GCS_SUBASSET_NAMES = SimpleNamespace(
|
7
7
|
SBML_DFS="sbml_dfs",
|
8
|
-
|
9
|
-
|
8
|
+
NAPISTU_GRAPH="napistu_graph",
|
9
|
+
SPECIES_IDENTIFIERS="species_identifiers",
|
10
10
|
REGULATORY_DISTANCES="regulatory_distances",
|
11
11
|
)
|
12
12
|
|
13
13
|
|
14
14
|
GCS_FILETYPES = SimpleNamespace(
|
15
15
|
SBML_DFS="sbml_dfs.pkl",
|
16
|
-
|
17
|
-
|
18
|
-
REGULATORY_DISTANCES="regulatory_distances.
|
16
|
+
NAPISTU_GRAPH="napistu_graph.pkl",
|
17
|
+
SPECIES_IDENTIFIERS="species_identifiers.tsv",
|
18
|
+
REGULATORY_DISTANCES="regulatory_distances.parquet",
|
19
19
|
)
|
20
20
|
|
21
21
|
|
@@ -27,8 +27,8 @@ GCS_ASSETS = SimpleNamespace(
|
|
27
27
|
"file": "test_pathway.tar.gz",
|
28
28
|
"subassets": {
|
29
29
|
GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
|
30
|
-
GCS_SUBASSET_NAMES.
|
31
|
-
GCS_SUBASSET_NAMES.
|
30
|
+
GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
|
31
|
+
GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
|
32
32
|
GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
|
33
33
|
},
|
34
34
|
"public_url": "https://storage.googleapis.com/shackett-napistu-public/test_pathway.tar.gz",
|
@@ -37,8 +37,8 @@ GCS_ASSETS = SimpleNamespace(
|
|
37
37
|
"file": "human_consensus.tar.gz",
|
38
38
|
"subassets": {
|
39
39
|
GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
|
40
|
-
GCS_SUBASSET_NAMES.
|
41
|
-
GCS_SUBASSET_NAMES.
|
40
|
+
GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
|
41
|
+
GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
|
42
42
|
},
|
43
43
|
"public_url": "https://storage.googleapis.com/shackett-napistu-public/human_consensus.tar.gz",
|
44
44
|
},
|
@@ -46,8 +46,8 @@ GCS_ASSETS = SimpleNamespace(
|
|
46
46
|
"file": "human_consensus_w_distances.tar.gz",
|
47
47
|
"subassets": {
|
48
48
|
GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
|
49
|
-
GCS_SUBASSET_NAMES.
|
50
|
-
GCS_SUBASSET_NAMES.
|
49
|
+
GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
|
50
|
+
GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
|
51
51
|
GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
|
52
52
|
},
|
53
53
|
"public_url": "https://storage.googleapis.com/calico-cpr-public/human_consensus_w_distances.tar.gz",
|
@@ -0,0 +1,282 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional, Union, Set
|
3
|
+
|
4
|
+
import pandas as pd
|
5
|
+
|
6
|
+
from napistu import sbml_dfs_utils
|
7
|
+
from napistu.constants import (
|
8
|
+
BQB,
|
9
|
+
BQB_DEFINING_ATTRS_LOOSE,
|
10
|
+
IDENTIFIERS,
|
11
|
+
SBML_DFS_SCHEMA,
|
12
|
+
SCHEMA_DEFS,
|
13
|
+
VALID_BQB_TERMS,
|
14
|
+
)
|
15
|
+
from napistu import utils
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def filter_id_table(
    id_table: pd.DataFrame,
    identifiers: Optional[Union[str, list, set]] = None,
    ontologies: Optional[Union[str, list, set]] = None,
    bqbs: Optional[Union[str, list, set]] = BQB_DEFINING_ATTRS_LOOSE + [BQB.HAS_PART],
) -> pd.DataFrame:
    """
    Filter an identifier table by BQB terms, ontologies, and identifiers.

    The entity type of the table is inferred from its keys, the table is
    validated against the schema for that entity type, and then each
    requested filter is applied in turn (BQB terms, ontologies, identifiers).

    Parameters
    ----------
    id_table : pd.DataFrame
        DataFrame containing identifier mappings to be filtered.
    identifiers : str, list, set, or None, optional
        Identifiers to filter by. If None, no filtering is applied on identifiers.
    ontologies : str, list, set, or None, optional
        Ontologies to filter by. If None, no filtering is applied on ontologies.
    bqbs : str, list, set, or None, optional
        BQB terms to filter by. If None, no filtering is applied on BQB terms.
        Default is BQB_DEFINING_ATTRS_LOOSE plus BQB.HAS_PART.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame containing only rows matching the specified criteria.
        If every filter is None the input table is returned unchanged.

    Raises
    ------
    ValueError
        If the id_table or filter values are invalid, or required columns are missing.
    """

    entity_type = sbml_dfs_utils.infer_entity_type(id_table)
    _validate_id_table(id_table, entity_type)

    # Filters are applied sequentially, so each set of requested values is
    # checked against the table as already narrowed by the earlier filters.
    # Columns are addressed through the IDENTIFIERS constants, matching the
    # columns used for validation and sanitization.

    # bqbs
    if bqbs is not None:
        bqbs = _sanitize_id_table_bqbs(bqbs, id_table)
        id_table = id_table[id_table[IDENTIFIERS.BQB].isin(bqbs)]

    # ontologies
    if ontologies is not None:
        ontologies = _sanitize_id_table_ontologies(ontologies, id_table)
        id_table = id_table[id_table[IDENTIFIERS.ONTOLOGY].isin(ontologies)]

    # identifiers
    if identifiers is not None:
        identifiers = _sanitize_id_table_identifiers(identifiers, id_table)
        id_table = id_table[id_table[IDENTIFIERS.IDENTIFIER].isin(identifiers)]

    return id_table
|
71
|
+
|
72
|
+
|
73
|
+
def _validate_id_table(id_table: pd.DataFrame, entity_type: str) -> None:
    """
    Check that an id_table is usable for the given entity type.

    Parameters
    ----------
    id_table : pd.DataFrame
        Identifier mappings for a single entity type.
    entity_type : str
        Name of the schema table (e.g., 'species', 'reactions') the
        id_table should correspond to.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If entity_type is not a table in the schema, if that table has no
        'id' attribute, or if required columns are missing from id_table.
    """

    known_tables = SBML_DFS_SCHEMA.SCHEMA

    # guard: the entity type must be one of the schema's tables
    if entity_type not in known_tables:
        raise ValueError(
            f"{entity_type} does not match a table in the SBML_dfs object. The tables "
            f"which are present are {', '.join(known_tables.keys())}"
        )

    table_attrs = known_tables[entity_type]

    # guard: only tables which carry identifiers can have an id_table
    if SCHEMA_DEFS.ID not in table_attrs:
        raise ValueError(f"{entity_type} does not have an 'id' attribute")

    # the entity's primary key plus the standard identifier columns
    required_columns = {
        table_attrs[SCHEMA_DEFS.PK],
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.URL,
        IDENTIFIERS.BQB,
    }

    column_match = utils.match_pd_vars(
        id_table, req_vars=required_columns, allow_series=False
    )
    column_match.assert_present()
|
122
|
+
|
123
|
+
|
124
|
+
def _sanitize_id_table_values(
    values: Union[str, list, set],
    id_table: pd.DataFrame,
    column_name: str,
    valid_values: Optional[Set[str]] = None,
    value_type_name: Optional[str] = None,
) -> set:
    """
    Generic function to sanitize and validate values against an id_table column.

    Parameters
    ----------
    values : str, list, or set
        Values to sanitize and validate. Can be a single string, a list,
        or a set. A set is returned as-is (not copied).
    id_table : pd.DataFrame
        DataFrame containing the reference data to validate against.
    column_name : str
        Name of the column in id_table to check values against.
    valid_values : set of str, optional
        Optional collection of globally valid values for additional validation
        (e.g., VALID_BQB_TERMS). If provided, values must be a subset of it.
    value_type_name : str, optional
        Human-readable name for the value type used in error messages.
        If None, defaults to column_name.

    Returns
    -------
    set
        Set of sanitized and validated values.

    Raises
    ------
    ValueError
        If values is not a string, list, or set.
        If any values are not in valid_values (when provided).
        If none of the requested values are present in the id_table
        (this includes an empty collection of values).

    Warnings
    --------
    Logs a warning if some (but not all) requested values are missing from id_table.
    """

    def _fmt(items) -> str:
        # Values may be non-strings (e.g., integer identifiers), so stringify
        # before joining; sorting keeps the messages reproducible.
        return ", ".join(sorted(map(str, items)))

    if value_type_name is None:
        value_type_name = column_name

    # Convert to set
    if isinstance(values, str):
        values = {values}
    elif isinstance(values, list):
        values = set(values)
    elif not isinstance(values, set):
        raise ValueError(
            f"{value_type_name} must be a string, a set, or list, got {type(values).__name__}"
        )

    # Check against global valid values if provided
    if valid_values is not None:
        invalid_values = values.difference(valid_values)
        if invalid_values:
            raise ValueError(
                f"The following {value_type_name} are not valid: {_fmt(invalid_values)}.\n"
                f"Valid {value_type_name} are {_fmt(valid_values)}"
            )

    # Check against values present in the id_table
    available_values = set(id_table[column_name].unique())
    missing_values = values.difference(available_values)

    if len(missing_values) == len(values):
        raise ValueError(
            f"None of the requested {value_type_name} are present in the id_table: {_fmt(missing_values)}.\n"
            f"The included {value_type_name} are {_fmt(available_values)}"
        )
    elif missing_values:
        logger.warning(
            f"The following {value_type_name} are not present in the id_table: {_fmt(missing_values)}.\n"
            f"The included {value_type_name} are {_fmt(available_values)}"
        )

    return values
|
206
|
+
|
207
|
+
|
208
|
+
def _sanitize_id_table_ontologies(
    ontologies: Union[str, list, set], id_table: pd.DataFrame
) -> set:
    """
    Normalize requested ontologies to a set and check them against the id_table.

    Parameters
    ----------
    ontologies : str, list, or set
        Ontology name(s) requested by the caller.
    id_table : pd.DataFrame
        Identifier table whose ontology column is used as the reference.

    Returns
    -------
    set
        The requested ontologies as a validated set.
    """
    validated_ontologies = _sanitize_id_table_values(
        ontologies,
        id_table,
        IDENTIFIERS.ONTOLOGY,
        value_type_name="ontologies",
    )
    return validated_ontologies
|
232
|
+
|
233
|
+
|
234
|
+
def _sanitize_id_table_bqbs(bqbs: Union[str, list, set], id_table: pd.DataFrame) -> set:
    """
    Normalize requested BQB terms to a set and check them against the id_table.

    In addition to being checked against the table's BQB column, the terms
    must all belong to VALID_BQB_TERMS.

    Parameters
    ----------
    bqbs : str, list, or set
        BQB term(s) requested by the caller.
    id_table : pd.DataFrame
        Identifier table whose BQB column is used as the reference.

    Returns
    -------
    set
        The requested BQB terms as a validated set.
    """
    validated_bqbs = _sanitize_id_table_values(
        bqbs,
        id_table,
        IDENTIFIERS.BQB,
        valid_values=VALID_BQB_TERMS,
        value_type_name="bqbs",
    )
    return validated_bqbs
|
257
|
+
|
258
|
+
|
259
|
+
def _sanitize_id_table_identifiers(
    identifiers: Union[str, list, set], id_table: pd.DataFrame
) -> set:
    """
    Normalize requested identifiers to a set and check them against the id_table.

    Parameters
    ----------
    identifiers : str, list, or set
        Identifier value(s) requested by the caller.
    id_table : pd.DataFrame
        Identifier table whose identifier column is used as the reference.

    Returns
    -------
    set
        The requested identifiers as a validated set.
    """
    validated_identifiers = _sanitize_id_table_values(
        identifiers,
        id_table,
        IDENTIFIERS.IDENTIFIER,
        value_type_name="identifiers",
    )
    return validated_identifiers
|
napistu/sbml_dfs_core.py
CHANGED
@@ -19,17 +19,23 @@ from napistu import sbml_dfs_utils
|
|
19
19
|
from napistu import source
|
20
20
|
from napistu import utils
|
21
21
|
from napistu.ingestion import sbml
|
22
|
-
from napistu.
|
23
|
-
from napistu.constants import
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
22
|
+
from napistu.ontologies import id_tables
|
23
|
+
from napistu.constants import (
|
24
|
+
BQB,
|
25
|
+
BQB_DEFINING_ATTRS_LOOSE,
|
26
|
+
BQB_PRIORITIES,
|
27
|
+
ENTITIES_W_DATA,
|
28
|
+
ENTITIES_TO_ENTITY_DATA,
|
29
|
+
IDENTIFIERS,
|
30
|
+
MINI_SBO_FROM_NAME,
|
31
|
+
MINI_SBO_TO_NAME,
|
32
|
+
NAPISTU_STANDARD_OUTPUTS,
|
33
|
+
ONTOLOGY_PRIORITIES,
|
34
|
+
SBML_DFS,
|
35
|
+
SBML_DFS_SCHEMA,
|
36
|
+
SBOTERM_NAMES,
|
37
|
+
SCHEMA_DEFS,
|
38
|
+
)
|
33
39
|
|
34
40
|
logger = logging.getLogger(__name__)
|
35
41
|
|
@@ -101,7 +107,7 @@ class SBML_dfs:
|
|
101
107
|
Remove a reactions data table by label.
|
102
108
|
remove_species_data(label)
|
103
109
|
Remove a species data table by label.
|
104
|
-
search_by_ids(
|
110
|
+
search_by_ids(id_table, identifiers=None, ontologies=None, bqbs=None)
|
105
111
|
Find entities and identifiers matching a set of query IDs.
|
106
112
|
search_by_name(name, entity_type, partial_match=True)
|
107
113
|
Find entities by exact or partial name match.
|
@@ -455,12 +461,12 @@ class SBML_dfs:
|
|
455
461
|
ValueError
|
456
462
|
If id_type is invalid or identifiers are malformed
|
457
463
|
"""
|
458
|
-
selected_table = self.get_table(id_type, {
|
464
|
+
selected_table = self.get_table(id_type, {SCHEMA_DEFS.ID})
|
459
465
|
schema = SBML_DFS_SCHEMA.SCHEMA
|
460
466
|
|
461
467
|
identifiers_dict = dict()
|
462
468
|
for sysid in selected_table.index:
|
463
|
-
id_entry = selected_table[schema[id_type][
|
469
|
+
id_entry = selected_table[schema[id_type][SCHEMA_DEFS.ID]][sysid]
|
464
470
|
|
465
471
|
if isinstance(id_entry, identifiers.Identifiers):
|
466
472
|
identifiers_dict[sysid] = pd.DataFrame(id_entry.ids)
|
@@ -473,16 +479,16 @@ class SBML_dfs:
|
|
473
479
|
)
|
474
480
|
if not identifiers_dict:
|
475
481
|
# Return empty DataFrame with expected columns if nothing found
|
476
|
-
return pd.DataFrame(columns=[schema[id_type][
|
482
|
+
return pd.DataFrame(columns=[schema[id_type][SCHEMA_DEFS.PK], "entry"])
|
477
483
|
|
478
484
|
identifiers_tbl = pd.concat(identifiers_dict)
|
479
485
|
|
480
|
-
identifiers_tbl.index.names = [schema[id_type][
|
486
|
+
identifiers_tbl.index.names = [schema[id_type][SCHEMA_DEFS.PK], "entry"]
|
481
487
|
identifiers_tbl = identifiers_tbl.reset_index()
|
482
488
|
|
483
489
|
named_identifiers = identifiers_tbl.merge(
|
484
|
-
selected_table.drop(schema[id_type][
|
485
|
-
left_on=schema[id_type][
|
490
|
+
selected_table.drop(schema[id_type][SCHEMA_DEFS.ID], axis=1),
|
491
|
+
left_on=schema[id_type][SCHEMA_DEFS.PK],
|
486
492
|
right_index=True,
|
487
493
|
)
|
488
494
|
|
@@ -1163,24 +1169,25 @@ class SBML_dfs:
|
|
1163
1169
|
|
1164
1170
|
def search_by_ids(
|
1165
1171
|
self,
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1172
|
+
id_table: pd.DataFrame,
|
1173
|
+
identifiers: Optional[Union[str, list, set]] = None,
|
1174
|
+
ontologies: Optional[Union[str, list, set]] = None,
|
1175
|
+
bqbs: Optional[Union[str, list, set]] = BQB_DEFINING_ATTRS_LOOSE
|
1176
|
+
+ [BQB.HAS_PART],
|
1170
1177
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
1171
1178
|
"""
|
1172
1179
|
Find entities and identifiers matching a set of query IDs.
|
1173
1180
|
|
1174
1181
|
Parameters
|
1175
1182
|
----------
|
1176
|
-
|
1177
|
-
List of identifiers to search for
|
1178
|
-
entity_type : str
|
1179
|
-
Type of entity to search (e.g., 'species', 'reactions')
|
1180
|
-
identifiers_df : pd.DataFrame
|
1183
|
+
id_table : pd.DataFrame
|
1181
1184
|
DataFrame containing identifier mappings
|
1182
|
-
|
1183
|
-
|
1185
|
+
identifiers : Optional[Union[str, list, set]], optional
|
1186
|
+
Identifiers to filter by, by default None
|
1187
|
+
ontologies : Optional[Union[str, list, set]], optional
|
1188
|
+
Ontologies to filter by, by default None
|
1189
|
+
bqbs : Optional[Union[str, list, set]], optional
|
1190
|
+
BQB terms to filter by, by default [BQB.IS, BQB.HAS_PART]
|
1184
1191
|
|
1185
1192
|
Returns
|
1186
1193
|
-------
|
@@ -1196,42 +1203,25 @@ class SBML_dfs:
|
|
1196
1203
|
If ontologies is not a set
|
1197
1204
|
"""
|
1198
1205
|
# validate inputs
|
1199
|
-
entity_table = self.get_table(entity_type, required_attributes={"id"})
|
1200
|
-
entity_pk = self.schema[entity_type]["pk"]
|
1201
|
-
|
1202
|
-
utils.match_pd_vars(
|
1203
|
-
identifiers_df,
|
1204
|
-
req_vars={
|
1205
|
-
entity_pk,
|
1206
|
-
IDENTIFIERS.ONTOLOGY,
|
1207
|
-
IDENTIFIERS.IDENTIFIER,
|
1208
|
-
IDENTIFIERS.URL,
|
1209
|
-
IDENTIFIERS.BQB,
|
1210
|
-
},
|
1211
|
-
allow_series=False,
|
1212
|
-
).assert_present()
|
1213
|
-
|
1214
|
-
if ontologies is not None:
|
1215
|
-
if not isinstance(ontologies, set):
|
1216
|
-
# for clarity this should not be reachable based on type hints
|
1217
|
-
raise TypeError(
|
1218
|
-
f"ontologies must be a set, but got {type(ontologies).__name__}"
|
1219
|
-
)
|
1220
|
-
ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
|
1221
|
-
invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
|
1222
|
-
if len(invalid_ontologies) > 0:
|
1223
|
-
raise ValueError(
|
1224
|
-
f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
|
1225
|
-
f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
|
1226
|
-
)
|
1227
1206
|
|
1228
|
-
|
1229
|
-
|
1207
|
+
entity_type = sbml_dfs_utils.infer_entity_type(id_table)
|
1208
|
+
entity_table = self.get_table(entity_type, required_attributes={SCHEMA_DEFS.ID})
|
1209
|
+
entity_pk = self.schema[entity_type][SCHEMA_DEFS.PK]
|
1230
1210
|
|
1231
|
-
matching_identifiers =
|
1232
|
-
|
1233
|
-
|
1234
|
-
|
1211
|
+
matching_identifiers = id_tables.filter_id_table(
|
1212
|
+
id_table=id_table, identifiers=identifiers, ontologies=ontologies, bqbs=bqbs
|
1213
|
+
)
|
1214
|
+
|
1215
|
+
matching_keys = matching_identifiers[entity_pk].tolist()
|
1216
|
+
entity_subset = entity_table.loc[matching_keys]
|
1217
|
+
|
1218
|
+
if matching_identifiers.shape[0] != entity_subset.shape[0]:
|
1219
|
+
raise ValueError(
|
1220
|
+
f"Some identifiers did not match to an entity for {entity_type}. "
|
1221
|
+
"This suggests that the identifiers and sbml_dfs are not in sync. "
|
1222
|
+
"Please create new identifiers with sbml_dfs.get_characteristic_species_ids() "
|
1223
|
+
"or sbml_dfs.get_identifiers()."
|
1224
|
+
)
|
1235
1225
|
|
1236
1226
|
return entity_subset, matching_identifiers
|
1237
1227
|
|
napistu/sbml_dfs_utils.py
CHANGED
@@ -14,24 +14,29 @@ from napistu import utils
|
|
14
14
|
from napistu import identifiers
|
15
15
|
from napistu import indices
|
16
16
|
|
17
|
-
from napistu.constants import
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
17
|
+
from napistu.constants import (
|
18
|
+
BQB,
|
19
|
+
BQB_DEFINING_ATTRS,
|
20
|
+
BQB_DEFINING_ATTRS_LOOSE,
|
21
|
+
SBML_DFS,
|
22
|
+
SBML_DFS_SCHEMA,
|
23
|
+
SCHEMA_DEFS,
|
24
|
+
IDENTIFIERS,
|
25
|
+
INTERACTION_EDGELIST_EXPECTED_VARS,
|
26
|
+
ONTOLOGIES,
|
27
|
+
MINI_SBO_FROM_NAME,
|
28
|
+
MINI_SBO_TO_NAME,
|
29
|
+
REQUIRED_REACTION_FROMEDGELIST_COLUMNS,
|
30
|
+
SBO_ROLES_DEFS,
|
31
|
+
SBO_NAME_TO_ROLE,
|
32
|
+
VALID_SBO_TERM_NAMES,
|
33
|
+
VALID_SBO_TERMS,
|
34
|
+
)
|
35
|
+
from napistu.ingestion.constants import (
|
36
|
+
COMPARTMENTS_GO_TERMS,
|
37
|
+
GENERIC_COMPARTMENT,
|
38
|
+
VALID_COMPARTMENTS,
|
39
|
+
)
|
35
40
|
|
36
41
|
logger = logging.getLogger(__name__)
|
37
42
|
|
@@ -418,6 +423,65 @@ def id_formatter_inv(ids: list[str]) -> list[int]:
|
|
418
423
|
return id_val
|
419
424
|
|
420
425
|
|
426
|
+
def infer_entity_type(df: pd.DataFrame) -> str:
    """
    Infer the entity type of a DataFrame based on its structure and schema.

    Two strategies are tried in order:

    1. If the DataFrame's index name is the primary key of a schema table,
       that table's entity type is returned.
    2. Otherwise, the DataFrame's columns which are primary keys of any
       schema table are compared to each table's primary key plus foreign
       keys; the first table whose keys match exactly is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to analyze

    Returns
    -------
    str
        The inferred entity type name

    Raises
    ------
    ValueError
        If no entity type can be determined
    """
    schema = SBML_DFS_SCHEMA.SCHEMA

    # Map each primary key to the first entity type declaring it
    pk_to_entity = {}
    for entity_type, entity_schema in schema.items():
        pk = entity_schema.get(SCHEMA_DEFS.PK)
        if pk is not None:
            pk_to_entity.setdefault(pk, entity_type)

    # Check if index matches a primary key
    if df.index.name in pk_to_entity:
        return pk_to_entity[df.index.name]

    # Get DataFrame columns that are also primary keys
    df_columns = set(df.columns).intersection(pk_to_entity)

    # Check for exact match with primary key + foreign keys
    for entity_type, entity_schema in schema.items():
        expected_keys = set()

        pk = entity_schema.get(SCHEMA_DEFS.PK)
        if pk:
            expected_keys.add(pk)

        expected_keys.update(entity_schema.get(SCHEMA_DEFS.FK, []))

        # A table without any keys cannot identify a DataFrame; without this
        # guard a DataFrame lacking key columns would match it spuriously.
        if expected_keys and df_columns == expected_keys:
            return entity_type

    # No match found
    raise ValueError(
        f"No entity type matches DataFrame with columns: {sorted(df_columns)}"
    )
|
483
|
+
|
484
|
+
|
421
485
|
def match_entitydata_index_to_entity(
|
422
486
|
entity_data_dict: dict,
|
423
487
|
an_entity_data_type: str,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: napistu
|
3
|
-
Version: 0.4.
|
3
|
+
Version: 0.4.1
|
4
4
|
Summary: Connecting high-dimensional data to curated pathways
|
5
5
|
Home-page: https://github.com/napistu/napistu-py
|
6
6
|
Author: Sean Hackett
|
@@ -61,7 +61,12 @@ Dynamic: license-file
|
|
61
61
|
|
62
62
|
# Napistu Python Library
|
63
63
|
|
64
|
+
[PyPI version](https://badge.fury.io/py/napistu)
|
64
65
|
[Documentation Status](https://napistu.readthedocs.io/en/latest/?badge=latest)
|
66
|
+
[CI](https://github.com/napistu/napistu-py/actions/workflows/ci.yml)
|
67
|
+
[Release](https://github.com/napistu/napistu-py/actions/workflows/release.yml)
|
68
|
+
[Deploy](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml)
|
69
|
+
[Code style: black](https://github.com/psf/black)
|
65
70
|
|
66
71
|
This Python package hosts the majority of the algorithmic code for the [Napistu project](https://github.com/napistu/napistu).
|
67
72
|
|
@@ -1,18 +1,18 @@
|
|
1
1
|
napistu/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
|
2
2
|
napistu/__main__.py,sha256=xwlbh_0Ig3a-yG6BIJRiDPSN9R2HnX2pEBvlodlO6h4,29015
|
3
3
|
napistu/consensus.py,sha256=xWXiqIM6ot-SSPJZXTrVpohbINSCkZXBtRi-5REfk_g,69897
|
4
|
-
napistu/constants.py,sha256=
|
4
|
+
napistu/constants.py,sha256=8sp1l0cxu2rsnCrWBEEwhcBKvDtc4u0D0f_72zILLW0,13427
|
5
5
|
napistu/identifiers.py,sha256=e2-nTVzr5AINa0y1ER9218bKXyF2kAeJ9At22S4Z00o,33914
|
6
6
|
napistu/indices.py,sha256=Zjg3gE0JQ3T879lCPazYg-WXVE6hvcAr713ZKpJ32rk,9830
|
7
|
-
napistu/sbml_dfs_core.py,sha256=
|
8
|
-
napistu/sbml_dfs_utils.py,sha256=
|
7
|
+
napistu/sbml_dfs_core.py,sha256=s0OyoHs-AjOcbZu1d3KNkW_PI7Rxbhu5ZLpfQeO4iY8,72639
|
8
|
+
napistu/sbml_dfs_utils.py,sha256=w5dFcJFDKnKDK9jxPOCuCW8IccxdXmyNmP9vCUhVdf8,46184
|
9
9
|
napistu/source.py,sha256=UGpN70bqbC9gnKmM0ivSdQYim9hfzgABeXoQKzRr9oU,13646
|
10
10
|
napistu/utils.py,sha256=PEAsLn7VGN8JlNJQcAMYpjF1gr2mWmb5IqBsypP9hi0,35768
|
11
11
|
napistu/context/__init__.py,sha256=LQBEqipcHKK0E5UlDEg1ct-ymCs93IlUrUaH8BCevf0,242
|
12
12
|
napistu/context/discretize.py,sha256=Qq7zg46F_I-PvQIT2_pEDQV7YEtUQCxKoRvT5Gu9QsE,15052
|
13
13
|
napistu/context/filtering.py,sha256=l1oq-43ysSGqU9VmhTOO_pYT4DSMf20yxvktPC1MI0I,13696
|
14
14
|
napistu/gcs/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
|
15
|
-
napistu/gcs/constants.py,sha256=
|
15
|
+
napistu/gcs/constants.py,sha256=5hLp1pL7SHEiscLNKcdI4IeOP4vUaasBCIHJrEedl0o,2909
|
16
16
|
napistu/gcs/downloads.py,sha256=SvGv9WYr_Vt3guzyz1QiAuBndeKPTBtWSFLj1-QbLf4,6348
|
17
17
|
napistu/gcs/utils.py,sha256=eLSsvewWJdCguyj2k0ozUGP5BTemaE1PZg41Z3aY5kM,571
|
18
18
|
napistu/ingestion/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
|
@@ -73,6 +73,7 @@ napistu/ontologies/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1
|
|
73
73
|
napistu/ontologies/constants.py,sha256=GyOFvezSxDK1VigATcruTKtNhjcYaid1ggulEf_HEtQ,4345
|
74
74
|
napistu/ontologies/dogma.py,sha256=VVj6NKBgNym4SdOSu8g22OohALj7cbObhIJmdY2Sfy0,8860
|
75
75
|
napistu/ontologies/genodexito.py,sha256=ZZmb7V38BmFjy9VOGdxbD3-BD5tKGl5izr0nwO_eEdA,24967
|
76
|
+
napistu/ontologies/id_tables.py,sha256=q_31eQwlkRNFzLOkJNT4Fp6ra6kkzFOByzgJu5WFh0U,8372
|
76
77
|
napistu/ontologies/mygene.py,sha256=RMFQTWsLkeYxmsOPxxmeIya2phdcUMcF5V2abaS8MVg,11109
|
77
78
|
napistu/ontologies/renaming.py,sha256=aZR5oxjeZhse026fuvFyQiKM8PVzbBT915J8AfXGv1M,7006
|
78
79
|
napistu/rpy2/__init__.py,sha256=8WzSK_tmdcbyMUtb17OmqdQqbisqIBl8OQrDsaFDeX4,8356
|
@@ -82,7 +83,7 @@ napistu/rpy2/rids.py,sha256=AfXLTfTdonfspgAHYO0Ph7jSUWv8YuyT8x3fyLfAqc8,3413
|
|
82
83
|
napistu/scverse/__init__.py,sha256=Lgxr3iMQAkTzXE9BNz93CndNP5djzerLvmHM-D0PU3I,357
|
83
84
|
napistu/scverse/constants.py,sha256=0iAkhyJUIeFGHdLLU3fCaEU1O3Oix4qAsxr3CxGTjVs,653
|
84
85
|
napistu/scverse/loading.py,sha256=jqiE71XB-wdV50GyZrauFNY0Lai4bX9Fm2Gv80VR8t8,27016
|
85
|
-
napistu-0.4.
|
86
|
+
napistu-0.4.1.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
|
86
87
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
87
88
|
tests/conftest.py,sha256=t-GHb0MvSsC-MyhkFpOy2K3t5fi7eaig_Rc2xEQC-t8,9678
|
88
89
|
tests/test_consensus.py,sha256=Hzfrgp4SpkRDnEMVMD3f0UInSycndB8kKzC4wDDvRas,15076
|
@@ -113,14 +114,15 @@ tests/test_network_ng_utils.py,sha256=QVVuRnvCRfTSIlGdwQTIF9lr0wOwoc5gGeXAUY_Adg
|
|
113
114
|
tests/test_network_paths.py,sha256=TWZnxY5bF3m6gahcxcYJGrBIawh2-_vUcec1LyPmXV8,1686
|
114
115
|
tests/test_network_precompute.py,sha256=zwJrKNC3s8rIrsyAQfQMYxbl8HZXUr7u09nMJ_K8jiU,9005
|
115
116
|
tests/test_ontologies_genodexito.py,sha256=6fINyUiubHZqu7qxye09DQfJXw28ZMAJc3clPb-cCoY,2298
|
117
|
+
tests/test_ontologies_id_tables.py,sha256=CpwpbmQvTc1BaVd6jbDKHAVE2etwN0vx93nC8jpnMlE,7265
|
116
118
|
tests/test_ontologies_mygene.py,sha256=VkdRcKIWmcG6V-2dpfvsBiOJN5dO-j0RqZNxtJRcyBU,1583
|
117
119
|
tests/test_ontologies_renaming.py,sha256=pawp3pV1hxW8nskWc4f2YHwMUqTilEEBD2BtpcSay5Q,3839
|
118
120
|
tests/test_pathwayannot.py,sha256=bceosccNy9tgxQei_7j7ATBSSvBSxOngJvK-mAzR_K0,3312
|
119
121
|
tests/test_rpy2_callr.py,sha256=V4a-QH5krgYOQRgqzksMzIkGAFjBqKOAqgprxrH6bE0,2904
|
120
122
|
tests/test_rpy2_init.py,sha256=T3gnxC1O7XNvYM2P4018ikpPPAy-kwQLm7Erj0RfA-4,5895
|
121
123
|
tests/test_sbml.py,sha256=f25zj1NogYrmLluvBDboLameTuCiQ309433Qn3iPvhg,1483
|
122
|
-
tests/test_sbml_dfs_core.py,sha256=
|
123
|
-
tests/test_sbml_dfs_utils.py,sha256=
|
124
|
+
tests/test_sbml_dfs_core.py,sha256=nnLPpZTVtCznOBohk7CX67x6sMqktJWt-sZMWQKoaDs,26521
|
125
|
+
tests/test_sbml_dfs_utils.py,sha256=gWIhzUEtQlOR9c1TiCyhlSAELmWnBSncn6vCEqH5hl0,11029
|
124
126
|
tests/test_sbo.py,sha256=x_PENFaXYsrZIzOZu9cj_Wrej7i7SNGxgBYYvcigLs0,308
|
125
127
|
tests/test_scverse_loading.py,sha256=bnU1lQSYYWhOAs0IIBoi4ZohqPokDQJ0n_rtkAfEyMU,29948
|
126
128
|
tests/test_set_coverage.py,sha256=J-6m6LuOjcQa9pxRuWglSfJk4Ltm7kt_eOrn_Q-7P6Q,1604
|
@@ -129,8 +131,8 @@ tests/test_uncompartmentalize.py,sha256=nAk5kfAVLU9a2VWe2x2HYVcKqj-EnwmwddERIPRa
|
|
129
131
|
tests/test_utils.py,sha256=qPSpV-Q9b6vmdycgaDmQqtcvzKnAVnN9j5xJ9x-T6bg,23959
|
130
132
|
tests/utils.py,sha256=SoWQ_5roJteFGcMaOeEiQ5ucwq3Z2Fa3AAs9iXHTsJY,749
|
131
133
|
tests/test_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
132
|
-
napistu-0.4.
|
133
|
-
napistu-0.4.
|
134
|
-
napistu-0.4.
|
135
|
-
napistu-0.4.
|
136
|
-
napistu-0.4.
|
134
|
+
napistu-0.4.1.dist-info/METADATA,sha256=zl_710wCsatB3lKZAgHba-MLEOPSDOyrxs3b5FB6toA,4078
|
135
|
+
napistu-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
136
|
+
napistu-0.4.1.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
|
137
|
+
napistu-0.4.1.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
|
138
|
+
napistu-0.4.1.dist-info/RECORD,,
|
@@ -0,0 +1,198 @@
|
|
1
|
+
import pytest
|
2
|
+
import pandas as pd
|
3
|
+
from unittest.mock import patch
|
4
|
+
|
5
|
+
from napistu.ontologies import id_tables
|
6
|
+
from napistu.constants import (
|
7
|
+
BQB,
|
8
|
+
IDENTIFIERS,
|
9
|
+
ONTOLOGIES,
|
10
|
+
SBML_DFS,
|
11
|
+
VALID_BQB_TERMS,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
@pytest.fixture
def sample_id_table():
    """Build a four-row species identifier table used by the id_tables tests."""
    table_columns = {
        SBML_DFS.S_ID: ["s1", "s2", "s3", "s4"],
        IDENTIFIERS.ONTOLOGY: [
            ONTOLOGIES.GO,
            ONTOLOGIES.KEGG,
            ONTOLOGIES.REACTOME,
            ONTOLOGIES.WIKIPATHWAYS,
        ],
        IDENTIFIERS.IDENTIFIER: ["GO:0001", "hsa00010", "R-HSA-123", "WP123"],
        IDENTIFIERS.BQB: [BQB.IS, BQB.HAS_PART, BQB.IS_PART_OF, BQB.IS_VERSION_OF],
        IDENTIFIERS.URL: ["foo", "bar", "baz", "qux"],
        # extra column which is not part of the identifier schema
        "other_col": ["a", "b", "c", "d"],
    }
    return pd.DataFrame(table_columns)
|
33
|
+
|
34
|
+
|
35
|
+
def test_sanitize_id_table_values_valid_cases(sample_id_table):
|
36
|
+
"""Test all valid use cases for _sanitize_id_table_values function."""
|
37
|
+
|
38
|
+
# Test string input conversion
|
39
|
+
result = id_tables._sanitize_id_table_values(
|
40
|
+
ONTOLOGIES.GO, sample_id_table, IDENTIFIERS.ONTOLOGY
|
41
|
+
)
|
42
|
+
assert result == {ONTOLOGIES.GO}
|
43
|
+
assert isinstance(result, set)
|
44
|
+
|
45
|
+
# Test list input conversion
|
46
|
+
result = id_tables._sanitize_id_table_values(
|
47
|
+
[ONTOLOGIES.GO, ONTOLOGIES.KEGG], sample_id_table, IDENTIFIERS.ONTOLOGY
|
48
|
+
)
|
49
|
+
assert result == {ONTOLOGIES.GO, ONTOLOGIES.KEGG}
|
50
|
+
assert isinstance(result, set)
|
51
|
+
|
52
|
+
# Test set input unchanged
|
53
|
+
input_set = {ONTOLOGIES.GO, ONTOLOGIES.KEGG}
|
54
|
+
result = id_tables._sanitize_id_table_values(
|
55
|
+
input_set, sample_id_table, IDENTIFIERS.ONTOLOGY
|
56
|
+
)
|
57
|
+
assert result == input_set
|
58
|
+
assert isinstance(result, set)
|
59
|
+
|
60
|
+
# Test successful validation against valid_values
|
61
|
+
result = id_tables._sanitize_id_table_values(
|
62
|
+
BQB.IS, sample_id_table, IDENTIFIERS.BQB, set(VALID_BQB_TERMS)
|
63
|
+
)
|
64
|
+
assert result == {BQB.IS}
|
65
|
+
|
66
|
+
# Test duplicate values in input list are handled correctly
|
67
|
+
result = id_tables._sanitize_id_table_values(
|
68
|
+
[ONTOLOGIES.GO, ONTOLOGIES.GO, ONTOLOGIES.KEGG],
|
69
|
+
sample_id_table,
|
70
|
+
IDENTIFIERS.ONTOLOGY,
|
71
|
+
)
|
72
|
+
assert result == {
|
73
|
+
ONTOLOGIES.GO,
|
74
|
+
ONTOLOGIES.KEGG,
|
75
|
+
} # Duplicates removed by set conversion
|
76
|
+
|
77
|
+
# Test all values present in table
|
78
|
+
result = id_tables._sanitize_id_table_values(
|
79
|
+
[ONTOLOGIES.GO, ONTOLOGIES.KEGG, ONTOLOGIES.REACTOME],
|
80
|
+
sample_id_table,
|
81
|
+
IDENTIFIERS.ONTOLOGY,
|
82
|
+
)
|
83
|
+
assert result == {ONTOLOGIES.GO, ONTOLOGIES.KEGG, ONTOLOGIES.REACTOME}
|
84
|
+
|
85
|
+
# Test single value present in table
|
86
|
+
result = id_tables._sanitize_id_table_values(
|
87
|
+
ONTOLOGIES.WIKIPATHWAYS, sample_id_table, IDENTIFIERS.ONTOLOGY
|
88
|
+
)
|
89
|
+
assert result == {ONTOLOGIES.WIKIPATHWAYS}
|
90
|
+
|
91
|
+
# Test with different column (BQB)
|
92
|
+
result = id_tables._sanitize_id_table_values(
|
93
|
+
BQB.HAS_PART, sample_id_table, IDENTIFIERS.BQB
|
94
|
+
)
|
95
|
+
assert result == {BQB.HAS_PART}
|
96
|
+
|
97
|
+
|
98
|
+
@patch("napistu.ontologies.id_tables.logger")
|
99
|
+
def test_sanitize_id_table_values_error_cases(mock_logger, sample_id_table):
|
100
|
+
"""Test error cases and edge cases for _sanitize_id_table_values function."""
|
101
|
+
|
102
|
+
# Test invalid input types raise ValueError
|
103
|
+
with pytest.raises(ValueError, match="ontology must be a string, a set, or list"):
|
104
|
+
id_tables._sanitize_id_table_values(123, sample_id_table, IDENTIFIERS.ONTOLOGY)
|
105
|
+
|
106
|
+
with pytest.raises(ValueError, match="ontology must be a string, a set, or list"):
|
107
|
+
id_tables._sanitize_id_table_values(
|
108
|
+
{"key": "value"}, sample_id_table, IDENTIFIERS.ONTOLOGY
|
109
|
+
)
|
110
|
+
|
111
|
+
# Test validation failure against valid_values
|
112
|
+
with pytest.raises(
|
113
|
+
ValueError, match="The following bqb are not valid: INVALID_BQB"
|
114
|
+
):
|
115
|
+
id_tables._sanitize_id_table_values(
|
116
|
+
"INVALID_BQB", sample_id_table, IDENTIFIERS.BQB, set(VALID_BQB_TERMS), "bqb"
|
117
|
+
)
|
118
|
+
|
119
|
+
# Test multiple invalid values against valid_values
|
120
|
+
with pytest.raises(ValueError, match="The following bqb are not valid"):
|
121
|
+
id_tables._sanitize_id_table_values(
|
122
|
+
["INVALID1", "INVALID2"],
|
123
|
+
sample_id_table,
|
124
|
+
IDENTIFIERS.BQB,
|
125
|
+
set(VALID_BQB_TERMS),
|
126
|
+
"bqb",
|
127
|
+
)
|
128
|
+
|
129
|
+
# Test all values missing from table raises error
|
130
|
+
missing_values = {"MISSING1", "MISSING2"}
|
131
|
+
with pytest.raises(ValueError, match="None of the requested ontology are present"):
|
132
|
+
id_tables._sanitize_id_table_values(
|
133
|
+
missing_values, sample_id_table, IDENTIFIERS.ONTOLOGY
|
134
|
+
)
|
135
|
+
|
136
|
+
# Test a single requested value that is absent from the table raises an error
|
137
|
+
with pytest.raises(ValueError, match="None of the requested ontology are present"):
|
138
|
+
id_tables._sanitize_id_table_values(
|
139
|
+
"INVALID_ONTOLOGY", sample_id_table, IDENTIFIERS.ONTOLOGY
|
140
|
+
)
|
141
|
+
|
142
|
+
# Test custom value_type_name in error messages
|
143
|
+
with pytest.raises(ValueError, match="custom_type must be a string"):
|
144
|
+
id_tables._sanitize_id_table_values(
|
145
|
+
123, sample_id_table, IDENTIFIERS.ONTOLOGY, value_type_name="custom_type"
|
146
|
+
)
|
147
|
+
|
148
|
+
# Test default value_type_name uses column_name
|
149
|
+
with pytest.raises(ValueError, match="test_column must be a string"):
|
150
|
+
id_tables._sanitize_id_table_values(123, sample_id_table, "test_column")
|
151
|
+
|
152
|
+
# Test empty dataframe column
|
153
|
+
empty_df = pd.DataFrame({"ontology": []})
|
154
|
+
with pytest.raises(ValueError, match="None of the requested ontology are present"):
|
155
|
+
id_tables._sanitize_id_table_values("GO", empty_df, IDENTIFIERS.ONTOLOGY)
|
156
|
+
|
157
|
+
# Test that partially missing values log a warning but do not raise an error
|
158
|
+
mixed_values = {ONTOLOGIES.GO, "MISSING"} # GO exists, MISSING doesn't
|
159
|
+
result = id_tables._sanitize_id_table_values(
|
160
|
+
mixed_values, sample_id_table, IDENTIFIERS.ONTOLOGY
|
161
|
+
)
|
162
|
+
|
163
|
+
assert result == mixed_values
|
164
|
+
mock_logger.warning.assert_called_once()
|
165
|
+
warning_call = mock_logger.warning.call_args[0][0]
|
166
|
+
assert "MISSING" in warning_call
|
167
|
+
assert "not present in the id_table" in warning_call
|
168
|
+
|
169
|
+
# Test multiple partially missing values
|
170
|
+
mock_logger.reset_mock()
|
171
|
+
mixed_values = {ONTOLOGIES.GO, ONTOLOGIES.KEGG, "MISSING1", "MISSING2"}
|
172
|
+
result = id_tables._sanitize_id_table_values(
|
173
|
+
mixed_values, sample_id_table, IDENTIFIERS.ONTOLOGY
|
174
|
+
)
|
175
|
+
|
176
|
+
assert result == mixed_values
|
177
|
+
mock_logger.warning.assert_called_once()
|
178
|
+
warning_call = mock_logger.warning.call_args[0][0]
|
179
|
+
assert "MISSING1" in warning_call and "MISSING2" in warning_call
|
180
|
+
|
181
|
+
|
182
|
+
def test_filter_id_table_basic(sample_id_table):
|
183
|
+
"""Basic test for filter_id_table filtering by identifier, ontology, and bqb."""
|
184
|
+
|
185
|
+
# Use a known identifier, ontology, and bqb from the fixture
|
186
|
+
filtered = id_tables.filter_id_table(
|
187
|
+
id_table=sample_id_table,
|
188
|
+
identifiers=["GO:0001"],
|
189
|
+
ontologies=[ONTOLOGIES.GO],
|
190
|
+
bqbs=[BQB.IS],
|
191
|
+
)
|
192
|
+
# Should return a DataFrame with only the matching row
|
193
|
+
assert isinstance(filtered, pd.DataFrame)
|
194
|
+
assert len(filtered) == 1
|
195
|
+
row = filtered.iloc[0]
|
196
|
+
assert row[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.GO
|
197
|
+
assert row[IDENTIFIERS.IDENTIFIER] == "GO:0001"
|
198
|
+
assert row[IDENTIFIERS.BQB] == BQB.IS
|
tests/test_sbml_dfs_core.py
CHANGED
@@ -13,10 +13,12 @@ from napistu.modify import pathwayannot
|
|
13
13
|
|
14
14
|
from napistu import identifiers as napistu_identifiers
|
15
15
|
from napistu.constants import (
|
16
|
-
|
16
|
+
BQB,
|
17
17
|
BQB_DEFINING_ATTRS,
|
18
18
|
BQB_DEFINING_ATTRS_LOOSE,
|
19
|
-
|
19
|
+
SBML_DFS,
|
20
|
+
SCHEMA_DEFS,
|
21
|
+
ONTOLOGIES,
|
20
22
|
)
|
21
23
|
from napistu.sbml_dfs_core import SBML_dfs
|
22
24
|
from unittest.mock import patch
|
@@ -291,53 +293,62 @@ def test_read_sbml_with_invalid_ids():
|
|
291
293
|
|
292
294
|
|
293
295
|
def test_get_table(sbml_dfs):
|
294
|
-
assert isinstance(sbml_dfs.get_table(
|
295
|
-
assert isinstance(
|
296
|
+
assert isinstance(sbml_dfs.get_table(SBML_DFS.SPECIES), pd.DataFrame)
|
297
|
+
assert isinstance(
|
298
|
+
sbml_dfs.get_table(SBML_DFS.SPECIES, {SCHEMA_DEFS.ID}), pd.DataFrame
|
299
|
+
)
|
296
300
|
|
297
301
|
# invalid table
|
298
302
|
with pytest.raises(ValueError):
|
299
|
-
sbml_dfs.get_table("foo", {
|
303
|
+
sbml_dfs.get_table("foo", {SCHEMA_DEFS.ID})
|
300
304
|
|
301
305
|
# bad type
|
302
306
|
with pytest.raises(TypeError):
|
303
|
-
sbml_dfs.get_table(
|
307
|
+
sbml_dfs.get_table(SBML_DFS.REACTION_SPECIES, SCHEMA_DEFS.ID)
|
304
308
|
|
305
309
|
# reaction species don't have ids
|
306
310
|
with pytest.raises(ValueError):
|
307
|
-
sbml_dfs.get_table(
|
311
|
+
sbml_dfs.get_table(SBML_DFS.REACTION_SPECIES, {SCHEMA_DEFS.ID})
|
308
312
|
|
309
313
|
|
310
314
|
def test_search_by_name(sbml_dfs_metabolism):
|
311
|
-
assert
|
312
|
-
|
313
|
-
|
315
|
+
assert (
|
316
|
+
sbml_dfs_metabolism.search_by_name("atp", SBML_DFS.SPECIES, False).shape[0] == 1
|
317
|
+
)
|
318
|
+
assert sbml_dfs_metabolism.search_by_name("pyr", SBML_DFS.SPECIES).shape[0] == 3
|
319
|
+
assert (
|
320
|
+
sbml_dfs_metabolism.search_by_name("kinase", SBML_DFS.REACTIONS).shape[0] == 4
|
321
|
+
)
|
314
322
|
|
315
323
|
|
316
324
|
def test_search_by_id(sbml_dfs_metabolism):
|
317
|
-
identifiers_tbl = sbml_dfs_metabolism.get_identifiers(
|
325
|
+
identifiers_tbl = sbml_dfs_metabolism.get_identifiers(SBML_DFS.SPECIES)
|
318
326
|
ids, species = sbml_dfs_metabolism.search_by_ids(
|
319
|
-
["P40926"]
|
327
|
+
identifiers_tbl, identifiers=["P40926"]
|
320
328
|
)
|
321
329
|
assert ids.shape[0] == 1
|
322
330
|
assert species.shape[0] == 1
|
323
331
|
|
324
332
|
ids, species = sbml_dfs_metabolism.search_by_ids(
|
325
|
-
|
333
|
+
identifiers_tbl,
|
334
|
+
identifiers=["57540", "30744"],
|
335
|
+
ontologies={ONTOLOGIES.CHEBI},
|
326
336
|
)
|
327
337
|
assert ids.shape[0] == 2
|
328
338
|
assert species.shape[0] == 2
|
329
339
|
|
330
|
-
|
331
|
-
|
332
|
-
)
|
333
|
-
|
334
|
-
|
340
|
+
with pytest.raises(
|
341
|
+
ValueError, match="None of the requested identifiers are present"
|
342
|
+
):
|
343
|
+
ids, species = sbml_dfs_metabolism.search_by_ids(
|
344
|
+
identifiers_tbl, identifiers=["baz"] # Non-existent identifier
|
345
|
+
)
|
335
346
|
|
336
347
|
|
337
348
|
def test_species_status(sbml_dfs):
|
338
349
|
|
339
350
|
species = sbml_dfs.species
|
340
|
-
select_species = species[species[
|
351
|
+
select_species = species[species[SBML_DFS.S_NAME] == "OxyHbA"]
|
341
352
|
assert select_species.shape[0] == 1
|
342
353
|
|
343
354
|
status = sbml_dfs.species_status(select_species.index[0])
|
tests/test_sbml_dfs_utils.py
CHANGED
@@ -264,3 +264,73 @@ def test_sbo_constants_internal_consistency():
|
|
264
264
|
assert MINI_SBO_TO_NAME[term] == name
|
265
265
|
for term, name in MINI_SBO_TO_NAME.items():
|
266
266
|
assert MINI_SBO_FROM_NAME[name] == term
|
267
|
+
|
268
|
+
|
269
|
+
def test_infer_entity_type():
|
270
|
+
"""Test entity type inference with valid keys"""
|
271
|
+
# Test entity type inference when the index matches the primary key.
|
272
|
+
# Test compartments with index as primary key
|
273
|
+
df = pd.DataFrame(
|
274
|
+
{SBML_DFS.C_NAME: ["cytoplasm"], SBML_DFS.C_IDENTIFIERS: ["GO:0005737"]}
|
275
|
+
)
|
276
|
+
df.index.name = SBML_DFS.C_ID
|
277
|
+
result = sbml_dfs_utils.infer_entity_type(df)
|
278
|
+
assert result == SBML_DFS.COMPARTMENTS
|
279
|
+
|
280
|
+
# Test species with index as primary key
|
281
|
+
df = pd.DataFrame(
|
282
|
+
{SBML_DFS.S_NAME: ["glucose"], SBML_DFS.S_IDENTIFIERS: ["CHEBI:17234"]}
|
283
|
+
)
|
284
|
+
df.index.name = SBML_DFS.S_ID
|
285
|
+
result = sbml_dfs_utils.infer_entity_type(df)
|
286
|
+
assert result == SBML_DFS.SPECIES
|
287
|
+
|
288
|
+
# Test entity type inference by exact column matching.
|
289
|
+
# Test compartmentalized_species (has foreign keys)
|
290
|
+
df = pd.DataFrame(
|
291
|
+
{
|
292
|
+
SBML_DFS.SC_ID: ["glucose_c"],
|
293
|
+
SBML_DFS.S_ID: ["glucose"],
|
294
|
+
SBML_DFS.C_ID: ["cytoplasm"],
|
295
|
+
}
|
296
|
+
)
|
297
|
+
result = sbml_dfs_utils.infer_entity_type(df)
|
298
|
+
assert result == "compartmentalized_species"
|
299
|
+
|
300
|
+
# Test reaction_species (has foreign keys)
|
301
|
+
df = pd.DataFrame(
|
302
|
+
{
|
303
|
+
SBML_DFS.RSC_ID: ["rxn1_glc"],
|
304
|
+
SBML_DFS.R_ID: ["rxn1"],
|
305
|
+
SBML_DFS.SC_ID: ["glucose_c"],
|
306
|
+
}
|
307
|
+
)
|
308
|
+
result = sbml_dfs_utils.infer_entity_type(df)
|
309
|
+
assert result == SBML_DFS.REACTION_SPECIES
|
310
|
+
|
311
|
+
# Test reactions (only primary key)
|
312
|
+
df = pd.DataFrame({SBML_DFS.R_ID: ["rxn1"]})
|
313
|
+
result = sbml_dfs_utils.infer_entity_type(df)
|
314
|
+
assert result == SBML_DFS.REACTIONS
|
315
|
+
|
316
|
+
|
317
|
+
def test_infer_entity_type_errors():
|
318
|
+
"""Test error cases for entity type inference."""
|
319
|
+
# Test no matching entity type
|
320
|
+
df = pd.DataFrame({"random_column": ["value"], "another_col": ["data"]})
|
321
|
+
with pytest.raises(ValueError, match="No entity type matches DataFrame"):
|
322
|
+
sbml_dfs_utils.infer_entity_type(df)
|
323
|
+
|
324
|
+
# Test partial match (missing required foreign key)
|
325
|
+
df = pd.DataFrame(
|
326
|
+
{SBML_DFS.SC_ID: ["glucose_c"], SBML_DFS.S_ID: ["glucose"]}
|
327
|
+
) # Missing c_id
|
328
|
+
with pytest.raises(ValueError):
|
329
|
+
sbml_dfs_utils.infer_entity_type(df)
|
330
|
+
|
331
|
+
# Test extra primary keys that shouldn't be there
|
332
|
+
df = pd.DataFrame(
|
333
|
+
{SBML_DFS.R_ID: ["rxn1"], SBML_DFS.S_ID: ["glucose"]}
|
334
|
+
) # Two primary keys
|
335
|
+
with pytest.raises(ValueError):
|
336
|
+
sbml_dfs_utils.infer_entity_type(df)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|