napistu 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/constants.py CHANGED
@@ -402,12 +402,14 @@ ONTOLOGIES = SimpleNamespace(
402
402
  ENSEMBL_PROTEIN_VERSION="ensembl_protein_version",
403
403
  GENE_NAME="gene_name",
404
404
  GO="go",
405
+ KEGG="kegg",
405
406
  MIRBASE="mirbase",
406
407
  NCBI_ENTREZ_GENE="ncbi_entrez_gene",
407
408
  PHAROS="pharos",
408
409
  REACTOME="reactome",
409
410
  SYMBOL="symbol",
410
411
  UNIPROT="uniprot",
412
+ WIKIPATHWAYS="wikipathways",
411
413
  )
412
414
 
413
415
  ONTOLOGIES_LIST = list(ONTOLOGIES.__dict__.values())
napistu/gcs/constants.py CHANGED
@@ -5,17 +5,17 @@ from types import SimpleNamespace
5
5
 
6
6
  GCS_SUBASSET_NAMES = SimpleNamespace(
7
7
  SBML_DFS="sbml_dfs",
8
- IDENTIFIERS="identifiers",
9
- REGULATORY_GRAPH="regulatory_graph",
8
+ NAPISTU_GRAPH="napistu_graph",
9
+ SPECIES_IDENTIFIERS="species_identifiers",
10
10
  REGULATORY_DISTANCES="regulatory_distances",
11
11
  )
12
12
 
13
13
 
14
14
  GCS_FILETYPES = SimpleNamespace(
15
15
  SBML_DFS="sbml_dfs.pkl",
16
- IDENTIFIERS="identifiers.tsv",
17
- REGULATORY_GRAPH="regulatory_graph.pkl",
18
- REGULATORY_DISTANCES="regulatory_distances.json",
16
+ NAPISTU_GRAPH="napistu_graph.pkl",
17
+ SPECIES_IDENTIFIERS="species_identifiers.tsv",
18
+ REGULATORY_DISTANCES="regulatory_distances.parquet",
19
19
  )
20
20
 
21
21
 
@@ -27,8 +27,8 @@ GCS_ASSETS = SimpleNamespace(
27
27
  "file": "test_pathway.tar.gz",
28
28
  "subassets": {
29
29
  GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
30
- GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
31
- GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
30
+ GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
31
+ GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
32
32
  GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
33
33
  },
34
34
  "public_url": "https://storage.googleapis.com/shackett-napistu-public/test_pathway.tar.gz",
@@ -37,8 +37,8 @@ GCS_ASSETS = SimpleNamespace(
37
37
  "file": "human_consensus.tar.gz",
38
38
  "subassets": {
39
39
  GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
40
- GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
41
- GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
40
+ GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
41
+ GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
42
42
  },
43
43
  "public_url": "https://storage.googleapis.com/shackett-napistu-public/human_consensus.tar.gz",
44
44
  },
@@ -46,8 +46,8 @@ GCS_ASSETS = SimpleNamespace(
46
46
  "file": "human_consensus_w_distances.tar.gz",
47
47
  "subassets": {
48
48
  GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
49
- GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
50
- GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
49
+ GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
50
+ GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
51
51
  GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
52
52
  },
53
53
  "public_url": "https://storage.googleapis.com/calico-cpr-public/human_consensus_w_distances.tar.gz",
@@ -0,0 +1,282 @@
1
+ import logging
2
+ from typing import Optional, Union, Set
3
+
4
+ import pandas as pd
5
+
6
+ from napistu import sbml_dfs_utils
7
+ from napistu.constants import (
8
+ BQB,
9
+ BQB_DEFINING_ATTRS_LOOSE,
10
+ IDENTIFIERS,
11
+ SBML_DFS_SCHEMA,
12
+ SCHEMA_DEFS,
13
+ VALID_BQB_TERMS,
14
+ )
15
+ from napistu import utils
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def filter_id_table(
    id_table: pd.DataFrame,
    identifiers: Optional[Union[str, list, set]] = None,
    ontologies: Optional[Union[str, list, set]] = None,
    bqbs: Optional[Union[str, list, set]] = BQB_DEFINING_ATTRS_LOOSE + [BQB.HAS_PART],
) -> pd.DataFrame:
    """
    Filter an identifier table by identifiers, ontologies, and BQB terms for a given entity type.

    Parameters
    ----------
    id_table : pd.DataFrame
        DataFrame containing identifier mappings to be filtered.
    identifiers : str, list, set, or None, optional
        Identifiers to filter by. If None, no filtering is applied on identifiers.
    ontologies : str, list, set, or None, optional
        Ontologies to filter by. If None, no filtering is applied on ontologies.
    bqbs : str, list, set, or None, optional
        BQB terms to filter by. If None, no filtering is applied on BQB terms.
        Default is ``BQB_DEFINING_ATTRS_LOOSE + [BQB.HAS_PART]``.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame containing only rows matching the specified criteria.

    Raises
    ------
    ValueError
        If the id_table or filter values are invalid, or required columns are missing.
    """

    entity_type = sbml_dfs_utils.infer_entity_type(id_table)
    _validate_id_table(id_table, entity_type)

    # Each filter is optional; when requested, the sanitizer validates the
    # values against the table contents (and, for bqbs, VALID_BQB_TERMS)
    # before the query is applied.
    if bqbs is not None:
        bqbs = _sanitize_id_table_bqbs(bqbs, id_table)
        id_table = id_table.query("bqb in @bqbs")

    if ontologies is not None:
        ontologies = _sanitize_id_table_ontologies(ontologies, id_table)
        id_table = id_table.query("ontology in @ontologies")

    if identifiers is not None:
        identifiers = _sanitize_id_table_identifiers(identifiers, id_table)
        id_table = id_table.query("identifier in @identifiers")

    # return the filtered id_table
    return id_table
71
+
72
+
73
def _validate_id_table(id_table: pd.DataFrame, entity_type: str) -> None:
    """
    Validate that the id_table contains the required columns and matches the schema for the given entity_type.

    Parameters
    ----------
    id_table : pd.DataFrame
        DataFrame containing identifier mappings for a given entity type.
    entity_type : str
        The type of entity (e.g., 'species', 'reactions') to validate against the schema.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If entity_type is not present in the schema, or if required columns are missing in id_table.
    """

    schema = SBML_DFS_SCHEMA.SCHEMA

    # the entity type must name one of the schema's tables
    if entity_type not in schema:
        raise ValueError(
            f"{entity_type} does not match a table in the SBML_dfs object. The tables "
            f"which are present are {', '.join(schema.keys())}"
        )

    entity_attrs = schema[entity_type]

    # only tables with an 'id' attribute carry identifier annotations
    if SCHEMA_DEFS.ID not in entity_attrs:
        raise ValueError(f"{entity_type} does not have an 'id' attribute")

    # the table must expose its primary key plus the standard identifier columns
    required_columns = {
        entity_attrs[SCHEMA_DEFS.PK],
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.URL,
        IDENTIFIERS.BQB,
    }

    utils.match_pd_vars(
        id_table,
        req_vars=required_columns,
        allow_series=False,
    ).assert_present()

    return None
122
+
123
+
124
+ def _sanitize_id_table_values(
125
+ values: Union[str, list, set],
126
+ id_table: pd.DataFrame,
127
+ column_name: str,
128
+ valid_values: Optional[Set[str]] = None,
129
+ value_type_name: str = None,
130
+ ) -> set:
131
+ """
132
+ Generic function to sanitize and validate values against an id_table column.
133
+
134
+ Parameters
135
+ ----------
136
+ values : str, list, or set
137
+ Values to sanitize and validate. Can be a single string, list of strings,
138
+ or set of strings.
139
+ id_table : pd.DataFrame
140
+ DataFrame containing the reference data to validate against.
141
+ column_name : str
142
+ Name of the column in id_table to check values against.
143
+ valid_values : set of str, optional
144
+ Optional set of globally valid values for additional validation
145
+ (e.g., VALID_BQB_TERMS). If provided, values must be a subset of this set.
146
+ value_type_name : str, optional
147
+ Human-readable name for the value type used in error messages.
148
+ If None, defaults to column_name.
149
+
150
+ Returns
151
+ -------
152
+ set
153
+ Set of sanitized and validated values.
154
+
155
+ Raises
156
+ ------
157
+ ValueError
158
+ If values is not a string, list, or set.
159
+ If any values are not in valid_values (when provided).
160
+ If none of the requested values are present in the id_table.
161
+
162
+ Warnings
163
+ --------
164
+ Logs a warning if some (but not all) requested values are missing from id_table.
165
+ """
166
+ if value_type_name is None:
167
+ value_type_name = column_name
168
+
169
+ # Convert to set
170
+ if isinstance(values, str):
171
+ values = {values}
172
+ elif isinstance(values, list):
173
+ values = set(values)
174
+ elif isinstance(values, set):
175
+ pass
176
+ else:
177
+ raise ValueError(
178
+ f"{value_type_name} must be a string, a set, or list, got {type(values).__name__}"
179
+ )
180
+
181
+ # Check against global valid values if provided
182
+ if valid_values is not None:
183
+ invalid_values = values.difference(valid_values)
184
+ if len(invalid_values) > 0:
185
+ raise ValueError(
186
+ f"The following {value_type_name} are not valid: {', '.join(invalid_values)}.\n"
187
+ f"Valid {value_type_name} are {', '.join(valid_values)}"
188
+ )
189
+
190
+ # Check against values present in the id_table
191
+ available_values = set(id_table[column_name].unique())
192
+ missing_values = values.difference(available_values)
193
+
194
+ if len(missing_values) == len(values):
195
+ raise ValueError(
196
+ f"None of the requested {value_type_name} are present in the id_table: {', '.join(missing_values)}.\n"
197
+ f"The included {value_type_name} are {', '.join(available_values)}"
198
+ )
199
+ elif len(missing_values) > 0:
200
+ logger.warning(
201
+ f"The following {value_type_name} are not present in the id_table: {', '.join(missing_values)}.\n"
202
+ f"The included {value_type_name} are {', '.join(available_values)}"
203
+ )
204
+
205
+ return values
206
+
207
+
208
def _sanitize_id_table_ontologies(
    ontologies: Union[str, list, set], id_table: pd.DataFrame
) -> set:
    """
    Sanitize and validate ontologies against the id_table.

    Thin wrapper around ``_sanitize_id_table_values`` bound to the
    ontology column.

    Parameters
    ----------
    ontologies : str, list, or set
        Ontology names to validate.
    id_table : pd.DataFrame
        DataFrame containing ontology reference data.

    Returns
    -------
    set
        Set of validated ontology names.
    """
    return _sanitize_id_table_values(
        ontologies,
        id_table,
        IDENTIFIERS.ONTOLOGY,
        value_type_name="ontologies",
    )
232
+
233
+
234
def _sanitize_id_table_bqbs(bqbs: Union[str, list, set], id_table: pd.DataFrame) -> set:
    """
    Sanitize and validate BQBs against the id_table.

    Thin wrapper around ``_sanitize_id_table_values`` bound to the BQB
    column; additionally checks membership in VALID_BQB_TERMS.

    Parameters
    ----------
    bqbs : str, list, or set
        BQB terms to validate.
    id_table : pd.DataFrame
        DataFrame containing BQB reference data.

    Returns
    -------
    set
        Set of validated BQB terms.
    """
    return _sanitize_id_table_values(
        bqbs,
        id_table,
        IDENTIFIERS.BQB,
        valid_values=VALID_BQB_TERMS,
        value_type_name="bqbs",
    )
257
+
258
+
259
def _sanitize_id_table_identifiers(
    identifiers: Union[str, list, set], id_table: pd.DataFrame
) -> set:
    """
    Sanitize and validate identifiers against the id_table.

    Thin wrapper around ``_sanitize_id_table_values`` bound to the
    identifier column.

    Parameters
    ----------
    identifiers : str, list, or set
        Identifier values to validate.
    id_table : pd.DataFrame
        DataFrame containing identifier reference data.

    Returns
    -------
    set
        Set of validated identifiers.
    """
    return _sanitize_id_table_values(
        identifiers,
        id_table,
        IDENTIFIERS.IDENTIFIER,
        value_type_name="identifiers",
    )
napistu/sbml_dfs_core.py CHANGED
@@ -19,17 +19,23 @@ from napistu import sbml_dfs_utils
19
19
  from napistu import source
20
20
  from napistu import utils
21
21
  from napistu.ingestion import sbml
22
- from napistu.constants import SBML_DFS
23
- from napistu.constants import SBML_DFS_SCHEMA
24
- from napistu.constants import IDENTIFIERS
25
- from napistu.constants import NAPISTU_STANDARD_OUTPUTS
26
- from napistu.constants import BQB_PRIORITIES
27
- from napistu.constants import ONTOLOGY_PRIORITIES
28
- from napistu.constants import MINI_SBO_FROM_NAME
29
- from napistu.constants import MINI_SBO_TO_NAME
30
- from napistu.constants import SBOTERM_NAMES
31
- from napistu.constants import ENTITIES_W_DATA
32
- from napistu.constants import ENTITIES_TO_ENTITY_DATA
22
+ from napistu.ontologies import id_tables
23
+ from napistu.constants import (
24
+ BQB,
25
+ BQB_DEFINING_ATTRS_LOOSE,
26
+ BQB_PRIORITIES,
27
+ ENTITIES_W_DATA,
28
+ ENTITIES_TO_ENTITY_DATA,
29
+ IDENTIFIERS,
30
+ MINI_SBO_FROM_NAME,
31
+ MINI_SBO_TO_NAME,
32
+ NAPISTU_STANDARD_OUTPUTS,
33
+ ONTOLOGY_PRIORITIES,
34
+ SBML_DFS,
35
+ SBML_DFS_SCHEMA,
36
+ SBOTERM_NAMES,
37
+ SCHEMA_DEFS,
38
+ )
33
39
 
34
40
  logger = logging.getLogger(__name__)
35
41
 
@@ -101,7 +107,7 @@ class SBML_dfs:
101
107
  Remove a reactions data table by label.
102
108
  remove_species_data(label)
103
109
  Remove a species data table by label.
104
- search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
110
+ search_by_ids(id_table, identifiers=None, ontologies=None, bqbs=None)
105
111
  Find entities and identifiers matching a set of query IDs.
106
112
  search_by_name(name, entity_type, partial_match=True)
107
113
  Find entities by exact or partial name match.
@@ -455,12 +461,12 @@ class SBML_dfs:
455
461
  ValueError
456
462
  If id_type is invalid or identifiers are malformed
457
463
  """
458
- selected_table = self.get_table(id_type, {"id"})
464
+ selected_table = self.get_table(id_type, {SCHEMA_DEFS.ID})
459
465
  schema = SBML_DFS_SCHEMA.SCHEMA
460
466
 
461
467
  identifiers_dict = dict()
462
468
  for sysid in selected_table.index:
463
- id_entry = selected_table[schema[id_type]["id"]][sysid]
469
+ id_entry = selected_table[schema[id_type][SCHEMA_DEFS.ID]][sysid]
464
470
 
465
471
  if isinstance(id_entry, identifiers.Identifiers):
466
472
  identifiers_dict[sysid] = pd.DataFrame(id_entry.ids)
@@ -473,16 +479,16 @@ class SBML_dfs:
473
479
  )
474
480
  if not identifiers_dict:
475
481
  # Return empty DataFrame with expected columns if nothing found
476
- return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
482
+ return pd.DataFrame(columns=[schema[id_type][SCHEMA_DEFS.PK], "entry"])
477
483
 
478
484
  identifiers_tbl = pd.concat(identifiers_dict)
479
485
 
480
- identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
486
+ identifiers_tbl.index.names = [schema[id_type][SCHEMA_DEFS.PK], "entry"]
481
487
  identifiers_tbl = identifiers_tbl.reset_index()
482
488
 
483
489
  named_identifiers = identifiers_tbl.merge(
484
- selected_table.drop(schema[id_type]["id"], axis=1),
485
- left_on=schema[id_type]["pk"],
490
+ selected_table.drop(schema[id_type][SCHEMA_DEFS.ID], axis=1),
491
+ left_on=schema[id_type][SCHEMA_DEFS.PK],
486
492
  right_index=True,
487
493
  )
488
494
 
@@ -1163,24 +1169,25 @@ class SBML_dfs:
1163
1169
 
1164
1170
  def search_by_ids(
1165
1171
  self,
1166
- ids: list[str],
1167
- entity_type: str,
1168
- identifiers_df: pd.DataFrame,
1169
- ontologies: None | set[str] = None,
1172
+ id_table: pd.DataFrame,
1173
+ identifiers: Optional[Union[str, list, set]] = None,
1174
+ ontologies: Optional[Union[str, list, set]] = None,
1175
+ bqbs: Optional[Union[str, list, set]] = BQB_DEFINING_ATTRS_LOOSE
1176
+ + [BQB.HAS_PART],
1170
1177
  ) -> tuple[pd.DataFrame, pd.DataFrame]:
1171
1178
  """
1172
1179
  Find entities and identifiers matching a set of query IDs.
1173
1180
 
1174
1181
  Parameters
1175
1182
  ----------
1176
- ids : List[str]
1177
- List of identifiers to search for
1178
- entity_type : str
1179
- Type of entity to search (e.g., 'species', 'reactions')
1180
- identifiers_df : pd.DataFrame
1183
+ id_table : pd.DataFrame
1181
1184
  DataFrame containing identifier mappings
1182
- ontologies : Optional[Set[str]], optional
1183
- Set of ontologies to filter by, by default None
1185
+ identifiers : Optional[Union[str, list, set]], optional
1186
+ Identifiers to filter by, by default None
1187
+ ontologies : Optional[Union[str, list, set]], optional
1188
+ Ontologies to filter by, by default None
1189
+ bqbs : Optional[Union[str, list, set]], optional
1190
+ BQB terms to filter by, by default [BQB.IS, BQB.HAS_PART]
1184
1191
 
1185
1192
  Returns
1186
1193
  -------
@@ -1196,42 +1203,25 @@ class SBML_dfs:
1196
1203
  If ontologies is not a set
1197
1204
  """
1198
1205
  # validate inputs
1199
- entity_table = self.get_table(entity_type, required_attributes={"id"})
1200
- entity_pk = self.schema[entity_type]["pk"]
1201
-
1202
- utils.match_pd_vars(
1203
- identifiers_df,
1204
- req_vars={
1205
- entity_pk,
1206
- IDENTIFIERS.ONTOLOGY,
1207
- IDENTIFIERS.IDENTIFIER,
1208
- IDENTIFIERS.URL,
1209
- IDENTIFIERS.BQB,
1210
- },
1211
- allow_series=False,
1212
- ).assert_present()
1213
-
1214
- if ontologies is not None:
1215
- if not isinstance(ontologies, set):
1216
- # for clarity this should not be reachable based on type hints
1217
- raise TypeError(
1218
- f"ontologies must be a set, but got {type(ontologies).__name__}"
1219
- )
1220
- ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
1221
- invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
1222
- if len(invalid_ontologies) > 0:
1223
- raise ValueError(
1224
- f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
1225
- f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
1226
- )
1227
1206
 
1228
- # fitler to just to identifiers matchign the ontologies of interest
1229
- identifiers_df = identifiers_df.query("ontology in @ontologies")
1207
+ entity_type = sbml_dfs_utils.infer_entity_type(id_table)
1208
+ entity_table = self.get_table(entity_type, required_attributes={SCHEMA_DEFS.ID})
1209
+ entity_pk = self.schema[entity_type][SCHEMA_DEFS.PK]
1230
1210
 
1231
- matching_identifiers = identifiers_df.loc[
1232
- identifiers_df["identifier"].isin(ids)
1233
- ]
1234
- entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
1211
+ matching_identifiers = id_tables.filter_id_table(
1212
+ id_table=id_table, identifiers=identifiers, ontologies=ontologies, bqbs=bqbs
1213
+ )
1214
+
1215
+ matching_keys = matching_identifiers[entity_pk].tolist()
1216
+ entity_subset = entity_table.loc[matching_keys]
1217
+
1218
+ if matching_identifiers.shape[0] != entity_subset.shape[0]:
1219
+ raise ValueError(
1220
+ f"Some identifiers did not match to an entity for {entity_type}. "
1221
+ "This suggests that the identifiers and sbml_dfs are not in sync. "
1222
+ "Please create new identifiers with sbml_dfs.get_characteristic_species_ids() "
1223
+ "or sbml_dfs.get_identifiers()."
1224
+ )
1235
1225
 
1236
1226
  return entity_subset, matching_identifiers
1237
1227
 
napistu/sbml_dfs_utils.py CHANGED
@@ -14,24 +14,29 @@ from napistu import utils
14
14
  from napistu import identifiers
15
15
  from napistu import indices
16
16
 
17
- from napistu.constants import BQB
18
- from napistu.constants import SBML_DFS
19
- from napistu.constants import SBML_DFS_SCHEMA
20
- from napistu.constants import IDENTIFIERS
21
- from napistu.constants import BQB_DEFINING_ATTRS
22
- from napistu.constants import BQB_DEFINING_ATTRS_LOOSE
23
- from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
24
- from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
25
- from napistu.constants import SBO_ROLES_DEFS
26
- from napistu.constants import MINI_SBO_FROM_NAME
27
- from napistu.constants import MINI_SBO_TO_NAME
28
- from napistu.constants import SBO_NAME_TO_ROLE
29
- from napistu.constants import ONTOLOGIES
30
- from napistu.constants import VALID_SBO_TERM_NAMES
31
- from napistu.constants import VALID_SBO_TERMS
32
- from napistu.ingestion.constants import VALID_COMPARTMENTS
33
- from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
34
- from napistu.ingestion.constants import GENERIC_COMPARTMENT
17
+ from napistu.constants import (
18
+ BQB,
19
+ BQB_DEFINING_ATTRS,
20
+ BQB_DEFINING_ATTRS_LOOSE,
21
+ SBML_DFS,
22
+ SBML_DFS_SCHEMA,
23
+ SCHEMA_DEFS,
24
+ IDENTIFIERS,
25
+ INTERACTION_EDGELIST_EXPECTED_VARS,
26
+ ONTOLOGIES,
27
+ MINI_SBO_FROM_NAME,
28
+ MINI_SBO_TO_NAME,
29
+ REQUIRED_REACTION_FROMEDGELIST_COLUMNS,
30
+ SBO_ROLES_DEFS,
31
+ SBO_NAME_TO_ROLE,
32
+ VALID_SBO_TERM_NAMES,
33
+ VALID_SBO_TERMS,
34
+ )
35
+ from napistu.ingestion.constants import (
36
+ COMPARTMENTS_GO_TERMS,
37
+ GENERIC_COMPARTMENT,
38
+ VALID_COMPARTMENTS,
39
+ )
35
40
 
36
41
  logger = logging.getLogger(__name__)
37
42
 
@@ -418,6 +423,65 @@ def id_formatter_inv(ids: list[str]) -> list[int]:
418
423
  return id_val
419
424
 
420
425
 
426
def infer_entity_type(df: pd.DataFrame) -> str:
    """
    Infer the entity type of a DataFrame based on its structure and schema.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to analyze

    Returns
    -------
    str
        The inferred entity type name

    Raises
    ------
    ValueError
        If no entity type can be determined
    """
    schema = SBML_DFS_SCHEMA.SCHEMA

    # Map each entity's primary key back to its entity type
    pk_to_entity = {
        entity_schema[SCHEMA_DEFS.PK]: entity_type
        for entity_type, entity_schema in schema.items()
        if entity_schema.get(SCHEMA_DEFS.PK) is not None
    }

    # A DataFrame indexed by a primary key identifies its table directly
    if df.index.name in pk_to_entity:
        return pk_to_entity[df.index.name]

    # Otherwise compare the DataFrame's key-like columns against each
    # entity's primary key + foreign keys, requiring an exact match
    key_columns = set(df.columns) & set(pk_to_entity)

    for entity_type, entity_schema in schema.items():
        expected_keys = set()

        pk = entity_schema.get(SCHEMA_DEFS.PK)
        if pk:
            expected_keys.add(pk)

        expected_keys.update(entity_schema.get(SCHEMA_DEFS.FK, []))

        if key_columns == expected_keys:
            return entity_type

    # No match found
    raise ValueError(
        f"No entity type matches DataFrame with columns: {sorted(key_columns)}"
    )
483
+
484
+
421
485
  def match_entitydata_index_to_entity(
422
486
  entity_data_dict: dict,
423
487
  an_entity_data_type: str,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: napistu
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Connecting high-dimensional data to curated pathways
5
5
  Home-page: https://github.com/napistu/napistu-py
6
6
  Author: Sean Hackett
@@ -61,7 +61,12 @@ Dynamic: license-file
61
61
 
62
62
  # Napistu Python Library
63
63
 
64
+ [![PyPI version](https://badge.fury.io/py/napistu.svg)](https://badge.fury.io/py/napistu)
64
65
  [![Documentation Status](https://readthedocs.org/projects/napistu/badge/?version=latest)](https://napistu.readthedocs.io/en/latest/?badge=latest)
66
+ [![CI](https://github.com/napistu/napistu-py/actions/workflows/ci.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/ci.yml)
67
+ [![Release](https://github.com/napistu/napistu-py/actions/workflows/release.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/release.yml)
68
+ [![Deploy to Cloud Run](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml)
69
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
65
70
 
66
71
  This Python package hosts the majority of the algorithmic code for the [Napistu project](https://github.com/napistu/napistu).
67
72
 
@@ -1,18 +1,18 @@
1
1
  napistu/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
2
2
  napistu/__main__.py,sha256=xwlbh_0Ig3a-yG6BIJRiDPSN9R2HnX2pEBvlodlO6h4,29015
3
3
  napistu/consensus.py,sha256=xWXiqIM6ot-SSPJZXTrVpohbINSCkZXBtRi-5REfk_g,69897
4
- napistu/constants.py,sha256=CA-8OnE8LudLSML8piIdfPyRwgdp143Yh9eVwk3uLTw,13377
4
+ napistu/constants.py,sha256=8sp1l0cxu2rsnCrWBEEwhcBKvDtc4u0D0f_72zILLW0,13427
5
5
  napistu/identifiers.py,sha256=e2-nTVzr5AINa0y1ER9218bKXyF2kAeJ9At22S4Z00o,33914
6
6
  napistu/indices.py,sha256=Zjg3gE0JQ3T879lCPazYg-WXVE6hvcAr713ZKpJ32rk,9830
7
- napistu/sbml_dfs_core.py,sha256=3Z2Kg-aVnZMGK9iK-_vztY2ORgNpta8BUMuWEZg80iE,73125
8
- napistu/sbml_dfs_utils.py,sha256=hfTxPerJIKuPC_jpGRCGHE8XMagsHTEe1h2E99VEWL4,44980
7
+ napistu/sbml_dfs_core.py,sha256=s0OyoHs-AjOcbZu1d3KNkW_PI7Rxbhu5ZLpfQeO4iY8,72639
8
+ napistu/sbml_dfs_utils.py,sha256=w5dFcJFDKnKDK9jxPOCuCW8IccxdXmyNmP9vCUhVdf8,46184
9
9
  napistu/source.py,sha256=UGpN70bqbC9gnKmM0ivSdQYim9hfzgABeXoQKzRr9oU,13646
10
10
  napistu/utils.py,sha256=PEAsLn7VGN8JlNJQcAMYpjF1gr2mWmb5IqBsypP9hi0,35768
11
11
  napistu/context/__init__.py,sha256=LQBEqipcHKK0E5UlDEg1ct-ymCs93IlUrUaH8BCevf0,242
12
12
  napistu/context/discretize.py,sha256=Qq7zg46F_I-PvQIT2_pEDQV7YEtUQCxKoRvT5Gu9QsE,15052
13
13
  napistu/context/filtering.py,sha256=l1oq-43ysSGqU9VmhTOO_pYT4DSMf20yxvktPC1MI0I,13696
14
14
  napistu/gcs/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
15
- napistu/gcs/constants.py,sha256=g6PaU99GY5XvaRHx4BGmWHUpcJ36-Zh_GzeNVOeHviM,2856
15
+ napistu/gcs/constants.py,sha256=5hLp1pL7SHEiscLNKcdI4IeOP4vUaasBCIHJrEedl0o,2909
16
16
  napistu/gcs/downloads.py,sha256=SvGv9WYr_Vt3guzyz1QiAuBndeKPTBtWSFLj1-QbLf4,6348
17
17
  napistu/gcs/utils.py,sha256=eLSsvewWJdCguyj2k0ozUGP5BTemaE1PZg41Z3aY5kM,571
18
18
  napistu/ingestion/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
@@ -73,6 +73,7 @@ napistu/ontologies/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1
73
73
  napistu/ontologies/constants.py,sha256=GyOFvezSxDK1VigATcruTKtNhjcYaid1ggulEf_HEtQ,4345
74
74
  napistu/ontologies/dogma.py,sha256=VVj6NKBgNym4SdOSu8g22OohALj7cbObhIJmdY2Sfy0,8860
75
75
  napistu/ontologies/genodexito.py,sha256=ZZmb7V38BmFjy9VOGdxbD3-BD5tKGl5izr0nwO_eEdA,24967
76
+ napistu/ontologies/id_tables.py,sha256=q_31eQwlkRNFzLOkJNT4Fp6ra6kkzFOByzgJu5WFh0U,8372
76
77
  napistu/ontologies/mygene.py,sha256=RMFQTWsLkeYxmsOPxxmeIya2phdcUMcF5V2abaS8MVg,11109
77
78
  napistu/ontologies/renaming.py,sha256=aZR5oxjeZhse026fuvFyQiKM8PVzbBT915J8AfXGv1M,7006
78
79
  napistu/rpy2/__init__.py,sha256=8WzSK_tmdcbyMUtb17OmqdQqbisqIBl8OQrDsaFDeX4,8356
@@ -82,7 +83,7 @@ napistu/rpy2/rids.py,sha256=AfXLTfTdonfspgAHYO0Ph7jSUWv8YuyT8x3fyLfAqc8,3413
82
83
  napistu/scverse/__init__.py,sha256=Lgxr3iMQAkTzXE9BNz93CndNP5djzerLvmHM-D0PU3I,357
83
84
  napistu/scverse/constants.py,sha256=0iAkhyJUIeFGHdLLU3fCaEU1O3Oix4qAsxr3CxGTjVs,653
84
85
  napistu/scverse/loading.py,sha256=jqiE71XB-wdV50GyZrauFNY0Lai4bX9Fm2Gv80VR8t8,27016
85
- napistu-0.4.0.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
86
+ napistu-0.4.1.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
86
87
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
88
  tests/conftest.py,sha256=t-GHb0MvSsC-MyhkFpOy2K3t5fi7eaig_Rc2xEQC-t8,9678
88
89
  tests/test_consensus.py,sha256=Hzfrgp4SpkRDnEMVMD3f0UInSycndB8kKzC4wDDvRas,15076
@@ -113,14 +114,15 @@ tests/test_network_ng_utils.py,sha256=QVVuRnvCRfTSIlGdwQTIF9lr0wOwoc5gGeXAUY_Adg
113
114
  tests/test_network_paths.py,sha256=TWZnxY5bF3m6gahcxcYJGrBIawh2-_vUcec1LyPmXV8,1686
114
115
  tests/test_network_precompute.py,sha256=zwJrKNC3s8rIrsyAQfQMYxbl8HZXUr7u09nMJ_K8jiU,9005
115
116
  tests/test_ontologies_genodexito.py,sha256=6fINyUiubHZqu7qxye09DQfJXw28ZMAJc3clPb-cCoY,2298
117
+ tests/test_ontologies_id_tables.py,sha256=CpwpbmQvTc1BaVd6jbDKHAVE2etwN0vx93nC8jpnMlE,7265
116
118
  tests/test_ontologies_mygene.py,sha256=VkdRcKIWmcG6V-2dpfvsBiOJN5dO-j0RqZNxtJRcyBU,1583
117
119
  tests/test_ontologies_renaming.py,sha256=pawp3pV1hxW8nskWc4f2YHwMUqTilEEBD2BtpcSay5Q,3839
118
120
  tests/test_pathwayannot.py,sha256=bceosccNy9tgxQei_7j7ATBSSvBSxOngJvK-mAzR_K0,3312
119
121
  tests/test_rpy2_callr.py,sha256=V4a-QH5krgYOQRgqzksMzIkGAFjBqKOAqgprxrH6bE0,2904
120
122
  tests/test_rpy2_init.py,sha256=T3gnxC1O7XNvYM2P4018ikpPPAy-kwQLm7Erj0RfA-4,5895
121
123
  tests/test_sbml.py,sha256=f25zj1NogYrmLluvBDboLameTuCiQ309433Qn3iPvhg,1483
122
- tests/test_sbml_dfs_core.py,sha256=CH5OXNSAozWTl6qBvbHfgTG0NcgdlKJ_WcG0lTYBm3k,26217
123
- tests/test_sbml_dfs_utils.py,sha256=WLR-b7VQ7VW4d9rv0DePPd9l-CG-FkpsHTa3OOvrbXQ,8630
124
+ tests/test_sbml_dfs_core.py,sha256=nnLPpZTVtCznOBohk7CX67x6sMqktJWt-sZMWQKoaDs,26521
125
+ tests/test_sbml_dfs_utils.py,sha256=gWIhzUEtQlOR9c1TiCyhlSAELmWnBSncn6vCEqH5hl0,11029
124
126
  tests/test_sbo.py,sha256=x_PENFaXYsrZIzOZu9cj_Wrej7i7SNGxgBYYvcigLs0,308
125
127
  tests/test_scverse_loading.py,sha256=bnU1lQSYYWhOAs0IIBoi4ZohqPokDQJ0n_rtkAfEyMU,29948
126
128
  tests/test_set_coverage.py,sha256=J-6m6LuOjcQa9pxRuWglSfJk4Ltm7kt_eOrn_Q-7P6Q,1604
@@ -129,8 +131,8 @@ tests/test_uncompartmentalize.py,sha256=nAk5kfAVLU9a2VWe2x2HYVcKqj-EnwmwddERIPRa
129
131
  tests/test_utils.py,sha256=qPSpV-Q9b6vmdycgaDmQqtcvzKnAVnN9j5xJ9x-T6bg,23959
130
132
  tests/utils.py,sha256=SoWQ_5roJteFGcMaOeEiQ5ucwq3Z2Fa3AAs9iXHTsJY,749
131
133
  tests/test_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
- napistu-0.4.0.dist-info/METADATA,sha256=fejMxUkQvO_EqBaZIowvumw5KRzXsjZ9gGGX0tc-YkE,3397
133
- napistu-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
134
- napistu-0.4.0.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
135
- napistu-0.4.0.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
136
- napistu-0.4.0.dist-info/RECORD,,
134
+ napistu-0.4.1.dist-info/METADATA,sha256=zl_710wCsatB3lKZAgHba-MLEOPSDOyrxs3b5FB6toA,4078
135
+ napistu-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
136
+ napistu-0.4.1.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
137
+ napistu-0.4.1.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
138
+ napistu-0.4.1.dist-info/RECORD,,
@@ -0,0 +1,198 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from unittest.mock import patch
4
+
5
+ from napistu.ontologies import id_tables
6
+ from napistu.constants import (
7
+ BQB,
8
+ IDENTIFIERS,
9
+ ONTOLOGIES,
10
+ SBML_DFS,
11
+ VALID_BQB_TERMS,
12
+ )
13
+
14
+
15
+ @pytest.fixture
16
+ def sample_id_table():
17
+ """Create a sample DataFrame for testing."""
18
+ return pd.DataFrame(
19
+ {
20
+ SBML_DFS.S_ID: ["s1", "s2", "s3", "s4"],
21
+ IDENTIFIERS.ONTOLOGY: [
22
+ ONTOLOGIES.GO,
23
+ ONTOLOGIES.KEGG,
24
+ ONTOLOGIES.REACTOME,
25
+ ONTOLOGIES.WIKIPATHWAYS,
26
+ ],
27
+ IDENTIFIERS.IDENTIFIER: ["GO:0001", "hsa00010", "R-HSA-123", "WP123"],
28
+ IDENTIFIERS.BQB: [BQB.IS, BQB.HAS_PART, BQB.IS_PART_OF, BQB.IS_VERSION_OF],
29
+ IDENTIFIERS.URL: ["foo", "bar", "baz", "qux"],
30
+ "other_col": ["a", "b", "c", "d"],
31
+ }
32
+ )
33
+
34
+
35
+ def test_sanitize_id_table_values_valid_cases(sample_id_table):
36
+ """Test all valid use cases for _sanitize_id_table_values function."""
37
+
38
+ # Test string input conversion
39
+ result = id_tables._sanitize_id_table_values(
40
+ ONTOLOGIES.GO, sample_id_table, IDENTIFIERS.ONTOLOGY
41
+ )
42
+ assert result == {ONTOLOGIES.GO}
43
+ assert isinstance(result, set)
44
+
45
+ # Test list input conversion
46
+ result = id_tables._sanitize_id_table_values(
47
+ [ONTOLOGIES.GO, ONTOLOGIES.KEGG], sample_id_table, IDENTIFIERS.ONTOLOGY
48
+ )
49
+ assert result == {ONTOLOGIES.GO, ONTOLOGIES.KEGG}
50
+ assert isinstance(result, set)
51
+
52
+ # Test set input unchanged
53
+ input_set = {ONTOLOGIES.GO, ONTOLOGIES.KEGG}
54
+ result = id_tables._sanitize_id_table_values(
55
+ input_set, sample_id_table, IDENTIFIERS.ONTOLOGY
56
+ )
57
+ assert result == input_set
58
+ assert isinstance(result, set)
59
+
60
+ # Test successful validation against valid_values
61
+ result = id_tables._sanitize_id_table_values(
62
+ BQB.IS, sample_id_table, IDENTIFIERS.BQB, set(VALID_BQB_TERMS)
63
+ )
64
+ assert result == {BQB.IS}
65
+
66
+ # Test duplicate values in input list are handled correctly
67
+ result = id_tables._sanitize_id_table_values(
68
+ [ONTOLOGIES.GO, ONTOLOGIES.GO, ONTOLOGIES.KEGG],
69
+ sample_id_table,
70
+ IDENTIFIERS.ONTOLOGY,
71
+ )
72
+ assert result == {
73
+ ONTOLOGIES.GO,
74
+ ONTOLOGIES.KEGG,
75
+ } # Duplicates removed by set conversion
76
+
77
+ # Test all values present in table
78
+ result = id_tables._sanitize_id_table_values(
79
+ [ONTOLOGIES.GO, ONTOLOGIES.KEGG, ONTOLOGIES.REACTOME],
80
+ sample_id_table,
81
+ IDENTIFIERS.ONTOLOGY,
82
+ )
83
+ assert result == {ONTOLOGIES.GO, ONTOLOGIES.KEGG, ONTOLOGIES.REACTOME}
84
+
85
+ # Test single value present in table
86
+ result = id_tables._sanitize_id_table_values(
87
+ ONTOLOGIES.WIKIPATHWAYS, sample_id_table, IDENTIFIERS.ONTOLOGY
88
+ )
89
+ assert result == {ONTOLOGIES.WIKIPATHWAYS}
90
+
91
+ # Test with different column (BQB)
92
+ result = id_tables._sanitize_id_table_values(
93
+ BQB.HAS_PART, sample_id_table, IDENTIFIERS.BQB
94
+ )
95
+ assert result == {BQB.HAS_PART}
96
+
97
+
98
+ @patch("napistu.ontologies.id_tables.logger")
99
+ def test_sanitize_id_table_values_error_cases(mock_logger, sample_id_table):
100
+ """Test error cases and edge cases for _sanitize_id_table_values function."""
101
+
102
+ # Test invalid input types raise ValueError
103
+ with pytest.raises(ValueError, match="ontology must be a string, a set, or list"):
104
+ id_tables._sanitize_id_table_values(123, sample_id_table, IDENTIFIERS.ONTOLOGY)
105
+
106
+ with pytest.raises(ValueError, match="ontology must be a string, a set, or list"):
107
+ id_tables._sanitize_id_table_values(
108
+ {"key": "value"}, sample_id_table, IDENTIFIERS.ONTOLOGY
109
+ )
110
+
111
+ # Test validation failure against valid_values
112
+ with pytest.raises(
113
+ ValueError, match="The following bqb are not valid: INVALID_BQB"
114
+ ):
115
+ id_tables._sanitize_id_table_values(
116
+ "INVALID_BQB", sample_id_table, IDENTIFIERS.BQB, set(VALID_BQB_TERMS), "bqb"
117
+ )
118
+
119
+ # Test multiple invalid values against valid_values
120
+ with pytest.raises(ValueError, match="The following bqb are not valid"):
121
+ id_tables._sanitize_id_table_values(
122
+ ["INVALID1", "INVALID2"],
123
+ sample_id_table,
124
+ IDENTIFIERS.BQB,
125
+ set(VALID_BQB_TERMS),
126
+ "bqb",
127
+ )
128
+
129
+ # Test all values missing from table raises error
130
+ missing_values = {"MISSING1", "MISSING2"}
131
+ with pytest.raises(ValueError, match="None of the requested ontology are present"):
132
+ id_tables._sanitize_id_table_values(
133
+ missing_values, sample_id_table, IDENTIFIERS.ONTOLOGY
134
+ )
135
+
136
+ # Test case-sensitive matching (lowercase 'go' should fail)
137
+ with pytest.raises(ValueError, match="None of the requested ontology are present"):
138
+ id_tables._sanitize_id_table_values(
139
+ "INVALID_ONTOLOGY", sample_id_table, IDENTIFIERS.ONTOLOGY
140
+ )
141
+
142
+ # Test custom value_type_name in error messages
143
+ with pytest.raises(ValueError, match="custom_type must be a string"):
144
+ id_tables._sanitize_id_table_values(
145
+ 123, sample_id_table, IDENTIFIERS.ONTOLOGY, value_type_name="custom_type"
146
+ )
147
+
148
+ # Test default value_type_name uses column_name
149
+ with pytest.raises(ValueError, match="test_column must be a string"):
150
+ id_tables._sanitize_id_table_values(123, sample_id_table, "test_column")
151
+
152
+ # Test empty dataframe column
153
+ empty_df = pd.DataFrame({"ontology": []})
154
+ with pytest.raises(ValueError, match="None of the requested ontology are present"):
155
+ id_tables._sanitize_id_table_values("GO", empty_df, IDENTIFIERS.ONTOLOGY)
156
+
157
+ # Test partial values missing logs warning but doesn't raise error
158
+ mixed_values = {ONTOLOGIES.GO, "MISSING"} # GO exists, MISSING doesn't
159
+ result = id_tables._sanitize_id_table_values(
160
+ mixed_values, sample_id_table, IDENTIFIERS.ONTOLOGY
161
+ )
162
+
163
+ assert result == mixed_values
164
+ mock_logger.warning.assert_called_once()
165
+ warning_call = mock_logger.warning.call_args[0][0]
166
+ assert "MISSING" in warning_call
167
+ assert "not present in the id_table" in warning_call
168
+
169
+ # Test multiple partial missing values
170
+ mock_logger.reset_mock()
171
+ mixed_values = {ONTOLOGIES.GO, ONTOLOGIES.KEGG, "MISSING1", "MISSING2"}
172
+ result = id_tables._sanitize_id_table_values(
173
+ mixed_values, sample_id_table, IDENTIFIERS.ONTOLOGY
174
+ )
175
+
176
+ assert result == mixed_values
177
+ mock_logger.warning.assert_called_once()
178
+ warning_call = mock_logger.warning.call_args[0][0]
179
+ assert "MISSING1" in warning_call and "MISSING2" in warning_call
180
+
181
+
182
+ def test_filter_id_table_basic(sample_id_table):
183
+ """Basic test for filter_id_table filtering by identifier, ontology, and bqb."""
184
+
185
+ # Use a known identifier, ontology, and bqb from the fixture
186
+ filtered = id_tables.filter_id_table(
187
+ id_table=sample_id_table,
188
+ identifiers=["GO:0001"],
189
+ ontologies=[ONTOLOGIES.GO],
190
+ bqbs=[BQB.IS],
191
+ )
192
+ # Should return a DataFrame with only the matching row
193
+ assert isinstance(filtered, pd.DataFrame)
194
+ assert len(filtered) == 1
195
+ row = filtered.iloc[0]
196
+ assert row[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.GO
197
+ assert row[IDENTIFIERS.IDENTIFIER] == "GO:0001"
198
+ assert row[IDENTIFIERS.BQB] == BQB.IS
@@ -13,10 +13,12 @@ from napistu.modify import pathwayannot
13
13
 
14
14
  from napistu import identifiers as napistu_identifiers
15
15
  from napistu.constants import (
16
- SBML_DFS,
16
+ BQB,
17
17
  BQB_DEFINING_ATTRS,
18
18
  BQB_DEFINING_ATTRS_LOOSE,
19
- BQB,
19
+ SBML_DFS,
20
+ SCHEMA_DEFS,
21
+ ONTOLOGIES,
20
22
  )
21
23
  from napistu.sbml_dfs_core import SBML_dfs
22
24
  from unittest.mock import patch
@@ -291,53 +293,62 @@ def test_read_sbml_with_invalid_ids():
291
293
 
292
294
 
293
295
  def test_get_table(sbml_dfs):
294
- assert isinstance(sbml_dfs.get_table("species"), pd.DataFrame)
295
- assert isinstance(sbml_dfs.get_table("species", {"id"}), pd.DataFrame)
296
+ assert isinstance(sbml_dfs.get_table(SBML_DFS.SPECIES), pd.DataFrame)
297
+ assert isinstance(
298
+ sbml_dfs.get_table(SBML_DFS.SPECIES, {SCHEMA_DEFS.ID}), pd.DataFrame
299
+ )
296
300
 
297
301
  # invalid table
298
302
  with pytest.raises(ValueError):
299
- sbml_dfs.get_table("foo", {"id"})
303
+ sbml_dfs.get_table("foo", {SCHEMA_DEFS.ID})
300
304
 
301
305
  # bad type
302
306
  with pytest.raises(TypeError):
303
- sbml_dfs.get_table("reaction_species", "id")
307
+ sbml_dfs.get_table(SBML_DFS.REACTION_SPECIES, SCHEMA_DEFS.ID)
304
308
 
305
309
  # reaction species don't have ids
306
310
  with pytest.raises(ValueError):
307
- sbml_dfs.get_table("reaction_species", {"id"})
311
+ sbml_dfs.get_table(SBML_DFS.REACTION_SPECIES, {SCHEMA_DEFS.ID})
308
312
 
309
313
 
310
314
  def test_search_by_name(sbml_dfs_metabolism):
311
- assert sbml_dfs_metabolism.search_by_name("atp", "species", False).shape[0] == 1
312
- assert sbml_dfs_metabolism.search_by_name("pyr", "species").shape[0] == 3
313
- assert sbml_dfs_metabolism.search_by_name("kinase", "reactions").shape[0] == 4
315
+ assert (
316
+ sbml_dfs_metabolism.search_by_name("atp", SBML_DFS.SPECIES, False).shape[0] == 1
317
+ )
318
+ assert sbml_dfs_metabolism.search_by_name("pyr", SBML_DFS.SPECIES).shape[0] == 3
319
+ assert (
320
+ sbml_dfs_metabolism.search_by_name("kinase", SBML_DFS.REACTIONS).shape[0] == 4
321
+ )
314
322
 
315
323
 
316
324
  def test_search_by_id(sbml_dfs_metabolism):
317
- identifiers_tbl = sbml_dfs_metabolism.get_identifiers("species")
325
+ identifiers_tbl = sbml_dfs_metabolism.get_identifiers(SBML_DFS.SPECIES)
318
326
  ids, species = sbml_dfs_metabolism.search_by_ids(
319
- ["P40926"], "species", identifiers_tbl
327
+ identifiers_tbl, identifiers=["P40926"]
320
328
  )
321
329
  assert ids.shape[0] == 1
322
330
  assert species.shape[0] == 1
323
331
 
324
332
  ids, species = sbml_dfs_metabolism.search_by_ids(
325
- ["57540", "30744"], "species", identifiers_tbl, {"chebi"}
333
+ identifiers_tbl,
334
+ identifiers=["57540", "30744"],
335
+ ontologies={ONTOLOGIES.CHEBI},
326
336
  )
327
337
  assert ids.shape[0] == 2
328
338
  assert species.shape[0] == 2
329
339
 
330
- ids, species = sbml_dfs_metabolism.search_by_ids(
331
- ["baz"], "species", identifiers_tbl
332
- )
333
- assert ids.shape[0] == 0
334
- assert species.shape[0] == 0
340
+ with pytest.raises(
341
+ ValueError, match="None of the requested identifiers are present"
342
+ ):
343
+ ids, species = sbml_dfs_metabolism.search_by_ids(
344
+ identifiers_tbl, identifiers=["baz"] # Non-existent identifier
345
+ )
335
346
 
336
347
 
337
348
  def test_species_status(sbml_dfs):
338
349
 
339
350
  species = sbml_dfs.species
340
- select_species = species[species["s_name"] == "OxyHbA"]
351
+ select_species = species[species[SBML_DFS.S_NAME] == "OxyHbA"]
341
352
  assert select_species.shape[0] == 1
342
353
 
343
354
  status = sbml_dfs.species_status(select_species.index[0])
@@ -264,3 +264,73 @@ def test_sbo_constants_internal_consistency():
264
264
  assert MINI_SBO_TO_NAME[term] == name
265
265
  for term, name in MINI_SBO_TO_NAME.items():
266
266
  assert MINI_SBO_FROM_NAME[name] == term
267
+
268
+
269
+ def test_infer_entity_type():
270
+ """Test entity type inference with valid keys"""
271
+ # when index matches primary key.
272
+ # Test compartments with index as primary key
273
+ df = pd.DataFrame(
274
+ {SBML_DFS.C_NAME: ["cytoplasm"], SBML_DFS.C_IDENTIFIERS: ["GO:0005737"]}
275
+ )
276
+ df.index.name = SBML_DFS.C_ID
277
+ result = sbml_dfs_utils.infer_entity_type(df)
278
+ assert result == SBML_DFS.COMPARTMENTS
279
+
280
+ # Test species with index as primary key
281
+ df = pd.DataFrame(
282
+ {SBML_DFS.S_NAME: ["glucose"], SBML_DFS.S_IDENTIFIERS: ["CHEBI:17234"]}
283
+ )
284
+ df.index.name = SBML_DFS.S_ID
285
+ result = sbml_dfs_utils.infer_entity_type(df)
286
+ assert result == SBML_DFS.SPECIES
287
+
288
+ # Test entity type inference by exact column matching.
289
+ # Test compartmentalized_species (has foreign keys)
290
+ df = pd.DataFrame(
291
+ {
292
+ SBML_DFS.SC_ID: ["glucose_c"],
293
+ SBML_DFS.S_ID: ["glucose"],
294
+ SBML_DFS.C_ID: ["cytoplasm"],
295
+ }
296
+ )
297
+ result = sbml_dfs_utils.infer_entity_type(df)
298
+ assert result == "compartmentalized_species"
299
+
300
+ # Test reaction_species (has foreign keys)
301
+ df = pd.DataFrame(
302
+ {
303
+ SBML_DFS.RSC_ID: ["rxn1_glc"],
304
+ SBML_DFS.R_ID: ["rxn1"],
305
+ SBML_DFS.SC_ID: ["glucose_c"],
306
+ }
307
+ )
308
+ result = sbml_dfs_utils.infer_entity_type(df)
309
+ assert result == SBML_DFS.REACTION_SPECIES
310
+
311
+ # Test reactions (only primary key)
312
+ df = pd.DataFrame({SBML_DFS.R_ID: ["rxn1"]})
313
+ result = sbml_dfs_utils.infer_entity_type(df)
314
+ assert result == SBML_DFS.REACTIONS
315
+
316
+
317
+ def test_infer_entity_type_errors():
318
+ """Test error cases for entity type inference."""
319
+ # Test no matching entity type
320
+ df = pd.DataFrame({"random_column": ["value"], "another_col": ["data"]})
321
+ with pytest.raises(ValueError, match="No entity type matches DataFrame"):
322
+ sbml_dfs_utils.infer_entity_type(df)
323
+
324
+ # Test partial match (missing required foreign key)
325
+ df = pd.DataFrame(
326
+ {SBML_DFS.SC_ID: ["glucose_c"], SBML_DFS.S_ID: ["glucose"]}
327
+ ) # Missing c_id
328
+ with pytest.raises(ValueError):
329
+ sbml_dfs_utils.infer_entity_type(df)
330
+
331
+ # Test extra primary keys that shouldn't be there
332
+ df = pd.DataFrame(
333
+ {SBML_DFS.R_ID: ["rxn1"], SBML_DFS.S_ID: ["glucose"]}
334
+ ) # Two primary keys
335
+ with pytest.raises(ValueError):
336
+ sbml_dfs_utils.infer_entity_type(df)