napistu-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/rpy2/rids.py ADDED
@@ -0,0 +1,697 @@
+ from __future__ import annotations
+
+ import logging
+
+ import pandas as pd
+ from napistu import consensus
+ from napistu import constants
+ from napistu import identifiers
+ from napistu import sbml_dfs_core
+ from napistu import source
+ from napistu import utils
+ from napistu.rpy2 import callr
+ from napistu.rpy2 import report_r_exceptions
+ from napistu.rpy2 import warn_if_no_rpy2
+
+ from napistu.constants import SBML_DFS
+ from napistu.constants import BQB
+ from napistu.constants import IDENTIFIERS
+ from napistu.constants import ONTOLOGIES
+ from napistu.constants import ONTOLOGY_ALIASES
+ from napistu.rpy2.constants import BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
+ from napistu.rpy2.constants import BIOC_DOGMATIC_MAPPING_ONTOLOGIES
+ from napistu.rpy2.constants import BIOC_PROTEIN_ONTOLOGIES
+ from napistu.rpy2.constants import BIOC_NAME_ONTOLOGIES
+ from napistu.rpy2.constants import BIOC_GENE_ONTOLOGIES  # noqa
+ from napistu.rpy2.constants import BIOC_NOMENCLATURE
+
+ logger = logging.getLogger(__name__)
+
+
+ @warn_if_no_rpy2
+ @report_r_exceptions
+ def expand_identifiers(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     id_type: str,
+     species: str,
+     expanded_ontologies: list[str],
+     r_paths: str | None = None,
+ ) -> pd.Series:
+     """
+     Expand Identifiers
+
+     Update a table's identifiers to include additional related ontologies.
+
+     Ontologies are pulled from the Bioconductor "org" packages. This is effective, but inelegant.
+
+     Parameters
+     ----------
+     sbml_dfs : SBML_dfs
+         A relational pathway model built around reactions interconverting compartmentalized species.
+     id_type: str
+         Identifiers to expand: species, compartments, or reactions
+     species: str
+         Organismal species name (e.g., Homo sapiens)
+     expanded_ontologies: list
+         Ontologies to add or complete
+     r_paths: str
+         Path to an R packages directory
+
+     Returns
+     -------
+     a pd.Series with the table's primary keys as the index and updated Identifiers objects as values
+     """
+
+     if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
+         raise TypeError("sbml_dfs is not an sbml_dfs_core.SBML_dfs object")
+
+     # pull out all identifiers as a pd.DataFrame
+     all_entity_identifiers = sbml_dfs.get_identifiers(id_type)
+     assert isinstance(all_entity_identifiers, pd.DataFrame)
+
+     if id_type == "species":
+         all_entity_identifiers = _check_species_identifiers_entrez_gene_ontology(
+             all_entity_identifiers
+         )
+
+         valid_expanded_ontologies = BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
+     elif id_type in ["reactions", "compartments"]:
+         raise NotImplementedError(
+             f"No converters implemented to expand {id_type} annotations"
+         )
+     else:
+         raise ValueError(f"{id_type} is an invalid id_type")
+
+     invalid_expanded_ontologies = set(expanded_ontologies).difference(
+         valid_expanded_ontologies
+     )
+
+     if len(invalid_expanded_ontologies) != 0:
+         raise NotImplementedError(
+             f"No converters implemented to expand {id_type} annotations to {', '.join(invalid_expanded_ontologies)}"
+         )
+
+     # find entries in valid_expanded_ontologies which are already present;
+     # these are the entries that will be used to expand to other ontologies
+     # or fill in ontologies with incomplete annotations
+     starting_ontologies = valid_expanded_ontologies.intersection(
+         set(all_entity_identifiers["ontology"])
+     )
+
+     if len(starting_ontologies) == 0:
+         raise ValueError(f"No ontologies with {id_type} converters are present")
+
+     required_conversion_ontologies = set(starting_ontologies).union(
+         set(expanded_ontologies)
+     )
+
+     # pull down entrez ids plus mappings to the other required ontologies
+     mapping_ontologies = required_conversion_ontologies.intersection(
+         BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
+     )
+
+     mappings_dict = create_bioconductor_mapping_tables(
+         mappings=mapping_ontologies, species=species, r_paths=r_paths
+     )
+
+     # start with entrez IDs (since all other ontologies are mapped to them in the
+     # Bioconductor "org" packages) and join each mapping table onto them
+     running_ids = merge_bioconductor_mappings(mappings_dict, mapping_ontologies)
+
+     # map from each existing (starting) ontology to each expanded ontology
+     ontology_mappings = list()
+     for start in starting_ontologies:
+         for end in expanded_ontologies:
+             if start == end:
+                 continue
+             lookup = (
+                 running_ids[[start, end]]
+                 .rename(columns={start: IDENTIFIERS.IDENTIFIER, end: "new_identifier"})
+                 .assign(ontology=start)
+                 .assign(new_ontology=end)
+             )
+             ontology_mappings.append(lookup)
+
+     ontology_mappings_df = pd.concat(ontology_mappings).dropna()
+
+     # join old identifiers with new identifiers
+
+     # first, define the names of keys and ids
+     table_pk_var = sbml_dfs.schema[id_type]["pk"]
+     table_id_var = sbml_dfs.schema[id_type]["id"]
+
+     # retain bqb terms to define how an identifier is related to the primary key;
+     # this relation will be preserved for the new ids
+     merged_identifiers = all_entity_identifiers[
+         [
+             table_pk_var,
+             IDENTIFIERS.ONTOLOGY,
+             IDENTIFIERS.IDENTIFIER,
+             IDENTIFIERS.BQB,
+         ]
+     ].merge(ontology_mappings_df)
+
+     # new, possibly redundant identifiers
+     new_identifiers = merged_identifiers[
+         [table_pk_var, "new_ontology", "new_identifier", IDENTIFIERS.BQB]
+     ].rename(
+         columns={
+             "new_ontology": IDENTIFIERS.ONTOLOGY,
+             "new_identifier": IDENTIFIERS.IDENTIFIER,
+         }
+     )
+
+     expanded_identifiers_df = (
+         pd.concat(
+             [
+                 all_entity_identifiers[
+                     [
+                         table_pk_var,
+                         IDENTIFIERS.ONTOLOGY,
+                         IDENTIFIERS.IDENTIFIER,
+                         IDENTIFIERS.URL,
+                         IDENTIFIERS.BQB,
+                     ]
+                 ],
+                 # ignore a new identifier if it already exists
+                 new_identifiers,
+             ]
+         )
+         # remove duplicated identifiers
+         .groupby([table_pk_var, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER])
+         .first()
+         .reset_index()
+         .set_index(table_pk_var)
+     )
+
+     # create a dictionary of new Identifiers objects
+     expanded_identifiers_dict = {
+         i: _expand_identifiers_new_entries(i, expanded_identifiers_df)
+         for i in expanded_identifiers_df.index.unique()
+     }
+
+     output = pd.Series(expanded_identifiers_dict).rename(table_id_var)
+     output.index.name = table_pk_var
+
+     return output
+
+
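A minimal usage sketch for expand_identifiers, assuming a loaded SBML_dfs in a placeholder variable `model` and that the ontology strings match the ONTOLOGIES constant values:

    from napistu.rpy2 import rids

    # expand a model's species annotations with additional ontologies;
    # returns a pd.Series of Identifiers objects indexed by the species primary key
    expanded_ids = rids.expand_identifiers(
        sbml_dfs=model,
        id_type="species",
        species="Homo sapiens",
        expanded_ontologies=["uniprot", "symbol"],
    )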
+ @warn_if_no_rpy2
+ @report_r_exceptions
+ def create_bioconductor_mapping_tables(
+     mappings: set[str], species: str, r_paths: str | None = None
+ ) -> dict[str, pd.DataFrame]:
+     """
+     Create Bioconductor Mapping Tables
+
+     Create a dictionary of mappings between entrez and other ontologies.
+
+     Args:
+         mappings (set):
+             A set of ontologies to work with. The valid ontologies are:
+             "ensembl_gene", "ensembl_transcript", "ensembl_protein",
+             "uniprot", "gene_name", and "symbol".
+         species (str):
+             The organismal species that we are working with (e.g., Homo sapiens).
+         r_paths (str, optional):
+             Optional path to a library of R packages.
+
+     Returns:
+         mappings_dict (dict):
+             A table of entrez ids, and tables mapping from each ontology in "mappings" to entrez.
+     """
+
+     assert isinstance(mappings, set)
+     assert isinstance(species, str)
+
+     logger.info(
+         f"Creating mapping tables from entrez genes to/from {', '.join(mappings)}"
+     )
+
+     invalid_mappings = set(mappings).difference(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)
+
+     if len(invalid_mappings) > 0:
+         raise ValueError(
+             f"{len(invalid_mappings)} mappings could not be created: {', '.join(invalid_mappings)}.\n"
+             f"The valid mappings are {', '.join(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)}"
+         )
+
+     mappings_dict = dict()
+
+     # all mappings are with respect to entrez, so we always obtain entrez ids
+     mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE] = (
+         callr.r_dataframe_to_pandas(
+             callr.bioconductor_org_r_function(
+                 BIOC_NOMENCLATURE.CHR_TBL, species, r_paths=r_paths
+             )
+         )
+         .drop(BIOC_NOMENCLATURE.CHROMOSOME, axis=1)
+         .rename(
+             columns={BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE}
+         )
+         .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+     )
+
+     if ONTOLOGIES.ENSEMBL_GENE in mappings:
+         # entrez <> ensembl genes
+         mappings_dict[ONTOLOGIES.ENSEMBL_GENE] = (
+             callr.r_dataframe_to_pandas(
+                 callr.bioconductor_org_r_function(
+                     BIOC_NOMENCLATURE.ENSG_TBL, species, r_paths=r_paths
+                 )
+             )
+             .rename(
+                 columns={
+                     BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+                     BIOC_NOMENCLATURE.ENSEMBL_GENE: ONTOLOGIES.ENSEMBL_GENE,
+                 }
+             )
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     if ONTOLOGIES.ENSEMBL_TRANSCRIPT in mappings:
+         # entrez <> ensembl transcripts
+         mappings_dict[ONTOLOGIES.ENSEMBL_TRANSCRIPT] = (
+             callr.r_dataframe_to_pandas(
+                 callr.bioconductor_org_r_function(
+                     BIOC_NOMENCLATURE.ENST_TBL, species, r_paths=r_paths
+                 )
+             )
+             .rename(
+                 columns={
+                     BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+                     BIOC_NOMENCLATURE.ENSEMBL_TRANSCRIPT: ONTOLOGIES.ENSEMBL_TRANSCRIPT,
+                 }
+             )
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     if ONTOLOGIES.ENSEMBL_PROTEIN in mappings:
+         # entrez <> ensembl proteins
+         mappings_dict[ONTOLOGIES.ENSEMBL_PROTEIN] = (
+             callr.r_dataframe_to_pandas(
+                 callr.bioconductor_org_r_function(
+                     BIOC_NOMENCLATURE.ENSP_TBL, species, r_paths=r_paths
+                 )
+             )
+             .rename(
+                 columns={
+                     BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+                     BIOC_NOMENCLATURE.ENSEMBL_PROTEIN: ONTOLOGIES.ENSEMBL_PROTEIN,
+                 }
+             )
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     if ONTOLOGIES.UNIPROT in mappings:
+         # entrez <> uniprot
+         mappings_dict[ONTOLOGIES.UNIPROT] = (
+             callr.r_dataframe_to_pandas(
+                 callr.bioconductor_org_r_function(
+                     BIOC_NOMENCLATURE.UNIPROT_TBL, species, r_paths=r_paths
+                 )
+             )
+             .rename(
+                 columns={
+                     BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+                     BIOC_NOMENCLATURE.UNIPROT: ONTOLOGIES.UNIPROT,
+                 }
+             )
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     if ONTOLOGIES.GENE_NAME in mappings:
+         # entrez <> gene name
+         mappings_dict[ONTOLOGIES.GENE_NAME] = (
+             callr.r_dataframe_to_pandas(
+                 callr.bioconductor_org_r_function(
+                     BIOC_NOMENCLATURE.NAME_TBL, species, r_paths=r_paths
+                 )
+             )
+             .rename(
+                 columns={
+                     BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+                     BIOC_NOMENCLATURE.GENE_NAME: ONTOLOGIES.GENE_NAME,
+                 }
+             )
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     if ONTOLOGIES.SYMBOL in mappings:
+         # entrez <> gene symbol
+         mappings_dict[ONTOLOGIES.SYMBOL] = (
+             callr.r_dataframe_to_pandas(
+                 callr.bioconductor_org_r_function(
+                     BIOC_NOMENCLATURE.SYMBOL_TBL, species, r_paths=r_paths
+                 )
+             )
+             .rename(
+                 columns={
+                     BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
+                     BIOC_NOMENCLATURE.SYMBOL: ONTOLOGIES.SYMBOL,
+                 }
+             )
+             .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
+         )
+
+     return mappings_dict
+
+
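A short sketch of pulling mapping tables directly, assuming "uniprot" and "symbol" are among the valid ontology strings listed in the docstring:

    from napistu.rpy2 import rids

    # one pd.DataFrame per requested ontology, each indexed by ncbi_entrez_gene
    mappings_dict = rids.create_bioconductor_mapping_tables(
        mappings={"uniprot", "symbol"},
        species="Homo sapiens",
    )
    uniprot_to_entrez = mappings_dict["uniprot"]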
+ def merge_bioconductor_mappings(
+     mappings_dict: dict, mapping_ontologies: set[str]
+ ) -> pd.DataFrame:
+     """Combine multiple ontologies by iteratively joining them on Entrez Gene."""
+
+     running_ids = mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE]
+
+     for mapping in mapping_ontologies:
+         logger.debug(f"adding entries for {mapping} to running_ids")
+         mapping_df = mappings_dict[mapping]
+
+         running_ids = running_ids.join(mapping_df)
+
+     running_ids = running_ids.reset_index()
+
+     return running_ids
+
+
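The join above is a plain index-aligned pandas join; a toy standalone illustration of the semantics, with made-up identifiers:

    import pandas as pd

    entrez = pd.DataFrame(index=pd.Index(["1", "2"], name="ncbi_entrez_gene"))
    uniprot = pd.DataFrame(
        {"uniprot": ["P11111", "P22222"]},
        index=pd.Index(["1", "2"], name="ncbi_entrez_gene"),
    )
    # left-join each ontology table onto the entrez index, then flatten
    running_ids = entrez.join(uniprot).reset_index()
    # columns: ncbi_entrez_gene, uniprot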
+ def stack_bioconductor_mappings(
+     mappings_dict: dict[str, pd.DataFrame], mapping_ontologies: set[str]
+ ) -> pd.DataFrame:
+     """
+     Stack Bioconductor Mappings
+
+     Convert a dict of mappings between entrez identifiers and other identifiers into a single long table.
+
+     Args:
+         mappings_dict (dict):
+             A dictionary containing mappings between entrez and other ontologies.
+         mapping_ontologies (set):
+             A set of mappings to combine.
+
+     Returns:
+         mappings_df (pd.DataFrame):
+             A table indexed by ncbi_entrez_gene with ontology and identifier columns.
+     """
+
+     mappings_list = list()
+     for ont in mapping_ontologies:
+         one_mapping_df = (
+             mappings_dict[ont].assign(ontology=ont).rename({ont: "identifier"}, axis=1)
+         )
+
+         mappings_list.append(one_mapping_df)
+
+     return pd.concat(mappings_list)
+
+
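A sketch of the long format it produces, using a toy mappings_dict with a single made-up entry:

    import pandas as pd
    from napistu.rpy2 import rids

    toy_mappings = {
        "uniprot": pd.DataFrame(
            {"uniprot": ["P11111"]},
            index=pd.Index(["1"], name="ncbi_entrez_gene"),
        )
    }
    stacked = rids.stack_bioconductor_mappings(toy_mappings, {"uniprot"})
    # columns: identifier, ontology; index: ncbi_entrez_gene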
+ def _check_species_identifiers_entrez_gene_ontology(
+     entity_identifiers_df: pd.DataFrame,
+ ) -> pd.DataFrame:
+     """
+     Check whether species ontologies contain aliases of ncbi_entrez_gene
+     (e.g., ncbigene or ncbi_gene); if so, replace them with ncbi_entrez_gene.
+
+     Returns: entity_identifiers_df with proper gene ontology types.
+     """
+
+     intersect_gene_onto = set(entity_identifiers_df["ontology"]).intersection(
+         ONTOLOGY_ALIASES.NCBI_ENTREZ_GENE
+     )
+
+     # if entity_identifiers_df contains members of ONTOLOGY_ALIASES.NCBI_ENTREZ_GENE,
+     # replace them with ncbi_entrez_gene
+     if intersect_gene_onto:
+         logger.info(
+             f"Replacing the alias ontologies {', '.join(intersect_gene_onto)} with {ONTOLOGIES.NCBI_ENTREZ_GENE}."
+         )
+
+         filtered_onto_df = entity_identifiers_df[
+             entity_identifiers_df["ontology"].isin(list(intersect_gene_onto))
+         ]
+
+         entity_identifiers_df.loc[filtered_onto_df.index, "ontology"] = (
+             ONTOLOGIES.NCBI_ENTREZ_GENE
+         )
+
+     return entity_identifiers_df
+
+
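A sketch of the alias normalization, assuming "ncbigene" is one of the ONTOLOGY_ALIASES.NCBI_ENTREZ_GENE members mentioned in the docstring:

    import pandas as pd
    from napistu.rpy2 import rids

    df = pd.DataFrame(
        {"ontology": ["ncbigene", "uniprot"], "identifier": ["672", "P38398"]}
    )
    # rows with an aliased ontology are rewritten to ncbi_entrez_gene in place
    df = rids._check_species_identifiers_entrez_gene_ontology(df)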
+ def update_expanded_identifiers(
+     model: sbml_dfs_core.SBML_dfs, id_type: str, expanded_ids: pd.Series
+ ) -> sbml_dfs_core.SBML_dfs:
+     """Update a model's identifiers with the output of expand_identifiers().
+
+     Args:
+         model (sbml_dfs_core.SBML_dfs): a relational pathway model
+         id_type (str): the entity table to update (e.g., species)
+         expanded_ids (pd.Series): updated Identifiers objects indexed by the table's primary key
+     """
+     ids = getattr(model, id_type)
+
+     # make sure expanded_ids and the original model table have the same number of ids;
+     # if an id exists only in the model table, add it to expanded_ids
+     if ids.shape[0] != expanded_ids.shape[0]:
+         matched_expanded_ids = expanded_ids.combine_first(ids[SBML_DFS.S_IDENTIFIERS])
+         logger.debug(
+             f"{ids.shape[0] - expanded_ids.shape[0]} "
+             "ids are not included in expanded ids"
+         )
+     else:
+         matched_expanded_ids = expanded_ids
+
+     updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
+         pd.DataFrame(matched_expanded_ids)
+     )
+
+     setattr(model, id_type, updated_ids)
+
+     return model
+
+
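The expected pairing of the two functions, as a sketch with a placeholder `model` (species is the only id_type with converters upstream):

    from napistu.rpy2 import rids

    expanded_ids = rids.expand_identifiers(
        model, "species", "Homo sapiens", ["uniprot"]
    )
    model = rids.update_expanded_identifiers(model, "species", expanded_ids)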
+ def create_dogmatic_sbml_dfs(
+     species: str, r_paths: str | None = None
+ ) -> sbml_dfs_core.SBML_dfs:
+     """
+     Create Dogmatic SBML_dfs
+
+     Create an SBML_dfs model which is pretty much just proteins and no
+     reactions, as well as annotations linking proteins to genes, and
+     creating nice labels for genes/proteins.
+
+     Args:
+         species (str):
+             An organismal species (e.g., Homo sapiens)
+         r_paths (str or None):
+             Optional path to an R packages directory
+
+     Returns:
+         dogmatic_sbml_dfs (sbml.SBML_dfs):
+             A pathway model which (pretty much) just contains proteins and
+             diverse identifiers
+     """
+
+     dogmatic_mappings = connect_dogmatic_mappings(species, r_paths=r_paths)
+
+     logger.info("Creating inputs for sbml_dfs_from_edgelist()")
+
+     # format entries for sbml_dfs_from_edgelist()
+     species_df = dogmatic_mappings["cluster_consensus_identifiers_df"].join(
+         dogmatic_mappings["s_name_series"]
+     )
+
+     # stub required but invariant variables
+     compartments_df = sbml_dfs_core._stub_compartments()
+     interaction_source = source.Source(init=True)
+
+     # interactions table; this is required to create the sbml_dfs but we'll drop the info later
+     interaction_edgelist = species_df.rename(
+         columns={
+             "s_name": "upstream_name",
+             SBML_DFS.S_IDENTIFIERS: SBML_DFS.R_IDENTIFIERS,
+         }
+     )
+     interaction_edgelist["downstream_name"] = interaction_edgelist["upstream_name"]
+     interaction_edgelist["upstream_compartment"] = "cellular_component"
+     interaction_edgelist["downstream_compartment"] = "cellular_component"
+     interaction_edgelist["r_name"] = interaction_edgelist["upstream_name"]
+     interaction_edgelist["sbo_term"] = constants.MINI_SBO_FROM_NAME["reactant"]
+     interaction_edgelist["r_isreversible"] = False
+
+     dogmatic_sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
+         interaction_edgelist=interaction_edgelist,
+         species_df=species_df,
+         compartments_df=compartments_df,
+         interaction_source=interaction_source,
+         upstream_stoichiometry=-1,
+         downstream_stoichiometry=1,
+         downstream_sbo_name="product",
+     )
+
+     # remove all reactions except one (so the model still passes sbml_dfs.validate());
+     # this self-reaction will be removed when creating the graph
+     dogmatic_sbml_dfs.remove_reactions(dogmatic_sbml_dfs.reactions.index.tolist()[1:])
+
+     return dogmatic_sbml_dfs
+
+
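A one-call sketch of building the protein-centric model (assumes R plus the relevant Bioconductor "org" package for the species are installed):

    from napistu.rpy2 import rids

    dogmatic_sbml_dfs = rids.create_dogmatic_sbml_dfs("Homo sapiens")
    dogmatic_sbml_dfs.validate()  # passes despite the single stub reaction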
+ def connect_dogmatic_mappings(species: str, r_paths: str | None = None) -> dict:
+     """
+     Connect Dogmatic Mappings
+
+     Merge all ontologies into greedy clusters based on shared associations to entrez ids.
+
+     Args:
+         species (str):
+             An organismal species (e.g., Homo sapiens)
+         r_paths (str or None):
+             Optional path to an R packages directory
+
+     Returns:
+         dict with:
+         - s_name_series: a series where the index is distinct molecular species and the values are names.
+         - cluster_consensus_identifiers_df: a pd.DataFrame where the index is distinct molecular species
+           and values are Identifiers objects.
+     """
+
+     mappings_dict = create_bioconductor_mapping_tables(
+         mappings=BIOC_DOGMATIC_MAPPING_ONTOLOGIES,
+         species=species,
+         r_paths=r_paths,
+     )
+
+     protein_mappings = stack_bioconductor_mappings(
+         mappings_dict, set(BIOC_PROTEIN_ONTOLOGIES)
+     )
+
+     # apply greedy graph-based clustering to connect proteins with a common mapping to entrez
+     edgelist_df = utils.format_identifiers_as_edgelist(
+         protein_mappings, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+     )
+     connected_indices = utils.find_weakly_connected_subgraphs(
+         edgelist_df[["ind", "id"]]
+     )
+
+     # add clusters to proteins; each cluster will be a distinct molecular species
+     protein_mappings_w_clusters = protein_mappings.reset_index().merge(
+         connected_indices
+     )
+
+     # combine entrez + cluster so we can pass clusters to non-protein attributes
+     entrez_clusters = protein_mappings_w_clusters[
+         [ONTOLOGIES.NCBI_ENTREZ_GENE, "cluster"]
+     ].drop_duplicates()
+     other_ontologies = BIOC_DOGMATIC_MAPPING_ONTOLOGIES.difference(
+         set(BIOC_PROTEIN_ONTOLOGIES)
+     )
+     other_mappings = stack_bioconductor_mappings(mappings_dict, other_ontologies)
+     other_mappings_w_clusters = entrez_clusters.merge(
+         other_mappings, left_on=ONTOLOGIES.NCBI_ENTREZ_GENE, right_index=True
+     )
+
+     possible_names = pd.concat(
+         [
+             protein_mappings_w_clusters.query(
+                 "ontology in @BIOC_NAME_ONTOLOGIES.keys()"
+             ),
+             other_mappings_w_clusters.query("ontology in @BIOC_NAME_ONTOLOGIES.keys()"),
+         ]
+     )[["cluster", IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]]
+
+     possible_names.loc[:, "ontology_preference"] = possible_names[
+         IDENTIFIERS.ONTOLOGY
+     ].map(BIOC_NAME_ONTOLOGIES)
+
+     # remove possible names which are present in multiple clusters;
+     # all clusters need unique names to use sbml_dfs_from_edgelist()
+     id_counts = (
+         possible_names[["cluster", IDENTIFIERS.IDENTIFIER]]
+         .drop_duplicates()
+         .value_counts(IDENTIFIERS.IDENTIFIER)
+     )
+     possible_names = possible_names[
+         ~possible_names[IDENTIFIERS.IDENTIFIER].isin(
+             id_counts[id_counts > 1].index.tolist()
+         )
+     ]
+
+     s_name_series = (
+         consensus._add_nameness_score(possible_names, IDENTIFIERS.IDENTIFIER)
+         .sort_values(["ontology_preference", "nameness_score"])
+         .groupby("cluster")
+         .first()
+         .rename(columns={IDENTIFIERS.IDENTIFIER: SBML_DFS.S_NAME})[SBML_DFS.S_NAME]
+     )
+
+     protein_ids = protein_mappings_w_clusters.assign(bqb=BQB.IS)[
+         ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
+     ]
+     gene_ids = other_mappings_w_clusters.query(
+         "ontology in @BIOC_GENE_ONTOLOGIES"
+     ).assign(bqb=BQB.IS_ENCODED_BY)[
+         ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
+     ]
+     entrez_ids = entrez_clusters.assign(
+         ontology=ONTOLOGIES.NCBI_ENTREZ_GENE, bqb=BQB.IS_ENCODED_BY
+     ).rename(columns={ONTOLOGIES.NCBI_ENTREZ_GENE: IDENTIFIERS.IDENTIFIER})[
+         ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
+     ]
+
+     # combine all ids to set up a single cluster-level Identifiers object
+     all_ids = pd.concat([protein_ids, gene_ids, entrez_ids])
+     all_ids.loc[:, IDENTIFIERS.URL] = [
+         identifiers.create_uri_url(x, y)
+         for x, y in zip(all_ids[IDENTIFIERS.ONTOLOGY], all_ids[IDENTIFIERS.IDENTIFIER])
+     ]
+
+     # create one Identifiers object for each new species
+     cluster_consensus_identifiers = {
+         k: identifiers.Identifiers(
+             list(
+                 v[
+                     [
+                         IDENTIFIERS.ONTOLOGY,
+                         IDENTIFIERS.IDENTIFIER,
+                         IDENTIFIERS.URL,
+                         IDENTIFIERS.BQB,
+                     ]
+                 ]
+                 .reset_index(drop=True)
+                 .T.to_dict()
+                 .values()
+             )
+         )
+         for k, v in all_ids.groupby("cluster")
+     }
+
+     cluster_consensus_identifiers_df = pd.DataFrame(
+         cluster_consensus_identifiers, index=[SBML_DFS.S_IDENTIFIERS]
+     ).T
+     cluster_consensus_identifiers_df.index.name = "cluster"
+
+     out_dict = {
+         "s_name_series": s_name_series,
+         "cluster_consensus_identifiers_df": cluster_consensus_identifiers_df,
+     }
+
+     return out_dict
+
+
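The clustering step above delegates to utils.find_weakly_connected_subgraphs; for intuition only, a standalone union-find sketch of the same idea (proteins sharing any entrez gene collapse into one cluster), not the napistu internals:

    # toy re-implementation over (protein_id, entrez_id) edges
    def cluster_by_shared_entrez(edges: list[tuple[str, str]]) -> dict[str, int]:
        parent: dict[str, str] = {}

        def find(x: str) -> str:
            parent.setdefault(x, x)
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving
                x = parent[x]
            return x

        def union(a: str, b: str) -> None:
            parent[find(a)] = find(b)

        for protein, entrez in edges:
            union(protein, "entrez:" + entrez)

        # number clusters in first-seen order
        roots: dict[str, int] = {}
        return {
            protein: roots.setdefault(find(protein), len(roots))
            for protein, _ in edges
        }

    cluster_by_shared_entrez([("P1", "1"), ("P2", "1"), ("P3", "2")])
    # -> {"P1": 0, "P2": 0, "P3": 1}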
+ @warn_if_no_rpy2
+ def _expand_identifiers_new_entries(
+     sysid: str, expanded_identifiers_df: pd.DataFrame
+ ) -> identifiers.Identifiers:
+     """Expand Identifiers to include Bioconductor annotations."""
+     entry = expanded_identifiers_df.loc[sysid]
+
+     if isinstance(entry, pd.Series):
+         # a single annotation
+         sysid_id_list = [entry.to_dict()]
+     else:
+         # multiple annotations
+         sysid_id_list = list(entry.reset_index(drop=True).T.to_dict().values())
+
+     return identifiers.Identifiers(sysid_id_list)