napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,1381 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ import logging
5
+ import os
6
+ import re
7
+
8
+ from fs import open_fs
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from napistu import identifiers
13
+ from napistu import sbml_dfs_core
14
+ from napistu import sbml_dfs_utils
15
+ from napistu import source
16
+ from napistu import utils
17
+
18
+ from napistu.constants import SBML_DFS
19
+ from napistu.constants import BQB
20
+ from napistu.constants import IDENTIFIERS
21
+ from napistu.constants import SBOTERM_NAMES
22
+ from napistu.constants import MINI_SBO_FROM_NAME
23
+ from napistu.constants import ONTOLOGIES
24
+ from napistu.constants import ENSEMBL_PREFIX_TO_ONTOLOGY
25
+ from napistu.modify.constants import COFACTOR_SCHEMA
26
+ from napistu.modify.constants import COFACTOR_CHEBI_IDS
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
def identify_cofactors(sbml_dfs: sbml_dfs_core.SBML_dfs) -> pd.Series:
    """
    Identify Cofactors

    Find cofactors which are playing a supporting role in a reaction (e.g., ATP -> ADP or water).

    Parameters:
    ----------
    sbml_dfs: SBML_dfs
        A pathway model

    Returns:
    ----------
    pd.Series with index of rsc_ids and values containing the reason why a reaction species is a cofactor

    Raises:
    ----------
    TypeError: if sbml_dfs is not an sbml_dfs_core.SBML_dfs
    ValueError: if no species in the model carry ChEBI identifiers

    """

    # load definitions of cofactors and their systematic IDs
    cofactor_ids_list = COFACTOR_CHEBI_IDS[ONTOLOGIES.CHEBI].tolist()

    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs was type {type(sbml_dfs)} and must be an sbml_dfs_core.SBML_dfs"
        )

    # find sbml_dfs species matching possible cofactors
    species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    # filter to small molecules ignoring cases where a small molecule is just a part of the species
    # (i.e., keep only BQB "is" annotations in the ChEBI ontology)
    species_identifiers = species_identifiers[
        [
            o == ONTOLOGIES.CHEBI and b == BQB.IS
            for o, b in zip(
                species_identifiers[IDENTIFIERS.ONTOLOGY],
                species_identifiers[IDENTIFIERS.BQB],
            )
        ]
    ]

    # rename the identifier column so downstream joins can use the ChEBI key directly
    species_identifiers = species_identifiers.rename(
        columns={IDENTIFIERS.IDENTIFIER: ONTOLOGIES.CHEBI}
    )

    if species_identifiers.shape[0] == 0:
        raise ValueError("No species had ChEBI IDs, cofactors can not be filtered")

    # ChEBI IDs arrive as strings; cast to int to match COFACTOR_CHEBI_IDS
    species_identifiers[ONTOLOGIES.CHEBI] = species_identifiers[
        ONTOLOGIES.CHEBI
    ].astype(int)
    species_identifiers = species_identifiers[
        species_identifiers[ONTOLOGIES.CHEBI].isin(cofactor_ids_list)
    ]

    logger.info(
        f"There were {species_identifiers.shape[0]} cofactor species: "
        f"{', '.join(species_identifiers[SBML_DFS.S_NAME].tolist())}"
    )

    # report cofactors that were not found

    cofactors_missed = COFACTOR_CHEBI_IDS[
        ~COFACTOR_CHEBI_IDS[ONTOLOGIES.CHEBI].isin(
            species_identifiers[ONTOLOGIES.CHEBI].tolist()
        )
    ]["cofactor"].tolist()
    if len(cofactors_missed) != 0:
        logger.warning(
            f"{len(cofactors_missed)} of {len(cofactor_ids_list)} "
            "cofactors were not located in the pathway model: "
            f"{', '.join(cofactors_missed)}"
        )

    # join species to cofactor schema using labels

    cofactor_species = (
        species_identifiers.reset_index()
        .merge(COFACTOR_CHEBI_IDS)
        .set_index(SBML_DFS.S_ID)
    )
    # propagate cofactor labels from species to compartmentalized species
    cofactor_cspecies = sbml_dfs.compartmentalized_species.merge(
        cofactor_species["cofactor"], left_on=SBML_DFS.S_ID, right_index=True
    )
    # filter reaction species to cofactor species
    cofactor_rscspecies = sbml_dfs.reaction_species.merge(
        cofactor_cspecies["cofactor"], left_on=SBML_DFS.SC_ID, right_index=True
    )
    # drop entries which arent produced or consumed
    cofactor_rscspecies = cofactor_rscspecies[
        cofactor_rscspecies[SBML_DFS.STOICHIOMETRY] != 0
    ]

    logger.info(
        f"Cofactor species are present {cofactor_rscspecies.shape[0]} times in reactions"
    )

    # loop through reactions with cofactors at test

    reactions = set(cofactor_rscspecies[SBML_DFS.R_ID].tolist())

    logger.info(
        f"{len(reactions)} of {sbml_dfs.reactions.shape[0]} reactions include cofactor species"
    )

    # apply every filter in COFACTOR_SCHEMA to each reaction; a filter that
    # triggers returns the rsc_ids to drop labeled with the filter's name
    filtered_rscs = list()
    for rxn in reactions:
        one_rxns_species = cofactor_rscspecies[
            cofactor_rscspecies[SBML_DFS.R_ID] == rxn
        ]

        for filter_type, cofactor_filter in COFACTOR_SCHEMA.items():
            dropped_species = filter_one_reactions_cofactors(
                one_rxns_species, filter_type, cofactor_filter
            )
            if dropped_species is not None:
                filtered_rscs.append(dropped_species)

    return pd.concat(filtered_rscs)
147
+
148
+
149
def filter_one_reactions_cofactors(
    one_rxns_species: pd.DataFrame, filter_type: str, cofactor_filter: dict
) -> pd.Series | None:
    """
    Filter One Reaction's Cofactors

    Apply a cofactor filter to one reaction's species.

    Parameters:
    ----------
    one_rxns_species (pd.DataFrame):
        Rows of reactions species containing cofactors; must contain
        "cofactor" and "stoichiometry" columns.
    filter_type: str
        Reason to filter species with this filter
    cofactor_filter: dict
        Species included in filter. Recognized keys:
        - "if_all": every listed label must be present for the filter to
          trigger; matching species are the ones dropped.
        - "except_any" (optional): if any listed label is present the
          filter is skipped.
        - "as_substrate" (optional): at least one listed label must appear
          as a substrate (stoichiometry < 0) for the filter to trigger.

    Returns:
    ----------
    pd.Series with index of rsc_ids and values containing the reason why a
    reaction species is a cofactor, or None if filter was not triggered.

    """

    # labels of the cofactors participating in this reaction
    rsc_labels_set = set(one_rxns_species["cofactor"].tolist())

    # see if all cofactor species are present
    missing_reqs = set(cofactor_filter["if_all"]).difference(rsc_labels_set)
    if len(missing_reqs) != 0:
        return None

    # ignore cases involving "except_any" species
    if "except_any" in cofactor_filter.keys():
        detected_exceptions = set(cofactor_filter["except_any"]).intersection(
            rsc_labels_set
        )
        if len(detected_exceptions) != 0:
            return None

    # consider a reaction only if "as_substrate" is a substrate
    if "as_substrate" in cofactor_filter.keys():
        substrates_set = set(
            one_rxns_species[one_rxns_species["stoichiometry"] < 0]["cofactor"].tolist()
        )
        substrates_detected = set(cofactor_filter["as_substrate"]).intersection(
            substrates_set
        )

        if len(substrates_detected) == 0:
            return None

    # save the dropped species and filter type (reason for filtering) to a dict
    dropped_species = one_rxns_species[
        one_rxns_species["cofactor"].isin(cofactor_filter["if_all"])
    ]

    return dropped_species.assign(filter_reason=filter_type)["filter_reason"]
205
+
206
+
207
def drop_cofactors(sbml_dfs: sbml_dfs_core.SBML_dfs) -> sbml_dfs_core.SBML_dfs:
    """
    Drop Cofactors

    Remove reaction species entries that act as cofactors (e.g., ATP/ADP
    pairs or water) as determined by identify_cofactors.

    Parameters:
    ----------
    sbml_dfs: SBML_dfs
        A pathway model

    Returns:
    ----------
    sbml_dfs (SBML_dfs):
        A pathway model with some reaction species filtered
    """

    # rsc_id -> reason-for-filtering lookup
    cofactor_calls = identify_cofactors(sbml_dfs)

    logger.info(
        f"{cofactor_calls.shape[0]} of {sbml_dfs.reaction_species.shape[0]}"
        f" reaction species will be filtered as cofactors"
    )

    # summarize how often each filter reason fired
    logger.info(utils.style_df(cofactor_calls.value_counts().to_frame()))

    # shallow-copy the model so the caller's object is left untouched
    updated_sbml_dfs = copy.copy(sbml_dfs)
    retained_mask = ~updated_sbml_dfs.reaction_species.index.isin(
        cofactor_calls.index.tolist()
    )
    updated_sbml_dfs.reaction_species = updated_sbml_dfs.reaction_species[retained_mask]

    return updated_sbml_dfs
240
+
241
+
242
def add_complex_formation_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Add Complex Formation - Species

    Define all species in complexes and format newly created species.

    Complex components are read from BQB_HAS_PART annotations; components
    that do not already exist as standalone species (matched by BQB_IS
    annotations) are assigned fresh s_ids.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network

    Returns
    -------
    merged_membership: pd.DataFrame
        A table of complexes and their component members
    new_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.species
    complex_component_species_ids: pd.DataFrame
        All complex components
    """

    # define all species
    species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    species_defining_attributes = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.IS]
    complex_membership = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]

    # find the species corresponding to complex components (if they exist)
    merged_membership = complex_membership.merge(
        species_defining_attributes[
            [
                SBML_DFS.S_ID,
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.URL,
            ]
        ].rename({SBML_DFS.S_ID: "component_s_id"}, axis=1),
        how="left",
    )

    # define unique component species
    complex_component_species = merged_membership[
        [
            "component_s_id",
            IDENTIFIERS.ONTOLOGY,
            IDENTIFIERS.IDENTIFIER,
            IDENTIFIERS.URL,
        ]
    ].drop_duplicates()

    # turn unlisted identifiers back into identifier format
    complex_component_species[SBML_DFS.S_IDENTIFIERS] = [
        identifiers.Identifiers(
            [
                {
                    IDENTIFIERS.ONTOLOGY: complex_component_species[
                        IDENTIFIERS.ONTOLOGY
                    ].iloc[i],
                    IDENTIFIERS.IDENTIFIER: complex_component_species[
                        IDENTIFIERS.IDENTIFIER
                    ].iloc[i],
                    IDENTIFIERS.URL: complex_component_species[IDENTIFIERS.URL].iloc[i],
                    IDENTIFIERS.BQB: BQB.IS,
                }
            ]
        )
        for i in range(0, complex_component_species.shape[0])
    ]

    # create an identifier -> source lookup by collapsing all sources with the same defining id
    indexed_members = merged_membership.set_index(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL]
    ).sort_index()
    collapsed_sources = [
        source.merge_sources(indexed_members.loc[ind][SBML_DFS.S_SOURCE].tolist())
        for ind in indexed_members.index.unique()
    ]
    collapsed_sources = pd.Series(
        collapsed_sources, index=indexed_members.index.unique(), name=SBML_DFS.S_SOURCE
    )

    # add sources to unique complex components
    complex_component_species = complex_component_species.merge(
        collapsed_sources,
        left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL],
        right_index=True,
    )

    # define the maximum current id so that we can make new ids without collisions
    max_existing_sid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.species.index.tolist())
    )
    # if s_ids used an alternative convention then they'll be nans here; which is fine
    if max_existing_sid is np.nan:
        max_existing_sid = int(-1)

    # FIX: take an explicit copy of the filtered frame before assigning a new
    # column; assigning into a boolean-filter view raises SettingWithCopyWarning
    # and can silently fail under pandas copy-on-write (the sibling helper
    # _add_entity_sets_species already uses this pattern)
    new_species = complex_component_species[
        complex_component_species["component_s_id"].isna()
    ].copy()
    new_species["component_s_id"] = sbml_dfs_utils.id_formatter(
        range(max_existing_sid + 1, max_existing_sid + new_species.shape[0] + 1),
        SBML_DFS.S_ID,
    )

    # format new species and add to sbml_dfs.species
    new_species_for_sbml_dfs = (
        new_species.rename(
            {"component_s_id": SBML_DFS.S_ID, "identifier": SBML_DFS.S_NAME}, axis=1
        )[[SBML_DFS.S_ID, SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]]
        # use the shared constant rather than a hard-coded "s_id" literal
        .set_index(SBML_DFS.S_ID)
        .sort_index()
    )

    # prepend zzauto so the string comes late alphanumerically. this way a properly named species will
    # be preferred when merging species by identifiers
    new_species_for_sbml_dfs[SBML_DFS.S_NAME] = (
        "zzauto " + new_species_for_sbml_dfs[SBML_DFS.S_NAME]
    )

    # combine existing and newly defined complex components
    complex_component_species_ids = pd.concat(
        [
            complex_component_species[
                ~complex_component_species["component_s_id"].isna()
            ],
            new_species,
        ]
    )

    return merged_membership, new_species_for_sbml_dfs, complex_component_species_ids
373
+
374
+
375
def add_complex_formation(sbml_dfs: sbml_dfs_core.SBML_dfs):
    """
    Add Complex Formation

    Add explicit complex-formation reactions based on Reactome-style
    complex annotations, where components are recorded as attributes of
    the complex itself.

    Reactome encodes complex membership with BQB_HAS_PART annotations
    captured in identifiers.Identifiers objects. That is enough to say
    which molecules belong to a complex, but it carries no stoichiometry,
    and components are referenced by external identifiers (URIs) rather
    than internal s_ids/sc_ids.

    Not yet implemented.
    """

    raise NotImplementedError(
        "TO DO - Need to look closer to see if the unformed complexes really need a formation reaction"
    )
394
+
395
+
396
+ """ # define species present in complexes
397
+ (
398
+ merged_membership,
399
+ new_species_for_sbml_dfs,
400
+ complex_component_species_ids,
401
+ ) = add_complex_formation_species(sbml_dfs)
402
+
403
+ # define compartmentalized species present in complexes
404
+ (
405
+ new_compartmentalized_species_for_sbml_dfs,
406
+ updated_compartmentalized_membership,
407
+ ) = _add_complex_formation_compartmentalized_species(
408
+ sbml_dfs,
409
+ merged_membership,
410
+ new_species_for_sbml_dfs,
411
+ complex_component_species_ids.drop("s_Source", axis=1),
412
+ )
413
+
414
+ # remove complex formation for reactions which already have clear formation reactions
415
+ # to flag these complexes look for cases where the membership of the substrates
416
+ # and products (including complex membership) are the same
417
+
418
+ reaction_species_expanded_complexes = sbml_dfs.reaction_species.merge(
419
+ updated_compartmentalized_membership[["sc_id", "component_sc_id"]], how="left"
420
+ )
421
+
422
+ # if a species is not a complex then it is its own component
423
+ reaction_species_expanded_complexes["component_sc_id"] = [
424
+ x if z else y
425
+ for x, y, z in zip(
426
+ reaction_species_expanded_complexes["sc_id"],
427
+ reaction_species_expanded_complexes["component_sc_id"],
428
+ reaction_species_expanded_complexes["component_sc_id"].isna(),
429
+ )
430
+ ]
431
+
432
+ # check for equal membership of substrates and products
433
+ reaction_species_expanded_complexes = reaction_species_expanded_complexes.set_index(
434
+ "r_id"
435
+ )
436
+
437
+ complex_formation_reactions = list()
438
+ for rxn in reaction_species_expanded_complexes.index.unique():
439
+ rxn_species = reaction_species_expanded_complexes.loc[rxn]
440
+ substrates = set(
441
+ rxn_species[rxn_species["stoichiometry"] < 0]["component_sc_id"].tolist()
442
+ )
443
+ products = set(
444
+ rxn_species[rxn_species["stoichiometry"] > 0]["component_sc_id"].tolist()
445
+ )
446
+
447
+ if substrates == products:
448
+ complex_formation_reactions.append(rxn)
449
+
450
+ # find complexes which are products of complex formation reactions
451
+
452
+ compartmentalized_complexes = updated_compartmentalized_membership["sc_id"].unique()
453
+
454
+ # is a complex formation reaction
455
+ formed_complexes = sbml_dfs.reaction_species[
456
+ sbml_dfs.reaction_species["r_id"].isin(complex_formation_reactions)
457
+ ]
458
+ # is a complex
459
+ formed_complexes = formed_complexes[
460
+ formed_complexes["sc_id"].isin(compartmentalized_complexes)
461
+ ]
462
+ # complex is product
463
+ formed_complexes = formed_complexes[formed_complexes["stoichiometry"] > 0]
464
+
465
+ formed_complexes = formed_complexes["sc_id"].unique()
466
+ _ = set(compartmentalized_complexes).difference(set(formed_complexes))
467
+
468
+ # add formation and dissolution reactions for all complexes without explicit formation reactions
469
+ """
470
+
471
+
472
def add_entity_sets(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    neo4j_members: str,
) -> sbml_dfs_core.SBML_dfs:
    """
    Add Entity Sets

    Expand Reactome "entity sets" — pools of interchangeable molecules
    such as the alternative ligands of a receptor — by adding each member
    as a species and linking it to its set with an "is a" style reaction.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    neo4j_members: str
        Path to a table containing Reactome entity sets and corresponding members.
        This is currently extracted manually with Neo4j.

    Returns
    -------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        An updated database which includes entity set species and formation reactions

    """

    # load the Reactome set -> member lookup table
    entity_set_members = _read_neo4j_members(neo4j_members)

    # determine which member species already exist and which must be created
    membership, new_species, set_component_ids = _add_entity_sets_species(
        sbml_dfs, entity_set_members
    )

    # place members in compartments alongside their parent sets
    (
        new_cspecies,
        compartmentalized_membership,
    ) = _add_complex_formation_compartmentalized_species(
        sbml_dfs,
        membership,
        new_species,
        set_component_ids,
    )

    # create one "IS A" reaction per member-set pair
    new_reactions, new_rxn_species = _add_entity_sets_reactions(
        sbml_dfs,
        new_cspecies,
        compartmentalized_membership,
    )

    # extend a shallow copy of the model so the caller's object is untouched
    updated_sbml_dfs = copy.copy(sbml_dfs)

    updated_sbml_dfs.species = pd.concat([updated_sbml_dfs.species, new_species])
    updated_sbml_dfs.compartmentalized_species = pd.concat(
        [updated_sbml_dfs.compartmentalized_species, new_cspecies]
    )
    updated_sbml_dfs.reactions = pd.concat(
        [updated_sbml_dfs.reactions, new_reactions]
    )
    updated_sbml_dfs.reaction_species = pd.concat(
        [updated_sbml_dfs.reaction_species, new_rxn_species]
    )

    return updated_sbml_dfs
547
+
548
+
549
def add_reactome_identifiers(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    crossref_path: str,
) -> sbml_dfs_core.SBML_dfs:
    """
    Add Reactome Identifiers

    Add reactome-specific identifiers to existing species

    Params
    ------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model
    crossref_path:
        Path to the cross ref file extracted from Reactome's Neo4j database

    Returns
    -------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model with updated species' identifiers

    Raises
    ------
    ValueError: if merging cross-references introduced s_ids not present in
        the original species table, or changed the species-table row count

    """

    select_reactome_ids = _format_reactome_crossref_ids(crossref_path)

    # read all current identifiers
    current_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    # filter annotations of homologues and literature references
    # (keep only BQB "is" and "has part" annotations)
    current_molecular_ids = (
        current_ids[current_ids[IDENTIFIERS.BQB].isin([BQB.IS, BQB.HAS_PART])]
        .set_index([SBML_DFS.S_ID, IDENTIFIERS.BQB])
        .sort_index()
        .copy()
    )

    # combine existing s_ids with additional cross-ref annotations using uniprot ids
    merged_crossrefs = _merge_reactome_crossref_ids(
        current_molecular_ids, select_reactome_ids
    )

    # create identifiers objects for each s_id
    # stack existing and cross-ref identifier rows, dropping exact duplicates
    combined_ids = (
        pd.concat(
            [
                current_ids[
                    [
                        SBML_DFS.S_ID,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
                merged_crossrefs[
                    [
                        SBML_DFS.S_ID,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
            ]
        )
        .reset_index(drop=True)
        .drop_duplicates()
    )

    # one Identifiers object per s_id, built from that s_id's identifier rows
    updated_identifiers = {
        k: identifiers.Identifiers(
            list(
                v[
                    [
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ]
                .T.to_dict()
                .values()
            )
        )
        for k, v in combined_ids.groupby(SBML_DFS.S_ID)
    }
    updated_identifiers = pd.Series(
        updated_identifiers, index=updated_identifiers.keys()
    )
    updated_identifiers.index.name = SBML_DFS.S_ID
    updated_identifiers.name = "new_Identifiers"

    # add new identifiers to species table
    # outer merge + indicator so we can detect unexpected s_ids below
    updated_species = sbml_dfs.species.merge(
        updated_identifiers,
        left_index=True,
        right_index=True,
        how="outer",
        indicator=True,
    )

    # "right_only" rows would mean the crossrefs referenced unknown s_ids
    if updated_species[updated_species["_merge"] == "right_only"].shape[0] > 0:
        raise ValueError("Reactome crossrefs added new sids; this shouldn't occur")

    updated_species = pd.concat(
        [
            updated_species[updated_species["_merge"] == "both"]
            .drop([SBML_DFS.S_IDENTIFIERS, "_merge"], axis=1)
            .rename({"new_Identifiers": SBML_DFS.S_IDENTIFIERS}, axis=1),
            # retain original Identifiers if there is not new_Identifiers object
            # (this would occur if there were not identifiers)
            updated_species[updated_species["_merge"] == "left_only"].drop(
                ["new_Identifiers", "_merge"], axis=1
            ),
        ]
    )

    # sanity check: merging must not change the number of species
    n_species_diff = updated_species.shape[0] - sbml_dfs.species.shape[0]
    if n_species_diff != 0:
        raise ValueError(
            f"There are {n_species_diff} more species in the updated "
            "species table than the original one; this is unexpected behavior"
        )

    # create a copy to return a new object rather than update the provided one
    sbml_dfs_working = copy.copy(sbml_dfs)
    sbml_dfs_working.species = updated_species
    return sbml_dfs_working
676
+
677
+
678
def _add_entity_sets_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reactome_members: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Add Entity Sets - Species

    Define all species which are part of "entity sets" in the pathway

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    reactome_members: pd.DataFrame
        A table of all Reactome entity sets members - obtained using a Neo4j query

    Returns
    -------
    merged_membership: pd.DataFrame
        A table of complexes and their component members
    new_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.species
    set_component_species_ids: pd.DataFrame
        All set components
    """

    # Reactome identifiers with a BQB "is" annotation define the sets themselves
    species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    reactome_ids = species_ids[
        species_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.REACTOME
    ].copy()
    reactome_ids = reactome_ids[reactome_ids[IDENTIFIERS.BQB] == BQB.IS]

    # compare Reactome ids in sbml_dfs and reactome_members to make sure
    # they are for the same species
    identifiers.check_reactome_identifier_compatibility(
        reactome_members["member_id"], reactome_ids[IDENTIFIERS.IDENTIFIER]
    )

    # merge each species' entity sets to define entities which must exist in this pathway
    merged_membership = (
        reactome_ids[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER, SBML_DFS.S_SOURCE]]
        .rename({IDENTIFIERS.IDENTIFIER: "set_id"}, axis=1)
        .merge(reactome_members)
    )

    # define unique component species
    set_component_species = merged_membership[
        [
            "member_id",
            IDENTIFIERS.ONTOLOGY,
            IDENTIFIERS.IDENTIFIER,
            IDENTIFIERS.URL,
            "member_s_name",
        ]
    ].drop_duplicates()

    # DataFrame keyed by external identifier + name; the "member_id" column
    # is still available for lookup inside the comprehension below
    distinct_members = set_component_species.set_index(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL, "member_s_name"]
    ).sort_index()

    # since reactome IDs are compartmentalized, use external IDs only
    # to determine distinct species, but then add reactome IDs as well
    # NOTE: the right-hand side is evaluated before rebinding, so
    # `distinct_members.loc[...]` inside the comprehension still refers to
    # the DataFrame defined above; afterwards `distinct_members` is a Series
    # of Identifiers objects
    distinct_members = pd.Series(
        [
            identifiers.Identifiers(
                [
                    {
                        IDENTIFIERS.ONTOLOGY: ind[0],
                        IDENTIFIERS.IDENTIFIER: str(ind[1]),
                        IDENTIFIERS.URL: ind[2],
                        IDENTIFIERS.BQB: BQB.IS,
                    }
                ]
                + [
                    {
                        IDENTIFIERS.ONTOLOGY: ONTOLOGIES.REACTOME,
                        IDENTIFIERS.IDENTIFIER: x,
                        IDENTIFIERS.URL: "",
                        IDENTIFIERS.BQB: BQB.IS,
                    }
                    for x in utils.safe_series_tolist(
                        distinct_members.loc[ind, "member_id"]
                    )
                ]
            )
            for ind in distinct_members.index.unique()
        ],
        index=distinct_members.index.unique(),
        name=SBML_DFS.S_IDENTIFIERS,
    )

    utils.check_unique_index(distinct_members, "distinct_members")

    # combine identical species' sources
    indexed_members = merged_membership.set_index(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER, IDENTIFIERS.URL]
    ).sort_index()

    collapsed_sources = [
        source._safe_source_merge(indexed_members.loc[ind][SBML_DFS.S_SOURCE])
        for ind in indexed_members.index.unique()
    ]
    collapsed_sources = pd.Series(
        collapsed_sources, index=indexed_members.index.unique(), name=SBML_DFS.S_SOURCE
    )

    # add sources to unique set components
    distinct_members = distinct_members.to_frame().join(collapsed_sources.to_frame())

    utils.check_unique_index(distinct_members, "distinct_members (with sources)")

    # define set members which already exist as species versus those that must be added
    set_component_species["is_already_included"] = set_component_species[
        "member_id"
    ].isin(reactome_ids[IDENTIFIERS.IDENTIFIER])

    # define the maximum current id so that we can make new ids without collisions
    max_existing_sid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.species.index.tolist())
    )
    # if s_ids used an alternative convention then they'll be nans here; which is fine
    if max_existing_sid is np.nan:
        max_existing_sid = int(-1)

    # members not already present get freshly generated s_ids
    new_species = set_component_species[
        ~set_component_species["is_already_included"]
    ].copy()
    new_species["component_s_id"] = sbml_dfs_utils.id_formatter(
        range(max_existing_sid + 1, max_existing_sid + new_species.shape[0] + 1),
        SBML_DFS.S_ID,
    )

    # define new unique species
    new_species_for_sbml_dfs = (
        new_species.merge(
            distinct_members,
            left_on=[
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.URL,
                "member_s_name",
            ],
            right_index=True,
        )[
            [
                "component_s_id",
                "member_s_name",
                SBML_DFS.S_IDENTIFIERS,
                SBML_DFS.S_SOURCE,
            ]
        ]
        .rename(
            {"component_s_id": SBML_DFS.S_ID, "member_s_name": SBML_DFS.S_NAME}, axis=1
        )
        .set_index(SBML_DFS.S_ID)
        .sort_index()
    )

    utils.check_unique_index(new_species_for_sbml_dfs, "new_species_for_sbml_dfs")

    # combine existing and newly defined set components
    # existing members get their component_s_id from the reactome_ids lookup
    set_component_species_ids = pd.concat(
        [
            set_component_species[set_component_species["is_already_included"]].merge(
                reactome_ids[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]].rename(
                    {
                        IDENTIFIERS.IDENTIFIER: "member_id",
                        SBML_DFS.S_ID: "component_s_id",
                    },
                    axis=1,
                )
            ),
            new_species,
        ]
    )

    return merged_membership, new_species_for_sbml_dfs, set_component_species_ids
855
+
856
+
857
def _add_entity_sets_reactions(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    new_compartmentalized_species_for_sbml_dfs: pd.DataFrame,
    updated_compartmentalized_membership: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add Entity Sets - Reactions

    Create reactions which indicate membership in an entity set. Each member
    of a set is linked to the set with a one-reactant, one-product "IS A"
    reaction (member -> set).

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    new_compartmentalized_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.compartmentalized_species
    updated_compartmentalized_membership: pd.DataFrame
        Compartmentalized complex components with updated IDs

    Returns
    -------
    new_reactions_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.reactions
    new_reaction_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.reaction_species

    Raises
    ------
    ValueError
        If any set component cannot be matched to a compartmentalized species.
    """

    # combine pre-existing and newly created compartmentalized species so every
    # set component can be looked up by its sc_id
    all_compartmentalized_species = pd.concat(
        [sbml_dfs.compartmentalized_species, new_compartmentalized_species_for_sbml_dfs]
    )

    # create a table with named "entity sets" and their members
    # each row will be turned into an "IS A" reaction
    named_set_components = updated_compartmentalized_membership[
        [SBML_DFS.SC_ID, SBML_DFS.SC_NAME, SBML_DFS.SC_SOURCE, "component_sc_id"]
    ].merge(
        all_compartmentalized_species[[SBML_DFS.SC_NAME]].rename(
            {SBML_DFS.SC_NAME: "component_sc_name"}, axis=1
        ),
        left_on="component_sc_id",
        right_index=True,
        how="left",
    )

    if named_set_components["component_sc_name"].isna().any():
        raise ValueError("Some components could not be merged")

    # define newly added reactions, continuing the existing r_id numbering
    max_existing_rid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.reactions.index.tolist())
    )
    # if r_ids used an alternative convention then they'll be NaNs here; which is fine —
    # start numbering from zero. pd.isna is used rather than an identity check against
    # np.nan since max() may surface a NaN that is not the np.nan singleton.
    if pd.isna(max_existing_rid):
        max_existing_rid = int(-1)

    # name the reaction following the "IS A" convention
    named_set_components[SBML_DFS.R_NAME] = [
        f"{comp_sc} IS A {sc}"
        for comp_sc, sc in zip(
            named_set_components["component_sc_name"],
            named_set_components[SBML_DFS.SC_NAME],
        )
    ]

    named_set_components[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        range(
            max_existing_rid + 1,
            max_existing_rid + named_set_components.shape[0] + 1,
        ),
        SBML_DFS.R_ID,
    )

    # reactions inherit the source of the entity set and carry no identifiers
    named_set_components[SBML_DFS.R_SOURCE] = named_set_components[SBML_DFS.SC_SOURCE]
    named_set_components[SBML_DFS.R_IDENTIFIERS] = [
        identifiers.Identifiers([]) for _ in range(named_set_components.shape[0])
    ]

    # membership reactions are directional (member IS A set), hence irreversible
    new_reactions_for_sbml_dfs = (
        named_set_components[
            [SBML_DFS.R_ID, SBML_DFS.R_NAME, SBML_DFS.R_IDENTIFIERS, SBML_DFS.R_SOURCE]
        ]
        .set_index(SBML_DFS.R_ID)
        .sort_index()
        .assign(r_isreversible=False)
    )

    # define newly added reactions' species, continuing the existing rsc_id numbering
    max_existing_rscid = max(
        sbml_dfs_utils.id_formatter_inv(sbml_dfs.reaction_species.index.tolist())
    )
    if pd.isna(max_existing_rscid):
        max_existing_rscid = int(-1)

    # each "IS A" reaction consumes the member (stoichiometry -1, reactant)
    # and produces the set (stoichiometry +1, product)
    new_reaction_species_for_sbml_dfs = pd.concat(
        [
            named_set_components[["component_sc_id", SBML_DFS.R_ID]]
            .rename({"component_sc_id": SBML_DFS.SC_ID}, axis=1)
            .assign(stoichiometry=-1)
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]),
            named_set_components[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
            .assign(stoichiometry=1)
            .assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]),
        ]
    ).sort_values([SBML_DFS.R_ID, SBML_DFS.STOICHIOMETRY])

    new_reaction_species_for_sbml_dfs[SBML_DFS.RSC_ID] = sbml_dfs_utils.id_formatter(
        range(
            max_existing_rscid + 1,
            max_existing_rscid + new_reaction_species_for_sbml_dfs.shape[0] + 1,
        ),
        SBML_DFS.RSC_ID,
    )

    new_reaction_species_for_sbml_dfs = new_reaction_species_for_sbml_dfs.set_index(
        SBML_DFS.RSC_ID
    ).sort_index()

    return new_reactions_for_sbml_dfs, new_reaction_species_for_sbml_dfs
975
+
976
+
977
def _add_complex_formation_compartmentalized_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    merged_membership: pd.DataFrame,
    new_species_for_sbml_dfs: pd.DataFrame,
    complex_component_species_ids: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add Complex Formation - Compartmentalized Species

    Define all compartmentalized species in complexes and format newly created
    compartmentalized species.

    Parameters
    ----------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A relational mechanistic network
    merged_membership: pd.DataFrame
        A table of complexes and their component members
    new_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.species
    complex_component_species_ids: pd.DataFrame
        All complex components

    Returns
    -------
    new_compartmentalized_species_for_sbml_dfs: pd.DataFrame
        New entries to add to sbml_dfs.compartmentalized_species
    updated_compartmentalized_membership: pd.DataFrame
        Compartmentalized complex components with updated IDs

    Raises
    ------
    ValueError
        If any new compartmentalized species lacks a species or compartment name.
    """

    # filter compartmentalized species to complexes
    complexes = merged_membership[SBML_DFS.S_ID].unique()
    compartmentalized_complexes = sbml_dfs.compartmentalized_species[
        sbml_dfs.compartmentalized_species[SBML_DFS.S_ID].isin(complexes)
    ]

    # create appropriate compartmentalized species
    # merge compartmentalized complexes with their membership
    merged_compartmentalized_membership = (
        compartmentalized_complexes.reset_index().merge(
            merged_membership[
                [
                    SBML_DFS.S_ID,
                    IDENTIFIERS.ONTOLOGY,
                    IDENTIFIERS.IDENTIFIER,
                    IDENTIFIERS.URL,
                ]
            ].merge(complex_component_species_ids)
        )
    )

    # define all of the compartmentalized species that should exist;
    # a left merge leaves sc_id as NaN for (component, compartment) pairs
    # that do not exist yet
    complex_component_compartmentalized_species = (
        merged_compartmentalized_membership[["component_s_id", SBML_DFS.C_ID]]
        .drop_duplicates()
        .merge(
            sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID, SBML_DFS.C_ID]]
            .reset_index()
            .rename({SBML_DFS.S_ID: "component_s_id"}, axis=1),
            how="left",
        )
    )

    new_compartmentalized_species = complex_component_compartmentalized_species[
        complex_component_compartmentalized_species[SBML_DFS.SC_ID].isna()
    ].copy()

    # add new identifiers, continuing the existing sc_id numbering
    max_existing_scid = max(
        sbml_dfs_utils.id_formatter_inv(
            sbml_dfs.compartmentalized_species.index.tolist()
        )
    )
    # pd.isna rather than an identity check against np.nan: max() may surface
    # a NaN that is not the np.nan singleton
    if pd.isna(max_existing_scid):
        max_existing_scid = int(-1)

    new_compartmentalized_species[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        range(
            max_existing_scid + 1,
            max_existing_scid + new_compartmentalized_species.shape[0] + 1,
        ),
        SBML_DFS.SC_ID,
    )

    all_species = pd.concat([sbml_dfs.species, new_species_for_sbml_dfs])

    # name new sc_ids and inherit sources from their complexes
    new_compartmentalized_species_names = new_compartmentalized_species.merge(
        all_species[SBML_DFS.S_NAME],
        left_on="component_s_id",
        right_index=True,
        how="left",
    ).merge(
        sbml_dfs.compartments[SBML_DFS.C_NAME],
        left_on=SBML_DFS.C_ID,
        right_index=True,
        how="left",
    )

    if new_compartmentalized_species_names[SBML_DFS.S_NAME].isna().any():
        raise ValueError("Some species were unnamed")
    if new_compartmentalized_species_names[SBML_DFS.C_NAME].isna().any():
        raise ValueError("Some compartments were unnamed")

    # name compartmentalized species following the "name [compartment]" convention
    new_compartmentalized_species_names[SBML_DFS.SC_NAME] = [
        f"{s_name} [{c_name}]"
        for s_name, c_name in zip(
            new_compartmentalized_species_names[SBML_DFS.S_NAME],
            new_compartmentalized_species_names[SBML_DFS.C_NAME],
        )
    ]

    # add sources from the complexes that compartmentalized species belong to
    indexed_cmembers = (
        merged_compartmentalized_membership[
            ["component_s_id", SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
        ]
        .set_index(["component_s_id", SBML_DFS.C_ID])
        .sort_index()
    )

    # NOTE(review): each `ind` is a 2-tuple from the (component_s_id, c_id)
    # MultiIndex, so `len(ind) == 1` never holds and the merge_sources branch
    # looks unreachable — confirm whether the intended condition was the number
    # of rows at `ind` (i.e. merge when multiple sources exist). Behavior kept
    # as-is pending confirmation.
    collapsed_csources = [
        (
            source.merge_sources(indexed_cmembers.loc[ind][SBML_DFS.SC_SOURCE].tolist())
            if len(ind) == 1
            else indexed_cmembers.loc[ind][SBML_DFS.SC_SOURCE]
        )
        for ind in indexed_cmembers.index.unique()
    ]
    collapsed_csources = pd.Series(
        collapsed_csources,
        index=indexed_cmembers.index.unique(),
        name=SBML_DFS.SC_SOURCE,
    )

    new_compartmentalized_species_names = new_compartmentalized_species_names.merge(
        collapsed_csources, left_on=["component_s_id", SBML_DFS.C_ID], right_index=True
    )

    new_compartmentalized_species_for_sbml_dfs = (
        new_compartmentalized_species_names[
            [
                SBML_DFS.SC_ID,
                SBML_DFS.SC_NAME,
                "component_s_id",
                SBML_DFS.C_ID,
                SBML_DFS.SC_SOURCE,
            ]
        ]
        .rename({"component_s_id": SBML_DFS.S_ID}, axis=1)
        .set_index(SBML_DFS.SC_ID)
    )

    utils.check_unique_index(
        new_compartmentalized_species_for_sbml_dfs,
        "new_compartmentalized_species_for_sbml_dfs",
    )

    # combine old and new compartmentalized species using current sc_ids
    complex_compartmentalized_components_ids = pd.concat(
        [
            complex_component_compartmentalized_species[
                ~complex_component_compartmentalized_species[SBML_DFS.SC_ID].isna()
            ],
            new_compartmentalized_species,
        ]
    ).rename({SBML_DFS.SC_ID: "component_sc_id"}, axis=1)

    updated_compartmentalized_membership = merged_compartmentalized_membership[
        [
            SBML_DFS.SC_ID,
            SBML_DFS.SC_NAME,
            SBML_DFS.S_ID,
            SBML_DFS.C_ID,
            "component_s_id",
            SBML_DFS.SC_SOURCE,
        ]
    ].merge(complex_compartmentalized_components_ids)

    return (
        new_compartmentalized_species_for_sbml_dfs,
        updated_compartmentalized_membership,
    )
1161
+
1162
+
1163
def _read_neo4j_members(neo4j_members: str) -> pd.DataFrame:
    """Read a table containing entity sets (members) derived from Reactome's Neo4J database.

    Entity sets are categories of molecular species that share a common
    property, such as serving as ligands for a receptor. These relationships
    are not represented in the Reactome .sbml export, so they are pulled out
    of the Neo4j database instead.
    """

    dir_name, file_name = os.path.split(neo4j_members)
    with open_fs(dir_name) as member_fs, member_fs.open(file_name, "rb") as fh:
        reactome_members = pd.read_csv(fh).assign(url="")

    # normalize ontology labels to lower case
    reactome_members[IDENTIFIERS.ONTOLOGY] = reactome_members[
        IDENTIFIERS.ONTOLOGY
    ].str.lower()

    # add an uncompartmentalized name by stripping the trailing "[compartment]" suffix
    reactome_members["member_s_name"] = [
        re.sub(" \\[[A-Za-z ]+\\]$", "", member_name)
        for member_name in reactome_members["member_name"]
    ]

    # identifiers are matched as strings downstream
    reactome_members[IDENTIFIERS.IDENTIFIER] = reactome_members[
        IDENTIFIERS.IDENTIFIER
    ].astype(str)

    return reactome_members
1189
+
1190
+
1191
def _merge_reactome_crossref_ids(
    current_molecular_ids: pd.DataFrame,
    select_reactome_ids: pd.DataFrame,
) -> pd.DataFrame:
    """
    Merge Reactome CrossRef IDs

    Combine existing molecular IDs with Reactome crossref identifiers.

    Matching is done per (s_id, bqb) pair: when both uniprot and reactome IDs
    are present, the (uniprot, reactome) cross is joined to the crossrefs;
    uniprot-only entries are joined afterwards for pairs that did not match.

    Params
    ------
    current_molecular_ids: pd.DataFrame
        Molecular features in the current pathway model
    select_reactome_ids: pd.DataFrame
        Crossref identifiers produced by _format_reactome_crossref_ids()

    Returns
    -------
    merged_crossrefs: pd.DataFrame
        Molecular feature sids matched to crossref annotations

    """

    # reactome IDs to identifiers.Identifiers
    id_indices = current_molecular_ids.index.unique()

    # loop through all s_id x bqb pairs
    uniprot_ids = list()
    uniprot_ids_w_reactome = list()
    for ind in id_indices:
        ind_ids = current_molecular_ids.loc[ind]
        ontologies_present = ind_ids[IDENTIFIERS.ONTOLOGY].unique()
        if ONTOLOGIES.UNIPROT in ontologies_present:
            # return all (s_id, bqb) -> uniprot entries
            # save the uniprot source since it will be propagated to new ids joined to the uniprot id
            entry_uniprot_ids = (
                ind_ids.loc[ind_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.UNIPROT]
                .reset_index()[
                    [
                        SBML_DFS.S_ID,
                        IDENTIFIERS.BQB,
                        IDENTIFIERS.IDENTIFIER,
                        SBML_DFS.S_SOURCE,
                    ]
                ]
                .rename({IDENTIFIERS.IDENTIFIER: ONTOLOGIES.UNIPROT}, axis=1)
            )
            # remove trailing dashes in uniprot ids (isoform suffixes) since
            # they are not present in the crossref identifiers
            entry_uniprot_ids[ONTOLOGIES.UNIPROT] = entry_uniprot_ids[
                ONTOLOGIES.UNIPROT
            ].replace("\\-[0-9]+$", "", regex=True)

            uniprot_ids.append(entry_uniprot_ids)

            # add reactome ids to lookup if they exist (they won't for BQB_HAS_PART qualifiers)
            if ONTOLOGIES.REACTOME in ontologies_present:
                # create the all x all cross of bqb-matched reactome and uniprot ids
                entry_reactome = (
                    ind_ids.loc[ind_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.REACTOME]
                    .reset_index()[
                        [SBML_DFS.S_ID, IDENTIFIERS.BQB, IDENTIFIERS.IDENTIFIER]
                    ]
                    .rename({IDENTIFIERS.IDENTIFIER: "reactome_id"}, axis=1)
                )
                uniprot_ids_w_reactome.append(entry_uniprot_ids.merge(entry_reactome))

    uniprot_ids = pd.concat(uniprot_ids)
    uniprot_ids_w_reactome = pd.concat(uniprot_ids_w_reactome)

    # join (uniprot, reactome) pairs to the crossref table
    uni_rct_with_crossrefs = uniprot_ids_w_reactome.merge(select_reactome_ids)
    # check ontologies: which s_ids picked up an ensembl gene annotation
    uni_rct_with_crossrefs_ensembl_genes = uni_rct_with_crossrefs.loc[
        uni_rct_with_crossrefs[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.ENSEMBL_GENE,
        SBML_DFS.S_ID,
    ].unique()

    failed_joins = uniprot_ids_w_reactome[
        ~uniprot_ids_w_reactome[SBML_DFS.S_ID].isin(
            uni_rct_with_crossrefs_ensembl_genes
        )
    ]
    # most of the failed joins are pathogens so they wouldn't match to human ensembl genes
    if failed_joins.shape[0] > 0:
        logged_join_fails = failed_joins.sample(min(failed_joins.shape[0], 5)).drop(
            SBML_DFS.S_SOURCE, axis=1
        )
        logger.warning(
            f"{failed_joins.shape[0]} network uniprot IDs were not matched to the Reactome Crossref IDs"
        )

        # log the sampled failures; previously the styled table was computed
        # but discarded (cf. the parallel warning block below)
        logger.warning(
            utils.style_df(logged_join_fails, headers="keys", hide_index=True)
        )

    # entries without reactome IDs join just by uniprot
    # outer join back to uni_rct_with_crossrefs so we won't consider a uniprot-only match
    # when a uniprot + reactome match worked [its not entirely clear that this does anything]
    uni_no_rct_with_crossrefs = uniprot_ids.merge(select_reactome_ids).merge(
        uni_rct_with_crossrefs[[SBML_DFS.S_ID, IDENTIFIERS.BQB]].drop_duplicates(),
        how="outer",
        indicator=True,
    )
    uni_no_rct_with_crossrefs = uni_no_rct_with_crossrefs[
        uni_no_rct_with_crossrefs["_merge"] == "left_only"
    ].drop("_merge", axis=1)

    merged_crossrefs = pd.concat([uni_rct_with_crossrefs, uni_no_rct_with_crossrefs])
    # sanity check: the two partitions are disjoint
    assert (
        uni_rct_with_crossrefs.shape[0] + uni_no_rct_with_crossrefs.shape[0]
    ) == merged_crossrefs.shape[0]

    # find (s_id, bqb) pairs with a protein ID but no matched ensembl gene
    species_with_protein_and_no_gene = current_molecular_ids[
        current_molecular_ids[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.UNIPROT
    ].merge(
        merged_crossrefs.loc[
            merged_crossrefs[IDENTIFIERS.ONTOLOGY] == ONTOLOGIES.ENSEMBL_GENE,
            [SBML_DFS.S_ID, IDENTIFIERS.BQB],
        ].drop_duplicates(),
        how="outer",
        left_index=True,
        right_on=[SBML_DFS.S_ID, IDENTIFIERS.BQB],
        indicator=True,
    )
    species_with_protein_and_no_gene = species_with_protein_and_no_gene[
        species_with_protein_and_no_gene["_merge"] == "left_only"
    ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, IDENTIFIERS.BQB]].drop_duplicates()

    if species_with_protein_and_no_gene.shape[0] > 0:
        logged_join_fails = species_with_protein_and_no_gene.sample(
            min(species_with_protein_and_no_gene.shape[0], 5)
        )

        logger.warning(
            f"A gene ID could not be found for {species_with_protein_and_no_gene.shape[0]} "
            "(species, bqb) pairs with a protein ID"
        )

        logger.warning(
            utils.style_df(logged_join_fails, headers="keys", hide_index=True)
        )

    return merged_crossrefs
1333
+
1334
+
1335
def _format_reactome_crossref_ids(
    crossref_path: str,
) -> pd.DataFrame:
    """
    Format Reactome CrossRef IDs

    Read and reformat Reactome's crossref identifiers. Currently only the
    Pharos target and Ensembl ontologies are retained; Ensembl entries are
    split into gene/transcript/protein ontologies based on their ID prefix.

    Params
    ------
    crossref_path: str
        Path to the cross ref file extracted from Reactome's Neo4j database

    Returns
    -------
    select_reactome_ids: pd.DataFrame
        Crossref identifiers (Pharos and Ensembl entries only)

    """

    base, path = os.path.split(crossref_path)
    with open_fs(base) as bfs:
        with bfs.open(path, "rb") as f:
            reactome_ids = pd.read_csv(f)

    # only use ensembl and pharos for now

    # rename pharos ontology to the internal constant
    pharos_ids = reactome_ids[
        reactome_ids[IDENTIFIERS.ONTOLOGY] == "Pharos - Targets"
    ].copy()
    pharos_ids[IDENTIFIERS.ONTOLOGY] = ONTOLOGIES.PHAROS

    # format ensembl ids using conventions in identifiers.Identifiers
    ensembl_ids = reactome_ids[reactome_ids[IDENTIFIERS.ONTOLOGY] == "Ensembl"].copy()
    # distinguish ensembl genes/transcripts/proteins by the 4-character
    # identifier prefix (e.g. ENSG/ENST/ENSP)
    ensembl_ids["ontology_prefix"] = ensembl_ids[IDENTIFIERS.IDENTIFIER].str.slice(
        start=0, stop=4
    )
    ensembl_ids[IDENTIFIERS.ONTOLOGY] = [
        ENSEMBL_PREFIX_TO_ONTOLOGY[p] for p in ensembl_ids["ontology_prefix"]
    ]
    ensembl_ids = ensembl_ids.drop("ontology_prefix", axis=1)

    select_reactome_ids = pd.concat([pharos_ids, ensembl_ids])

    return select_reactome_ids