napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/constants.py
ADDED
@@ -0,0 +1,500 @@
|
|
1
|
+
"""Module to contain all constants for CPR"""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import libsbml
|
6
|
+
|
7
|
+
from types import SimpleNamespace
|
8
|
+
import pandas as pd
|
9
|
+
|
10
|
+
PROTEINATLAS_SUBCELL_LOC_URL = (
|
11
|
+
"https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip"
|
12
|
+
)
|
13
|
+
|
14
|
+
# GTEx
|
15
|
+
GTEX_RNASEQ_EXPRESSION_URL = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
|
16
|
+
|
17
|
+
# Gencode
|
18
|
+
GENCODE_URL = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_26/gencode.v26.transcripts.fa.gz"
|
19
|
+
|
20
|
+
FILE_EXT_ZIP = "zip"
|
21
|
+
FILE_EXT_GZ = "gz"
|
22
|
+
|
23
|
+
# SBML_dfs
|
24
|
+
|
25
|
+
SBML_DFS = SimpleNamespace(
|
26
|
+
COMPARTMENTS="compartments",
|
27
|
+
SPECIES="species",
|
28
|
+
COMPARTMENTALIZED_SPECIES="compartmentalized_species",
|
29
|
+
REACTIONS="reactions",
|
30
|
+
REACTION_SPECIES="reaction_species",
|
31
|
+
SPECIES_DATA="species_data",
|
32
|
+
REACTIONS_DATA="reactions_data",
|
33
|
+
C_ID="c_id",
|
34
|
+
C_NAME="c_name",
|
35
|
+
C_IDENTIFIERS="c_Identifiers",
|
36
|
+
C_SOURCE="c_Source",
|
37
|
+
S_ID="s_id",
|
38
|
+
S_NAME="s_name",
|
39
|
+
S_IDENTIFIERS="s_Identifiers",
|
40
|
+
S_SOURCE="s_Source",
|
41
|
+
SC_ID="sc_id",
|
42
|
+
SC_NAME="sc_name",
|
43
|
+
SC_SOURCE="sc_Source",
|
44
|
+
R_ID="r_id",
|
45
|
+
R_NAME="r_name",
|
46
|
+
R_IDENTIFIERS="r_Identifiers",
|
47
|
+
R_SOURCE="r_Source",
|
48
|
+
R_ISREVERSIBLE="r_isreversible",
|
49
|
+
RSC_ID="rsc_id",
|
50
|
+
STOICHIOMETRY="stoichiometry",
|
51
|
+
SBO_TERM="sbo_term",
|
52
|
+
)
|
53
|
+
|
54
|
+
SBML_DFS_SCHEMA = SimpleNamespace(
|
55
|
+
SCHEMA={
|
56
|
+
SBML_DFS.COMPARTMENTS: {
|
57
|
+
"pk": SBML_DFS.C_ID,
|
58
|
+
"label": SBML_DFS.C_NAME,
|
59
|
+
"id": SBML_DFS.C_IDENTIFIERS,
|
60
|
+
"source": SBML_DFS.C_SOURCE,
|
61
|
+
"vars": [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE],
|
62
|
+
},
|
63
|
+
SBML_DFS.SPECIES: {
|
64
|
+
"pk": SBML_DFS.S_ID,
|
65
|
+
"label": SBML_DFS.S_NAME,
|
66
|
+
"id": SBML_DFS.S_IDENTIFIERS,
|
67
|
+
"source": SBML_DFS.S_SOURCE,
|
68
|
+
"vars": [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE],
|
69
|
+
},
|
70
|
+
SBML_DFS.COMPARTMENTALIZED_SPECIES: {
|
71
|
+
"pk": SBML_DFS.SC_ID,
|
72
|
+
"label": SBML_DFS.SC_NAME,
|
73
|
+
"fk": [SBML_DFS.S_ID, SBML_DFS.C_ID],
|
74
|
+
"source": SBML_DFS.SC_SOURCE,
|
75
|
+
"vars": [
|
76
|
+
SBML_DFS.SC_NAME,
|
77
|
+
SBML_DFS.S_ID,
|
78
|
+
SBML_DFS.C_ID,
|
79
|
+
SBML_DFS.SC_SOURCE,
|
80
|
+
],
|
81
|
+
},
|
82
|
+
SBML_DFS.REACTIONS: {
|
83
|
+
"pk": SBML_DFS.R_ID,
|
84
|
+
"label": SBML_DFS.R_NAME,
|
85
|
+
"id": SBML_DFS.R_IDENTIFIERS,
|
86
|
+
"source": SBML_DFS.R_SOURCE,
|
87
|
+
"vars": [
|
88
|
+
SBML_DFS.R_NAME,
|
89
|
+
SBML_DFS.R_IDENTIFIERS,
|
90
|
+
SBML_DFS.R_SOURCE,
|
91
|
+
SBML_DFS.R_ISREVERSIBLE,
|
92
|
+
],
|
93
|
+
},
|
94
|
+
SBML_DFS.REACTION_SPECIES: {
|
95
|
+
"pk": SBML_DFS.RSC_ID,
|
96
|
+
"fk": [SBML_DFS.R_ID, SBML_DFS.SC_ID],
|
97
|
+
"vars": [
|
98
|
+
SBML_DFS.R_ID,
|
99
|
+
SBML_DFS.SC_ID,
|
100
|
+
SBML_DFS.STOICHIOMETRY,
|
101
|
+
SBML_DFS.SBO_TERM,
|
102
|
+
],
|
103
|
+
},
|
104
|
+
},
|
105
|
+
REQUIRED_ENTITIES={
|
106
|
+
SBML_DFS.COMPARTMENTS,
|
107
|
+
SBML_DFS.SPECIES,
|
108
|
+
SBML_DFS.COMPARTMENTALIZED_SPECIES,
|
109
|
+
SBML_DFS.REACTIONS,
|
110
|
+
SBML_DFS.REACTION_SPECIES,
|
111
|
+
},
|
112
|
+
OPTIONAL_ENTITIES={
|
113
|
+
SBML_DFS.SPECIES_DATA,
|
114
|
+
SBML_DFS.REACTIONS_DATA,
|
115
|
+
},
|
116
|
+
)
|
117
|
+
|
118
|
+
ENTITIES_W_DATA = {SBML_DFS.SPECIES, SBML_DFS.REACTIONS}
|
119
|
+
|
120
|
+
REQUIRED_REACTION_FROMEDGELIST_COLUMNS = [
|
121
|
+
"sc_id_up",
|
122
|
+
"sc_id_down",
|
123
|
+
"sbo_term",
|
124
|
+
"r_name",
|
125
|
+
"r_Identifiers",
|
126
|
+
"r_isreversible",
|
127
|
+
]
|
128
|
+
|
129
|
+
CPR_STANDARD_OUTPUTS = SimpleNamespace(
|
130
|
+
SPECIES_IDENTIFIERS="species_identifiers.tsv",
|
131
|
+
SPECIES="species.json",
|
132
|
+
REACTIONS="reactions.json",
|
133
|
+
REACTION_SPECIES="reaction_species.json",
|
134
|
+
COMPARTMENTS="compartments.json",
|
135
|
+
COMPARTMENTALIZED_SPECIES="compartmentalized_species.json",
|
136
|
+
)
|
137
|
+
|
138
|
+
INTERACTION_EDGELIST_EXPECTED_VARS = {
|
139
|
+
"upstream_name",
|
140
|
+
"downstream_name",
|
141
|
+
"upstream_compartment",
|
142
|
+
"downstream_compartment",
|
143
|
+
"r_name",
|
144
|
+
"sbo_term",
|
145
|
+
"r_Identifiers",
|
146
|
+
"r_isreversible",
|
147
|
+
}
|
148
|
+
|
149
|
+
BQB_PRIORITIES = pd.DataFrame(
|
150
|
+
[{"bqb": "BQB_IS", "bqb_rank": 1}, {"bqb": "BQB_HAS_PART", "bqb_rank": 2}]
|
151
|
+
)
|
152
|
+
|
153
|
+
ONTOLOGY_PRIORITIES = pd.DataFrame(
|
154
|
+
[
|
155
|
+
{"ontology": "reactome", "ontology_rank": 1},
|
156
|
+
{"ontology": "ensembl_gene", "ontology_rank": 2},
|
157
|
+
{"ontology": "chebi", "ontology_rank": 3},
|
158
|
+
{"ontology": "uniprot", "ontology_rank": 4},
|
159
|
+
{"ontology": "go", "ontology_rank": 5},
|
160
|
+
]
|
161
|
+
)
|
162
|
+
|
163
|
+
# SBML
|
164
|
+
# Biological qualifiers
|
165
|
+
# Biomodels qualifiers
|
166
|
+
BQB = SimpleNamespace(
|
167
|
+
IS="BQB_IS",
|
168
|
+
HAS_PART="BQB_HAS_PART",
|
169
|
+
IS_PART_OF="BQB_IS_PART_OF",
|
170
|
+
IS_VERSION_OF="BQB_IS_VERSION_OF",
|
171
|
+
HAS_VERSION="BQB_HAS_VERSION",
|
172
|
+
IS_HOMOLOG_TO="BQB_IS_HOMOLOG_TO",
|
173
|
+
IS_DESCRIBED_BY="BQB_IS_DESCRIBED_BY",
|
174
|
+
IS_ENCODED_BY="BQB_IS_ENCODED_BY",
|
175
|
+
ENCODES="BQB_ENCODES",
|
176
|
+
OCCURS_IN="BQB_OCCURS_IN",
|
177
|
+
HAS_PROPERTY="BQB_HAS_PROPERTY",
|
178
|
+
IS_PROPERTY_OF="BQB_IS_PROPERTY_OF",
|
179
|
+
HAS_TAXON="BQB_HAS_TAXON",
|
180
|
+
UNKNOWN="BQB_UNKNOWN",
|
181
|
+
)
|
182
|
+
|
183
|
+
# molecules are distinctly defined by these BQB terms
|
184
|
+
# Entries carry the full "BQB_" prefix so that they compare equal to the
# qualifier strings defined in BQB / BIOLOGICAL_QUALIFIERS; the previous
# un-prefixed "IS_HOMOLOG_TO" could never match any of those values.
BQB_DEFINING_ATTRS = ["BQB_IS", "BQB_IS_HOMOLOG_TO"]
|
185
|
+
|
186
|
+
# a looser convention which will aggregate genes, transcripts, and proteins
|
187
|
+
# if they are linked with the appropriate bioqualifiers
|
188
|
+
# Every entry uses the full "BQB_" prefix so that it compares equal to the
# qualifier strings defined in BQB / BIOLOGICAL_QUALIFIERS; the previous
# un-prefixed "IS_HOMOLOG_TO" could never match any of those values.
BQB_DEFINING_ATTRS_LOOSE = [
    "BQB_IS",
    "BQB_IS_HOMOLOG_TO",
    "BQB_IS_ENCODED_BY",
    "BQB_ENCODES",
]
|
194
|
+
|
195
|
+
# identifiers
|
196
|
+
IDENTIFIERS = SimpleNamespace(
|
197
|
+
ONTOLOGY="ontology", IDENTIFIER="identifier", BQB="bqb", URL="url"
|
198
|
+
)
|
199
|
+
|
200
|
+
SPECIES_IDENTIFIERS_REQUIRED_VARS = {
|
201
|
+
SBML_DFS.S_ID,
|
202
|
+
IDENTIFIERS.ONTOLOGY,
|
203
|
+
IDENTIFIERS.IDENTIFIER,
|
204
|
+
IDENTIFIERS.BQB,
|
205
|
+
SBML_DFS.S_NAME,
|
206
|
+
}
|
207
|
+
|
208
|
+
BIOLOGICAL_QUALIFIERS = [
|
209
|
+
"BQB_IS",
|
210
|
+
"BQB_HAS_PART",
|
211
|
+
"BQB_IS_PART_OF",
|
212
|
+
"BQB_IS_VERSION_OF",
|
213
|
+
"BQB_HAS_VERSION",
|
214
|
+
"BQB_IS_HOMOLOG_TO",
|
215
|
+
"BQB_IS_DESCRIBED_BY",
|
216
|
+
"BQB_IS_ENCODED_BY",
|
217
|
+
"BQB_ENCODES",
|
218
|
+
"BQB_OCCURS_IN",
|
219
|
+
"BQB_HAS_PROPERTY",
|
220
|
+
"BQB_IS_PROPERTY_OF",
|
221
|
+
"BQB_HAS_TAXON",
|
222
|
+
"BQB_UNKNOWN",
|
223
|
+
]
|
224
|
+
|
225
|
+
|
226
|
+
def get_biological_qualifier_codes():
    """Build a lookup from libsbml qualifier attributes to their names.

    Each name in ``BIOLOGICAL_QUALIFIERS`` is resolved as an attribute of the
    ``libsbml`` module; the resolved value becomes the key and the name
    becomes the value.

    Returns
    -------
    dict
        ``{getattr(libsbml, name): name}`` for every listed qualifier name.
    """
    codes_to_names = {}
    for qualifier_name in BIOLOGICAL_QUALIFIERS:
        codes_to_names[getattr(libsbml, qualifier_name)] = qualifier_name
    return codes_to_names
|
230
|
+
|
231
|
+
|
232
|
+
BIOLOGICAL_QUALIFIER_CODES = get_biological_qualifier_codes()
|
233
|
+
|
234
|
+
# Systems biology ontology
|
235
|
+
SBOTERM_NAMES = SimpleNamespace(
|
236
|
+
REACTANT="reactant",
|
237
|
+
PRODUCT="product",
|
238
|
+
CATALYST="catalyst",
|
239
|
+
INHIBITOR="inhibitor",
|
240
|
+
STIMULATOR="stimulator",
|
241
|
+
MODIFIER="modifier",
|
242
|
+
INTERACTOR="interactor",
|
243
|
+
)
|
244
|
+
|
245
|
+
MINI_SBO_TO_NAME = {
|
246
|
+
"SBO:0000010": SBOTERM_NAMES.REACTANT,
|
247
|
+
"SBO:0000011": SBOTERM_NAMES.PRODUCT,
|
248
|
+
"SBO:0000013": SBOTERM_NAMES.CATALYST,
|
249
|
+
"SBO:0000020": SBOTERM_NAMES.INHIBITOR,
|
250
|
+
"SBO:0000459": SBOTERM_NAMES.STIMULATOR,
|
251
|
+
"SBO:0000019": SBOTERM_NAMES.MODIFIER,
|
252
|
+
"SBO:0000336": SBOTERM_NAMES.INTERACTOR,
|
253
|
+
}
|
254
|
+
|
255
|
+
MINI_SBO_FROM_NAME = {
|
256
|
+
SBOTERM_NAMES.REACTANT: "SBO:0000010",
|
257
|
+
SBOTERM_NAMES.PRODUCT: "SBO:0000011",
|
258
|
+
SBOTERM_NAMES.CATALYST: "SBO:0000013",
|
259
|
+
SBOTERM_NAMES.INHIBITOR: "SBO:0000020",
|
260
|
+
SBOTERM_NAMES.STIMULATOR: "SBO:0000459",
|
261
|
+
SBOTERM_NAMES.MODIFIER: "SBO:0000019", # parent category of inhibitor and stimulator (i.e., activator)
|
262
|
+
SBOTERM_NAMES.INTERACTOR: "SBO:0000336", # entity participating in a physical or functional interaction
|
263
|
+
}
|
264
|
+
|
265
|
+
SBO_MODIFIER_NAMES = {
|
266
|
+
SBOTERM_NAMES.INHIBITOR,
|
267
|
+
SBOTERM_NAMES.STIMULATOR,
|
268
|
+
SBOTERM_NAMES.MODIFIER,
|
269
|
+
}
|
270
|
+
|
271
|
+
MINI_SBO_NAME_TO_POLARITY = {
|
272
|
+
SBOTERM_NAMES.REACTANT: "activation",
|
273
|
+
SBOTERM_NAMES.PRODUCT: "activation",
|
274
|
+
SBOTERM_NAMES.CATALYST: "activation",
|
275
|
+
SBOTERM_NAMES.INHIBITOR: "inhibition",
|
276
|
+
SBOTERM_NAMES.STIMULATOR: "activation",
|
277
|
+
SBOTERM_NAMES.MODIFIER: "ambiguous",
|
278
|
+
SBOTERM_NAMES.INTERACTOR: "ambiguous",
|
279
|
+
}
|
280
|
+
|
281
|
+
# how does changing a reaction's membership
|
282
|
+
# affect whether a reaction can occur
|
283
|
+
# for example, if I remove any substrate a reaction won't occur
|
284
|
+
# but I would have to remove all catalysts for it to not occur
|
285
|
+
SBO_NAME_TO_ROLE = {
|
286
|
+
SBOTERM_NAMES.REACTANT: "DEFINING",
|
287
|
+
SBOTERM_NAMES.PRODUCT: "DEFINING",
|
288
|
+
SBOTERM_NAMES.INTERACTOR: "DEFINING",
|
289
|
+
SBOTERM_NAMES.CATALYST: "REQUIRED",
|
290
|
+
SBOTERM_NAMES.INHIBITOR: "OPTIONAL",
|
291
|
+
SBOTERM_NAMES.STIMULATOR: "OPTIONAL",
|
292
|
+
SBOTERM_NAMES.MODIFIER: "OPTIONAL",
|
293
|
+
}
|
294
|
+
|
295
|
+
# see also https://github.com/calico/netcontextr/blob/main/R/reactionTrimmingFunctions.R
|
296
|
+
VALID_SBO_ROLES = (
|
297
|
+
# there is a direct correspondence between the set of defining entries and the identity of a reaction
|
298
|
+
# e.g., the stoichiometry of a metabolic reaction or the members of a protein-protein interaction
|
299
|
+
"DEFINING",
|
300
|
+
# 1+ entries are needed if entries were initially defined. i.e., reactions which require a catalyst
|
301
|
+
# would no longer exist if the catalyst was removed, but many reactions do not require a catalyst.
|
302
|
+
"REQUIRED",
|
303
|
+
# 0+ entries. optional species can be added or removed to a reaction without changing its identity
|
304
|
+
"OPTIONAL",
|
305
|
+
)
|
306
|
+
|
307
|
+
# required variables for the edgelist formats used by mechanism_matching
|
308
|
+
CPR_EDGELIST = SimpleNamespace(
|
309
|
+
S_ID_UPSTREAM="s_id_upstream",
|
310
|
+
S_ID_DOWNSTREAM="s_id_downstream",
|
311
|
+
SC_ID_UPSTREAM="sc_id_upstream",
|
312
|
+
SC_ID_DOWNSTREAM="sc_id_downstream",
|
313
|
+
IDENTIFIER_UPSTREAM="identifier_upstream",
|
314
|
+
IDENTIFIER_DOWNSTREAM="identifier_downstream",
|
315
|
+
S_NAME_UPSTREAM="s_name_upstream",
|
316
|
+
S_NAME_DOWNSTREAM="s_name_downstream",
|
317
|
+
SC_ID_ORIGIN="sc_id_origin",
|
318
|
+
SC_ID_DEST="sc_id_dest",
|
319
|
+
)
|
320
|
+
|
321
|
+
IDENTIFIER_EDGELIST_REQ_VARS = {
|
322
|
+
CPR_EDGELIST.IDENTIFIER_UPSTREAM,
|
323
|
+
CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
|
324
|
+
}
|
325
|
+
|
326
|
+
CPR_EDGELIST_REQ_VARS = {
|
327
|
+
CPR_EDGELIST.S_ID_UPSTREAM,
|
328
|
+
CPR_EDGELIST.S_ID_DOWNSTREAM,
|
329
|
+
CPR_EDGELIST.SC_ID_UPSTREAM,
|
330
|
+
CPR_EDGELIST.SC_ID_DOWNSTREAM,
|
331
|
+
}
|
332
|
+
|
333
|
+
CPR_PATH_REQ_VARS = {CPR_EDGELIST.SC_ID_ORIGIN, CPR_EDGELIST.SC_ID_DEST}
|
334
|
+
|
335
|
+
# schema for specifying weighting schemes
|
336
|
+
|
337
|
+
DEFAULT_WT_TRANS = "identity"
|
338
|
+
|
339
|
+
DEFINED_WEIGHT_TRANSFORMATION = {
|
340
|
+
DEFAULT_WT_TRANS: "_wt_transformation_identity",
|
341
|
+
"string": "_wt_transformation_string",
|
342
|
+
"string_inv": "_wt_transformation_string_inv",
|
343
|
+
}
|
344
|
+
|
345
|
+
SCORE_CALIBRATION_POINTS_DICT = {
|
346
|
+
"weights": {"strong": 3, "good": 7, "okay": 20, "weak": 40},
|
347
|
+
"string_wt": {"strong": 950, "good": 400, "okay": 230, "weak": 150},
|
348
|
+
}
|
349
|
+
|
350
|
+
SOURCE_VARS_DICT = {"string_wt": 10}
|
351
|
+
|
352
|
+
# source
|
353
|
+
SOURCE_SPEC = SimpleNamespace(
|
354
|
+
PATHWAY_ID="pathway_id",
|
355
|
+
MODEL="model",
|
356
|
+
SOURCE="source",
|
357
|
+
SPECIES="species",
|
358
|
+
NAME="name",
|
359
|
+
ENTRY="entry",
|
360
|
+
N_COLLAPSED_PATHWAYS="n_collapsed_pathways",
|
361
|
+
INDEX_NAME="entry",
|
362
|
+
FILE="file",
|
363
|
+
DATE="date",
|
364
|
+
)
|
365
|
+
|
366
|
+
EXPECTED_PW_INDEX_COLUMNS = {
|
367
|
+
SOURCE_SPEC.FILE,
|
368
|
+
SOURCE_SPEC.PATHWAY_ID,
|
369
|
+
SOURCE_SPEC.SOURCE,
|
370
|
+
SOURCE_SPEC.SPECIES,
|
371
|
+
SOURCE_SPEC.NAME,
|
372
|
+
SOURCE_SPEC.DATE,
|
373
|
+
}
|
374
|
+
|
375
|
+
# rules for specific ontologies
|
376
|
+
|
377
|
+
ONTOLOGIES = SimpleNamespace(
|
378
|
+
CHEBI="chebi",
|
379
|
+
ENSEMBL_GENE="ensembl_gene",
|
380
|
+
ENSEMBL_TRANSCRIPT="ensembl_transcript",
|
381
|
+
ENSEMBL_PROTEIN="ensembl_protein",
|
382
|
+
GENE_NAME="gene_name",
|
383
|
+
GO="go",
|
384
|
+
MIRBASE="mirbase",
|
385
|
+
NCBI_ENTREZ_GENE="ncbi_entrez_gene",
|
386
|
+
PHAROS="pharos",
|
387
|
+
REACTOME="reactome",
|
388
|
+
SYMBOL="symbol",
|
389
|
+
UNIPROT="uniprot",
|
390
|
+
)
|
391
|
+
|
392
|
+
CHARACTERISTIC_COMPLEX_ONTOLOGIES = [
|
393
|
+
ONTOLOGIES.ENSEMBL_GENE,
|
394
|
+
ONTOLOGIES.NCBI_ENTREZ_GENE,
|
395
|
+
ONTOLOGIES.MIRBASE,
|
396
|
+
]
|
397
|
+
|
398
|
+
ONTOLOGY_ALIASES = SimpleNamespace(NCBI_ENTREZ_GENE={"ncbigene", "ncbi_gene"})
|
399
|
+
|
400
|
+
ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY = {
|
401
|
+
"G": ONTOLOGIES.ENSEMBL_GENE,
|
402
|
+
"T": ONTOLOGIES.ENSEMBL_TRANSCRIPT,
|
403
|
+
"P": ONTOLOGIES.ENSEMBL_PROTEIN,
|
404
|
+
}
|
405
|
+
|
406
|
+
ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY = {
|
407
|
+
ONTOLOGIES.ENSEMBL_GENE: "G",
|
408
|
+
ONTOLOGIES.ENSEMBL_TRANSCRIPT: "T",
|
409
|
+
ONTOLOGIES.ENSEMBL_PROTEIN: "P",
|
410
|
+
}
|
411
|
+
|
412
|
+
ENSEMBL_SPECIES_FROM_CODE = {"MUS": "Mus musculus"}
|
413
|
+
|
414
|
+
ENSEMBL_SPECIES_TO_CODE = {"Mus musculus": "MUS"}
|
415
|
+
|
416
|
+
ENSEMBL_PREFIX_TO_ONTOLOGY = {
|
417
|
+
"ENSG": ONTOLOGIES.ENSEMBL_GENE,
|
418
|
+
"ENST": ONTOLOGIES.ENSEMBL_TRANSCRIPT,
|
419
|
+
"ENSP": ONTOLOGIES.ENSEMBL_PROTEIN,
|
420
|
+
}
|
421
|
+
|
422
|
+
COMPARTMENTS = {
|
423
|
+
"NUCLEOPLASM": "nucleoplasm",
|
424
|
+
"CYTOPLASM": "cytoplasm",
|
425
|
+
"CELLULAR_COMPONENT": "cellular_component",
|
426
|
+
"CYTOSOL": "cytosol",
|
427
|
+
"MITOCHONDRIA": "mitochondria",
|
428
|
+
"MITOMEMBRANE": "mitochondrial membrane",
|
429
|
+
"INNERMITOCHONDRIA": "inner mitochondria",
|
430
|
+
"MITOMATRIX": "mitochondrial matrix",
|
431
|
+
"ENDOPLASMICRETICULUM": "endoplasmic reticulum",
|
432
|
+
"ERMEMBRANE": "endoplasmic reticulum membrane",
|
433
|
+
"ERLUMEN": "endoplasmic reticulum lumen",
|
434
|
+
"GOLGIAPPARATUS": "golgi apparatus",
|
435
|
+
"GOLGIMEMBRANE": "golgi membrane",
|
436
|
+
"NUCLEUS": "nucleus",
|
437
|
+
"NUCLEARLUMEN": "nuclear lumen",
|
438
|
+
"NUCLEOLUS": "nucleolus",
|
439
|
+
"LYSOSOME": "lysosome",
|
440
|
+
"PEROXISOME": "peroxisome",
|
441
|
+
"EXTRACELLULAR": "extracellular",
|
442
|
+
}
|
443
|
+
|
444
|
+
COMPARTMENT_ALIASES = {
|
445
|
+
"NUCLEOPLASM": ["nucleoplasm", "Nucleoplasm"],
|
446
|
+
"CYTOPLASM": ["cytoplasm", "Cytoplasm"],
|
447
|
+
"CELLULAR_COMPONENT": ["cellular_component", "Cellular_component"],
|
448
|
+
"CYTOSOL": ["cytosol", "Cytosol"],
|
449
|
+
"MITOCHONDRIA": ["mitochondria", "Mitochondria"],
|
450
|
+
"MITOMEMBRANE": ["mitochondrial membrane", "Mitochondrial membrane"],
|
451
|
+
"INNERMITOCHONDRIA": [
|
452
|
+
"inner mitochondria",
|
453
|
+
"Inner mitochondria",
|
454
|
+
"inner mitochondrial compartment",
|
455
|
+
],
|
456
|
+
"MITOMATRIX": [
|
457
|
+
"mitochondrial matrix",
|
458
|
+
"Mitochondrial matrix",
|
459
|
+
"mitochondrial lumen",
|
460
|
+
"Mitochondrial lumen",
|
461
|
+
],
|
462
|
+
"ENDOPLASMICRETICULUM": ["endoplasmic reticulum", "Endoplasmic reticulum"],
|
463
|
+
"ERMEMBRANE": ["endoplasmic reticulum membrane", "Endoplasmic reticulum membrane"],
|
464
|
+
"ERLUMEN": ["endoplasmic reticulum lumen", "Endoplasmic reticulum lumen"],
|
465
|
+
"GOLGIAPPARATUS": ["golgi apparatus", "Golgi apparatus"],
|
466
|
+
"GOLGIMEMBRANE": ["Golgi membrane", "golgi membrane"],
|
467
|
+
"NUCLEUS": ["nucleus", "Nucleus"],
|
468
|
+
"NUCLEARLUMEN": ["nuclear lumen", "Nuclear lumen"],
|
469
|
+
"NUCLEOLUS": ["nucleolus", "Nucleolus"],
|
470
|
+
"LYSOSOME": ["lysosome", "Lysosome"],
|
471
|
+
"PEROXISOME": ["peroxisome", "Peroxisome", "peroxisome/glyoxysome"],
|
472
|
+
"EXTRACELLULAR": [
|
473
|
+
"extracellular",
|
474
|
+
"Extracellular",
|
475
|
+
"extracellular space",
|
476
|
+
"Extracellular space",
|
477
|
+
],
|
478
|
+
}
|
479
|
+
|
480
|
+
COMPARTMENTS_GO_TERMS = {
|
481
|
+
"NUCLEOPLASM": "GO:0005654",
|
482
|
+
"CELLULAR_COMPONENT": "GO:0005575",
|
483
|
+
"CYTOPLASM": "GO:0005737",
|
484
|
+
"CYTOSOL": "GO:0005829",
|
485
|
+
"MITOCHONDRIA": "GO:0005739",
|
486
|
+
"MITOMEMBRANE": "GO:0031966",
|
487
|
+
"INNERMITOCHONDRIA": "GO:0005743",
|
488
|
+
"MITOMATRIX": "GO:0005759",
|
489
|
+
"ENDOPLASMICRETICULUM": "GO:0005783",
|
490
|
+
"ERMEMBRANE": "GO:0005789",
|
491
|
+
"ERLUMEN": "GO:0005788",
|
492
|
+
"GOLGIAPPARATUS": "GO:0005794",
|
493
|
+
"GOLGIMEMBRANE": "GO:0000139",
|
494
|
+
"NUCLEUS": "GO:0005634",
|
495
|
+
"NUCLEARLUMEN": "GO:0031981",
|
496
|
+
"NUCLEOLUS": "GO:0005730",
|
497
|
+
"LYSOSOME": "GO:0005764",
|
498
|
+
"PEROXISOME": "GO:0005777",
|
499
|
+
"EXTRACELLULAR": "GO:0005615",
|
500
|
+
}
|
napistu/gcs/__init__.py
ADDED
napistu/gcs/constants.py
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# GCS constants
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
from types import SimpleNamespace
|
5
|
+
|
6
|
+
GCS_SUBASSET_NAMES = SimpleNamespace(
|
7
|
+
SBML_DFS="sbml_dfs",
|
8
|
+
IDENTIFIERS="identifiers",
|
9
|
+
REGULATORY_GRAPH="regulatory_graph",
|
10
|
+
REGULATORY_DISTANCES="regulatory_distances",
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
GCS_FILETYPES = SimpleNamespace(
|
15
|
+
SBML_DFS="sbml_dfs.pkl",
|
16
|
+
IDENTIFIERS="identifiers.tsv",
|
17
|
+
REGULATORY_GRAPH="regulatory_graph.pkl",
|
18
|
+
REGULATORY_DISTANCES="regulatory_distances.json",
|
19
|
+
)
|
20
|
+
|
21
|
+
|
22
|
+
GCS_ASSETS = SimpleNamespace(
|
23
|
+
PROJECT="calico-public-data",
|
24
|
+
BUCKET="calico-cpr-public",
|
25
|
+
ASSETS={
|
26
|
+
"test_pathway": {
|
27
|
+
"file": "test_pathway.tar.gz",
|
28
|
+
"subassets": {
|
29
|
+
GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
|
30
|
+
GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
|
31
|
+
GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
|
32
|
+
GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
|
33
|
+
},
|
34
|
+
"public_url": "https://storage.googleapis.com/calico-cpr-public/test_pathway.tar.gz",
|
35
|
+
},
|
36
|
+
"human_consensus": {
|
37
|
+
"file": "human_consensus.tar.gz",
|
38
|
+
"subassets": {
|
39
|
+
GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
|
40
|
+
GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
|
41
|
+
GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
|
42
|
+
},
|
43
|
+
"public_url": "https://storage.googleapis.com/calico-cpr-public/human_consensus.tar.gz",
|
44
|
+
},
|
45
|
+
"human_consensus_w_distances": {
|
46
|
+
"file": "human_consensus_w_distances.tar.gz",
|
47
|
+
"subassets": {
|
48
|
+
GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
|
49
|
+
GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
|
50
|
+
GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
|
51
|
+
GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
|
52
|
+
},
|
53
|
+
"public_url": "https://storage.googleapis.com/calico-cpr-public/human_consensus_w_distances.tar.gz",
|
54
|
+
},
|
55
|
+
"reactome_members": {
|
56
|
+
"file": "external_pathways/external_pathways_reactome_neo4j_members.csv",
|
57
|
+
"subassets": None,
|
58
|
+
"public_url": "https://storage.googleapis.com/calico-cpr-public/external_pathways/external_pathways_reactome_neo4j_members.csv",
|
59
|
+
},
|
60
|
+
"reactome_xrefs": {
|
61
|
+
"file": "external_pathways/external_pathways_reactome_neo4j_crossref.csv",
|
62
|
+
"subassets": None,
|
63
|
+
"public_url": "https://storage.googleapis.com/calico-cpr-public/external_pathways/external_pathways_reactome_neo4j_crossref.csv",
|
64
|
+
},
|
65
|
+
},
|
66
|
+
)
|
67
|
+
|
68
|
+
|
69
|
+
INIT_DATA_DIR_MSG = "The `data_dir` {data_dir} does not exist."
|