napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
@@ -0,0 +1,356 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
from napistu import identifiers
|
7
|
+
from napistu import sbml_dfs_core
|
8
|
+
from napistu import source
|
9
|
+
from napistu import utils
|
10
|
+
from napistu.constants import BQB
|
11
|
+
from napistu.constants import COMPARTMENTS
|
12
|
+
from napistu.constants import MINI_SBO_FROM_NAME
|
13
|
+
from napistu.ingestion import cpr_edgelist
|
14
|
+
from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
|
15
|
+
from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
|
16
|
+
from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
|
17
|
+
from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
|
18
|
+
from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
|
19
|
+
from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
|
20
|
+
from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
|
21
|
+
from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
|
22
|
+
from napistu.ingestion.constants import STRING_PROTEIN_ID
|
23
|
+
from napistu.ingestion.constants import STRING_PROTEIN_ID_RAW
|
24
|
+
from napistu.ingestion.constants import STRING_SOURCE
|
25
|
+
from napistu.ingestion.constants import STRING_TARGET
|
26
|
+
from napistu.ingestion.constants import STRING_TAX_IDS
|
27
|
+
from napistu.ingestion.constants import STRING_UPSTREAM_COMPARTMENT
|
28
|
+
from napistu.ingestion.constants import STRING_UPSTREAM_NAME
|
29
|
+
from napistu.ingestion.constants import STRING_URL_EXPRESSIONS
|
30
|
+
from napistu.ingestion.constants import STRING_VERSION
|
31
|
+
from fs import open_fs
|
32
|
+
|
33
|
+
logger = logging.getLogger(__name__)
|
34
|
+
|
35
|
+
|
36
|
+
def get_string_species_url(
    species: str, asset: str, version: float = STRING_VERSION
) -> str:
    """
    STRING Species URL

    Construct urls for downloading specific STRING tables.

    Args:
        species (str): A species name: e.g., Homo sapiens.
        asset (str): The type of table to be downloaded. Currently "interactions" or "aliases".
        version (float): The version of STRING to work with.

    Returns:
        str: The download url

    Raises:
        ValueError: If species is not a known STRING taxon or asset is not a
            known STRING asset.
    """
    if species not in STRING_TAX_IDS:
        raise ValueError(
            f"{species} is not a valid value for species ids, valid ids are: {', '.join(STRING_TAX_IDS.keys())}"
        )

    if asset not in STRING_URL_EXPRESSIONS:
        raise ValueError(
            f"{asset} is not a valid value for a STRING asset, valid assets are: "
            f"{', '.join(STRING_URL_EXPRESSIONS.keys())}"
        )

    url_fstring = STRING_URL_EXPRESSIONS[asset]

    # The URL templates are trusted package constants evaluated as f-strings.
    # Evaluating with an explicit empty __builtins__ restricts the template to
    # exactly the {taxid} and {version} substitutions it is supposed to use,
    # rather than the full builtin namespace eval would otherwise expose.
    # NOTE(review): flagged because eval on a string template is a classic
    # injection hazard if these constants ever become user-configurable.
    return eval(
        f'f"{url_fstring}"',
        {
            "__builtins__": {},
            "taxid": STRING_TAX_IDS[species],
            "version": version,
        },
    )
|
65
|
+
|
66
|
+
|
67
|
+
def download_string(target_uri: str, species: str) -> None:
    """Download the STRING interactions table for a species.

    Args:
        target_uri (str): target url
        species (str): A species name: e.g., Homo sapiens

    Returns:
        None
    """
    interactions_url = get_string_species_url(
        species, asset="interactions", version=STRING_VERSION
    )
    logger.info("Start downloading string db %s to %s", interactions_url, target_uri)
    utils.download_wget(interactions_url, target_uri)

    return None
|
85
|
+
|
86
|
+
|
87
|
+
def download_string_aliases(target_uri: str, species: str) -> None:
    """Download the STRING aliases table for a species.

    Args:
        target_uri (str): target url
        species (str): A species name: e.g., Homo sapiens

    Returns:
        None
    """
    aliases_url = get_string_species_url(
        species, asset="aliases", version=STRING_VERSION
    )
    logger.info(
        "Start downloading string aliases %s to %s", aliases_url, target_uri
    )
    utils.download_wget(aliases_url, target_uri)

    return None
|
106
|
+
|
107
|
+
|
108
|
+
def convert_string_to_sbml_dfs(
    string_uri: str,
    string_aliases_uri: str,
) -> sbml_dfs_core.SBML_dfs:
    """Ingest a STRING edgelist plus alias table into an SBML_dfs model.

    Args:
        string_uri (str): string uri
        string_aliases_uri (str): string aliases uri

    Returns:
        sbml_dfs_core.SBML_dfs: sbml dfs
    """
    # Load the raw STRING tables
    string_edgelist = _read_string(string_uri)
    string_aliases = _read_string_aliases(string_aliases_uri)

    # STRING lists each interaction twice (A-B and B-A) with identical
    # attributes (e.g., combined_score); keep one representative per pair.
    uq_string_edgelist = cpr_edgelist.remove_reciprocal_interactions(
        string_edgelist, extra_defining_vars=["combined_score"]
    )

    # Per convention unaggregated models receive an empty source
    interaction_source = source.Source(init=True)

    # map each alias source onto an (ontology, biological qualifier) pair
    alias_to_identifier = {
        "Ensembl_gene": ("ensembl_gene", BQB.IS_ENCODED_BY),
        "Ensembl_transcript": ("ensembl_transcript", BQB.IS_ENCODED_BY),
        "Ensembl_translation": ("ensembl_protein", BQB.IS),
        "Ensembl_UniProt_AC": ("uniprot", BQB.IS),
    }

    # keep only the alias sources we can translate, indexed by protein id
    string_aliases_fil = string_aliases.query(
        "source in @alias_to_identifier.keys()"
    ).set_index(STRING_PROTEIN_ID)

    # to save on memory
    del string_aliases

    # distinct molecular species annotated with their identifiers
    species_df = _build_species_df(
        uq_string_edgelist, string_aliases_fil, alias_to_identifier
    )

    # Compartments: everything is mapped to `CELLULAR_COMPONENT`,
    # the catch-all GO term for unknown localisation.
    compartments_df = sbml_dfs_core._stub_compartments()

    # format the STRING interactions as reactions
    interaction_edgelist = _build_interactor_edgelist(uq_string_edgelist)

    # assemble the final model
    return sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
        upstream_stoichiometry=0,
        downstream_stoichiometry=0,
        downstream_sbo_name="interactor",
        keep_reactions_data="string",
    )
|
178
|
+
|
179
|
+
|
180
|
+
def _read_string(string_uri: str) -> pd.DataFrame:
    """Load the STRING interaction edgelist from a uri.

    Args:
        string_uri (str): string uri

    Returns:
        pd.DataFrame: string edgelist
    """
    base_path, file_name = utils.get_source_base_and_path(string_uri)
    # TODO: test on gz versus txt
    with open_fs(base_path) as base_fs, base_fs.open(file_name, "rb") as f:
        # STRING interaction files are space-delimited
        return pd.read_csv(f, sep=" ")
|
195
|
+
|
196
|
+
|
197
|
+
def _read_string_aliases(string_aliases_uri: str) -> pd.DataFrame:
    """Load the STRING alias table from a uri.

    Args:
        string_aliases_uri (str): string aliases uri

    Returns:
        pd.DataFrame: string aliases
    """
    base_path, file_name = utils.get_source_base_and_path(string_aliases_uri)
    # TODO: test on gz versus txt
    with open_fs(base_path) as base_fs, base_fs.open(file_name, "rb") as f:
        raw_aliases = pd.read_csv(f, sep="\t")
    # the raw header prefixes the protein id column with '#'; normalize it
    return raw_aliases.rename(columns={STRING_PROTEIN_ID_RAW: STRING_PROTEIN_ID})
|
216
|
+
|
217
|
+
|
218
|
+
def _get_identifiers(
    row: pd.DataFrame,
    alias_to_identifier: dict[str, tuple[str, str]],
    dat_alias: pd.DataFrame,
) -> identifiers.Identifiers:
    """Helper function to get identifiers from a row of the string alias file

    Args:
        row (pd.DataFrame): grouped dataframe
        alias_to_identifier (dict[str, tuple[str, str]]):
            map from an alias source to an ontology and a qualifier
        dat_alias (pd.DataFrame): Helper dataframe with index=string_protein_id
            and columns=source (the source name), alias (the identifier)

    Returns:
        identifiers.Identifiers: An Identifiers object containing all identifiers
    """
    # NOTE(review): when invoked via DataFrame.apply(axis=1) (see
    # _build_species_df), `row` is a Series and shape[0] counts its fields,
    # not matching rows — confirm this guard triggers as intended.
    if row.shape[0] == 0:
        return identifiers.Identifiers([])
    # All alias records for this protein, selected by string protein id.
    # NOTE(review): .loc collapses to a Series when exactly one alias row
    # matches — verify every protein carries multiple alias rows, otherwise
    # the .query below would fail on a Series.
    d = dat_alias.loc[row.s_name]
    ids = []
    for source_name, (ontology, qualifier) in alias_to_identifier.items():
        # each alias from this source contributes one identifier entry
        for identifier in d.query(f"source == '{source_name}'")["alias"]:
            # Here we creating an uri
            uri = identifiers.create_uri_url(ontology=ontology, identifier=identifier)
            # This is exactly the output format from: identifiers.format_uri
            # We are doing it manually here to avoid the overhead of parsing
            # the uri again
            id_dict = {
                "ontology": ontology,
                "identifier": identifier,
                "bqb": qualifier,
                "url": uri,
            }
            ids.append(id_dict)
    identifier = identifiers.Identifiers(ids)
    return identifier
|
255
|
+
|
256
|
+
|
257
|
+
def _build_species_df(
    edgelist: pd.DataFrame,
    aliases: pd.DataFrame,
    alias_to_identifier: dict,
    source_col: str = STRING_SOURCE,
    target_col: str = STRING_TARGET,
) -> pd.DataFrame:
    """Builds the species dataframe from the edgelist and aliases

    Args:
        edgelist (pd.DataFrame): edgelist
        aliases (pd.DataFrame): aliases
        alias_to_identifier (dict[str, tuple[str, str]]):
            map from an alias source to an ontology and a qualifier

    Returns:
        pd.DataFrame: species dataframe
    """
    # every protein appearing on either side of an interaction
    all_species = set(edgelist[source_col]).union(edgelist[target_col])

    # index by species name (kept as a column too) so _get_identifiers can
    # look each protein up in the alias table
    name_frame = (
        pd.Series(list(all_species), name=SBML_SPECIES_DICT_NAME)
        .to_frame()
        .set_index(SBML_SPECIES_DICT_NAME, drop=False)
    )

    # one Identifiers object per species
    identifier_series = name_frame.apply(
        _get_identifiers,
        alias_to_identifier=alias_to_identifier,
        dat_alias=aliases,
        axis=1,
    ).rename(SBML_SPECIES_DICT_IDENTIFIERS)

    return identifier_series.reset_index()
|
292
|
+
|
293
|
+
|
294
|
+
def _build_interactor_edgelist(
    edgelist: pd.DataFrame,
    upstream_col_name: str = STRING_SOURCE,
    downstream_col_name: str = STRING_TARGET,
    add_reverse_interactions: bool = False,
    sbo_term: str = "interactor",
    compartment: str = COMPARTMENTS["CELLULAR_COMPONENT"],
) -> pd.DataFrame:
    """Format STRING interactions as reactions.

    Args:
        edgelist (pd.DataFrame): STRING interaction edgelist
        upstream_col_name (str): column holding the upstream species
        downstream_col_name (str): column holding the downstream species
        add_reverse_interactions (bool): also emit each edge in the
            reverse orientation
        sbo_term (str): SBO term name applied to every interaction
        compartment (str): compartment assigned to both interaction partners

    Returns:
        pd.DataFrame: interaction edgelist formatted as reactions
    """
    sbo_id = MINI_SBO_FROM_NAME[sbo_term]

    interaction_edgelist = edgelist.rename(
        columns={
            upstream_col_name: STRING_UPSTREAM_NAME,
            downstream_col_name: STRING_DOWNSTREAM_NAME,
        }
    ).assign(
        **{
            STRING_UPSTREAM_COMPARTMENT: compartment,
            STRING_DOWNSTREAM_COMPARTMENT: compartment,
            SMBL_REACTION_SPEC_SBO_TERM: sbo_id,
            SMBL_REACTION_DICT_IDENTIFIERS: lambda x: identifiers.Identifiers([]),
        }
    )

    if add_reverse_interactions:
        # append each edge with its endpoints swapped
        reversed_edges = interaction_edgelist.rename(
            columns={
                STRING_UPSTREAM_NAME: STRING_DOWNSTREAM_NAME,
                STRING_DOWNSTREAM_NAME: STRING_UPSTREAM_NAME,
            }
        )
        interaction_edgelist = pd.concat([interaction_edgelist, reversed_edges])

    interaction_edgelist[SMBL_REACTION_DICT_NAME] = _build_string_reaction_name(
        interaction_edgelist[STRING_UPSTREAM_NAME],
        interaction_edgelist[STRING_DOWNSTREAM_NAME],
    )
    interaction_edgelist[SMBL_REACTION_DICT_IS_REVERSIBLE] = True

    return interaction_edgelist
|
344
|
+
|
345
|
+
|
346
|
+
def _build_string_reaction_name(from_col: pd.Series, to_col: pd.Series) -> pd.Series:
|
347
|
+
"""Helper to build the reaction name for string reactions
|
348
|
+
|
349
|
+
Args:
|
350
|
+
from_col (pd.Series): from species
|
351
|
+
to_col (pd.Series): to species
|
352
|
+
|
353
|
+
Returns:
|
354
|
+
pd.Series: new name column
|
355
|
+
"""
|
356
|
+
return from_col + " - " + to_col
|
@@ -0,0 +1,285 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
from itertools import chain
|
5
|
+
|
6
|
+
import pandas as pd
|
7
|
+
from napistu import identifiers
|
8
|
+
from napistu import sbml_dfs_core
|
9
|
+
from napistu import source
|
10
|
+
from napistu import utils
|
11
|
+
from napistu.constants import MINI_SBO_FROM_NAME
|
12
|
+
from napistu.constants import SBOTERM_NAMES
|
13
|
+
from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_IDENTIFIERS
|
14
|
+
from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_NAME
|
15
|
+
from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
|
16
|
+
from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
|
17
|
+
from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
|
18
|
+
from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
|
19
|
+
from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
|
20
|
+
from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
|
21
|
+
from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
|
22
|
+
from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
|
23
|
+
from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
|
24
|
+
from napistu.ingestion.constants import STRING_UPSTREAM_COMPARTMENT
|
25
|
+
from napistu.ingestion.constants import STRING_UPSTREAM_NAME
|
26
|
+
from napistu.ingestion.constants import TRRUST_COMPARTMENT_NUCLEOPLASM
|
27
|
+
from napistu.ingestion.constants import TRRUST_COMPARTMENT_NUCLEOPLASM_GO_ID
|
28
|
+
from napistu.ingestion.constants import TRRUST_SYMBOL
|
29
|
+
from napistu.ingestion.constants import TRRUST_UNIPROT
|
30
|
+
from napistu.ingestion.constants import TRRUST_UNIPROT_ID
|
31
|
+
from napistu.ingestion.constants import TTRUST_URL_RAW_DATA_HUMAN
|
32
|
+
from napistu.ingestion.constants import TRRUST_SIGNS
|
33
|
+
from napistu.rpy2 import callr
|
34
|
+
from fs import open_fs
|
35
|
+
|
36
|
+
|
37
|
+
def download_trrust(target_uri: str) -> None:
    """Download the human TRRUST raw-data table to the target uri.

    Args:
        target_uri (str): target url

    Returns:
        None
    """
    # fetch the canonical human TRRUST tsv
    utils.download_wget(TTRUST_URL_RAW_DATA_HUMAN, target_uri)
    return None
|
49
|
+
|
50
|
+
|
51
|
+
def convert_trrust_to_sbml_dfs(
    trrust_uri: str,
) -> sbml_dfs_core.SBML_dfs:
    """Ingests trrust to sbml dfs

    Args:
        trrust_uri (str): trrust uri

    Returns:
        sbml_dfs
    """
    # Read trrust raw data
    trrust_edgelist = _read_trrust(trrust_uri)

    # Get uniprot to symbol mapping
    uniprot_2_symbol = _get_uniprot_2_symbol_mapping()

    # Start building new sbml dfs
    # Per convention unaggregated models receive an empty source
    interaction_source = source.Source(init=True)

    # Summarize edges

    # Collapse duplicate TF->target rows to one record with a consensus sign
    # and the union of literature references.
    edge_summaries_df = (
        trrust_edgelist.groupby(["from", "to"], as_index=True)
        .apply(_summarize_trrust_pairs)
        .reset_index(drop=False)
    )

    # define distinct species
    # Left-join gene symbols to their uniprot ids; symbols without a mapping
    # keep a null uniprot column.
    species_df = (
        pd.DataFrame(
            {
                SBML_SPECIES_DICT_NAME: list(
                    {*edge_summaries_df["from"], *edge_summaries_df["to"]}
                )
            }
        )
        .merge(
            uniprot_2_symbol.rename({TRRUST_SYMBOL: SBML_SPECIES_DICT_NAME}, axis=1),
            how="left",
        )
        .set_index(SBML_SPECIES_DICT_NAME)
    )

    # create Identifiers objects for all species with uniprot IDs
    # NOTE(review): species_w_ids is a boolean-mask slice of species_df, so
    # the "url" assignment below may raise SettingWithCopyWarning — confirm
    # species_df is not expected to see the new column.
    species_w_ids = species_df[~species_df[TRRUST_UNIPROT_ID].isnull()].sort_index()
    species_w_ids["url"] = [
        identifiers.create_uri_url(ontology=TRRUST_UNIPROT, identifier=x)
        for x in species_w_ids[TRRUST_UNIPROT_ID]
    ]

    # create a series where each row is a gene with 1+ uniprot ids and the value is an
    # identifiers objects with all uniprot ids
    species_w_ids_series = pd.Series(
        [
            identifiers.Identifiers(
                [
                    identifiers.format_uri(uri=x, biological_qualifier_type="BQB_IS")
                    for x in species_w_ids.loc[[ind]]["url"].tolist()
                ]
            )
            for ind in species_w_ids.index.unique()
        ],
        index=species_w_ids.index.unique(),
    ).rename(SBML_SPECIES_DICT_IDENTIFIERS)

    # just retain s_name and s_Identifiers
    # this just needs a source object which will be added later
    species_df = (
        species_df.reset_index()
        .drop(TRRUST_UNIPROT_ID, axis=1)
        .drop_duplicates()
        .merge(
            species_w_ids_series,
            how="left",
            left_on=SBML_SPECIES_DICT_NAME,
            right_index=True,
        )
        .reset_index(drop=True)
    )
    # stub genes with missing IDs
    species_df[SBML_SPECIES_DICT_IDENTIFIERS] = species_df[SBML_SPECIES_DICT_IDENTIFIERS].fillna(  # type: ignore
        value=identifiers.Identifiers([])
    )

    # define distinct compartments
    # Single nucleoplasm compartment annotated with its GO id.
    compartments_df = pd.DataFrame(
        {
            SBML_COMPARTMENT_DICT_NAME: TRRUST_COMPARTMENT_NUCLEOPLASM,
            SBML_COMPARTMENT_DICT_IDENTIFIERS: identifiers.Identifiers(
                [
                    identifiers.format_uri(
                        uri=identifiers.create_uri_url(
                            ontology="go",
                            identifier=TRRUST_COMPARTMENT_NUCLEOPLASM_GO_ID,
                        ),
                        biological_qualifier_type="BQB_IS",
                    )
                ]
            ),
        },
        index=[0],
    )

    # Both interaction partners are placed in the nucleoplasm compartment.
    gene_gene_identifier_edgelist = edge_summaries_df.rename(
        {"from": STRING_UPSTREAM_NAME, "to": STRING_DOWNSTREAM_NAME}, axis=1
    ).assign(
        upstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
        downstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
    )
    # human-readable reaction label, e.g. "TF stimulator of target"
    gene_gene_identifier_edgelist[SMBL_REACTION_DICT_NAME] = [
        f"{x} {y} of {z}"
        for x, y, z in zip(
            gene_gene_identifier_edgelist[STRING_UPSTREAM_NAME],
            gene_gene_identifier_edgelist["sign"],
            gene_gene_identifier_edgelist[STRING_DOWNSTREAM_NAME],
        )
    ]

    # convert relationships to SBO terms
    interaction_edgelist = gene_gene_identifier_edgelist.replace(
        {"sign": MINI_SBO_FROM_NAME}
    ).rename({"sign": SMBL_REACTION_SPEC_SBO_TERM}, axis=1)

    # format pubmed identifiers of interactions
    interaction_edgelist[SMBL_REACTION_DICT_IDENTIFIERS] = [
        _format_pubmed_for_interactions(x) for x in interaction_edgelist["reference"]
    ]

    # directionality: by default, set r_isreversible to False for TRRUST data
    interaction_edgelist[SMBL_REACTION_DICT_IS_REVERSIBLE] = False

    # reduce to essential variables
    interaction_edgelist = interaction_edgelist[
        [
            STRING_UPSTREAM_NAME,
            STRING_DOWNSTREAM_NAME,
            STRING_UPSTREAM_COMPARTMENT,
            STRING_DOWNSTREAM_COMPARTMENT,
            SMBL_REACTION_DICT_NAME,
            SMBL_REACTION_SPEC_SBO_TERM,
            SMBL_REACTION_DICT_IDENTIFIERS,
            SMBL_REACTION_DICT_IS_REVERSIBLE,
        ]
    ]

    # Build sbml dfs
    sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
    )
    sbml_dfs.validate()
    return sbml_dfs
|
207
|
+
|
208
|
+
|
209
|
+
def _read_trrust(trrust_uri: str) -> pd.DataFrame:
    """Read trrust csv

    Args:
        trrust_uri (str): uri to the trrust csv

    Returns:
        pd.DataFrame: Data Frame
    """
    base_path, file_name = os.path.split(trrust_uri)
    with open_fs(base_path) as base_fs, base_fs.open(file_name) as f:
        # headerless tab-separated file: TF, target, sign, pubmed references
        edges = pd.read_csv(f, sep="\t", names=["from", "to", "sign", "reference"])
    return edges.drop_duplicates()
|
226
|
+
|
227
|
+
|
228
|
+
def _summarize_trrust_pairs(pair_data: pd.DataFrame) -> pd.Series:
    """Summarize a TF->target relationship based on the sign and source of the interaction."""

    observed_signs = set(pair_data["sign"])
    has_activation = TRRUST_SIGNS.ACTIVATION in observed_signs
    has_repression = TRRUST_SIGNS.REPRESSION in observed_signs

    # activation-only -> stimulator; repression-only -> inhibitor;
    # conflicting or unknown evidence -> generic modifier
    if has_activation and not has_repression:
        sign = SBOTERM_NAMES.STIMULATOR
    elif has_repression and not has_activation:
        sign = SBOTERM_NAMES.INHIBITOR
    else:
        sign = SBOTERM_NAMES.MODIFIER

    # union of all semicolon-delimited references across duplicate rows
    refs = set(chain.from_iterable(ref.split(";") for ref in pair_data["reference"]))
    return pd.Series({"sign": sign, "reference": refs})
|
243
|
+
|
244
|
+
|
245
|
+
def _get_uniprot_2_symbol_mapping() -> pd.DataFrame:
    """Create a mapping from Uniprot IDs to human gene symbols."""

    # pull the entrez -> symbol table from the Bioconductor org package
    entrez_2_symbol = callr.r_dataframe_to_pandas(
        callr.bioconductor_org_r_function(
            TRRUST_SYMBOL.upper(), species=SPECIES_FULL_NAME_HUMAN
        )
    )
    # only look at symbol which uniquely map to a single gene
    symbol_counts = entrez_2_symbol.value_counts(TRRUST_SYMBOL)
    unambiguous_symbols = symbol_counts[symbol_counts == 1].index.tolist()
    entrez_2_symbol = entrez_2_symbol[
        entrez_2_symbol[TRRUST_SYMBOL].isin(unambiguous_symbols)
    ]

    # one entrez -> multiple uniprot IDs is okay
    entrez_2_uniprot = callr.r_dataframe_to_pandas(
        callr.bioconductor_org_r_function(
            TRRUST_UNIPROT.upper(), species=SPECIES_FULL_NAME_HUMAN
        )
    )

    # join on the shared entrez column, then drop it
    return entrez_2_symbol.merge(entrez_2_uniprot).drop("gene_id", axis=1)
|
269
|
+
|
270
|
+
|
271
|
+
def _format_pubmed_for_interactions(pubmed_set):
    """Format a set of pubmed ids as an Identifiers object."""

    formatted_ids = []
    for pubmed_id in pubmed_set:
        # some pubmed IDs are bogus; create_uri_url returns None for those
        url = identifiers.create_uri_url(
            ontology="pubmed", identifier=pubmed_id, strict=False
        )
        if url is None:
            continue
        formatted_ids.append(
            identifiers.format_uri(
                uri=url, biological_qualifier_type="BQB_IS_DESCRIBED_BY"
            )
        )
    return identifiers.Identifiers(formatted_ids)
|