napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,356 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from napistu import identifiers
7
+ from napistu import sbml_dfs_core
8
+ from napistu import source
9
+ from napistu import utils
10
+ from napistu.constants import BQB
11
+ from napistu.constants import COMPARTMENTS
12
+ from napistu.constants import MINI_SBO_FROM_NAME
13
+ from napistu.ingestion import cpr_edgelist
14
+ from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
15
+ from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
16
+ from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
17
+ from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
18
+ from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
19
+ from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
20
+ from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
21
+ from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
22
+ from napistu.ingestion.constants import STRING_PROTEIN_ID
23
+ from napistu.ingestion.constants import STRING_PROTEIN_ID_RAW
24
+ from napistu.ingestion.constants import STRING_SOURCE
25
+ from napistu.ingestion.constants import STRING_TARGET
26
+ from napistu.ingestion.constants import STRING_TAX_IDS
27
+ from napistu.ingestion.constants import STRING_UPSTREAM_COMPARTMENT
28
+ from napistu.ingestion.constants import STRING_UPSTREAM_NAME
29
+ from napistu.ingestion.constants import STRING_URL_EXPRESSIONS
30
+ from napistu.ingestion.constants import STRING_VERSION
31
+ from fs import open_fs
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
def get_string_species_url(
    species: str, asset: str, version: float = STRING_VERSION
) -> str:
    """
    STRING Species URL

    Construct urls for downloading specific STRING tables.

    Args:
        species (str): A species name: e.g., Homo sapiens.
        asset (str): The type of table to be downloaded. Currently "interactions" or "aliases".
        version (float): The version of STRING to work with.

    Returns:
        str: The download url

    Raises:
        ValueError: If `species` or `asset` is not a known key.
    """
    if species not in STRING_TAX_IDS:
        raise ValueError(
            f"{species} is not a valid value for species ids, valid ids are: {', '.join(STRING_TAX_IDS.keys())}"
        )

    if asset not in STRING_URL_EXPRESSIONS:
        raise ValueError(
            f"{asset} is not a valid value for a STRING asset, valid assets are: "
            f"{', '.join(STRING_URL_EXPRESSIONS.keys())}"
        )

    url_template = STRING_URL_EXPRESSIONS[asset]

    # The previous implementation eval()'d the template as an f-string, which
    # executes arbitrary expressions embedded in it. Only the names "taxid"
    # and "version" were ever provided to that eval, so str.format with the
    # same two keyword arguments produces the same URL without the
    # code-execution hazard.
    return url_template.format(taxid=STRING_TAX_IDS[species], version=version)
65
+
66
+
67
def download_string(target_uri: str, species: str) -> None:
    """Download the STRING interactions table for `species` to `target_uri`.

    Args:
        target_uri (str): target url
        species (str): A species name: e.g., Homo sapiens

    Returns:
        None
    """
    url = get_string_species_url(
        species, asset="interactions", version=STRING_VERSION
    )
    logger.info("Start downloading string db %s to %s", url, target_uri)
    utils.download_wget(url, target_uri)

    return None
85
+
86
+
87
def download_string_aliases(target_uri: str, species: str) -> None:
    """Download the STRING aliases table for `species` to `target_uri`.

    Args:
        target_uri (str): target url
        species (str): A species name: e.g., Homo sapiens

    Returns:
        None
    """
    aliases_url = get_string_species_url(
        species, asset="aliases", version=STRING_VERSION
    )
    logger.info(
        "Start downloading string aliases %s to %s", aliases_url, target_uri
    )
    utils.download_wget(aliases_url, target_uri)

    return None
106
+
107
+
108
def convert_string_to_sbml_dfs(
    string_uri: str,
    string_aliases_uri: str,
) -> sbml_dfs_core.SBML_dfs:
    """Convert raw STRING downloads into an SBML_dfs model.

    Args:
        string_uri (str): string uri
        string_aliases_uri (str): string aliases uri

    Returns:
        sbml_dfs_core.SBML_dfs: sbml dfs
    """
    # load the raw STRING tables
    edges = _read_string(string_uri)
    raw_aliases = _read_string_aliases(string_aliases_uri)

    # reciprocal edges (A-B and B-A) are both present and carry identical
    # attributes (e.g., combined_score), so keep a single copy of each pair
    deduped_edges = cpr_edgelist.remove_reciprocal_interactions(
        edges, extra_defining_vars=["combined_score"]
    )

    # per convention, unaggregated models receive an empty source
    interaction_source = source.Source(init=True)

    # map each STRING alias source onto an (ontology, biological qualifier) pair
    alias_to_identifier = {
        "Ensembl_gene": ("ensembl_gene", BQB.IS_ENCODED_BY),
        "Ensembl_transcript": ("ensembl_transcript", BQB.IS_ENCODED_BY),
        "Ensembl_translation": ("ensembl_protein", BQB.IS),
        "Ensembl_UniProt_AC": ("uniprot", BQB.IS),
    }

    # keep only the alias sources that we translate into identifiers
    string_aliases_fil = raw_aliases.query(
        "source in @alias_to_identifier.keys()"
    ).set_index(STRING_PROTEIN_ID)

    # drop the unfiltered table to save memory
    del raw_aliases

    # one species per protein, with an Identifiers object from its aliases
    species_df = _build_species_df(
        deduped_edges, string_aliases_fil, alias_to_identifier
    )

    # STRING provides no localization, so everything maps to the
    # catch-all GO CELLULAR_COMPONENT compartment
    compartments_df = sbml_dfs_core._stub_compartments()

    # format undirected interactions as reactions
    interaction_edgelist = _build_interactor_edgelist(deduped_edges)

    # assemble the final object
    return sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
        upstream_stoichiometry=0,
        downstream_stoichiometry=0,
        downstream_sbo_name="interactor",
        keep_reactions_data="string",
    )
178
+
179
+
180
def _read_string(string_uri: str) -> pd.DataFrame:
    """Load the space-delimited STRING interactions table.

    Args:
        string_uri (str): string uri

    Returns:
        pd.DataFrame: string edgelist
    """
    base_path, file_name = utils.get_source_base_and_path(string_uri)
    # TODO: test on gz versus txt
    with open_fs(base_path) as base_fs, base_fs.open(file_name, "rb") as fh:
        return pd.read_csv(fh, sep=" ")
195
+
196
+
197
def _read_string_aliases(string_aliases_uri: str) -> pd.DataFrame:
    """Load the tab-delimited STRING aliases table.

    Args:
        string_aliases_uri (str): string aliases uri

    Returns:
        pd.DataFrame: string aliases
    """
    base_path, file_name = utils.get_source_base_and_path(string_aliases_uri)
    # TODO: test on gz versus txt
    with open_fs(base_path) as base_fs, base_fs.open(file_name, "rb") as fh:
        aliases = pd.read_csv(fh, sep="\t")
    # the raw header column starts with '#'; normalize its name
    return aliases.rename(columns={STRING_PROTEIN_ID_RAW: STRING_PROTEIN_ID})
216
+
217
+
218
def _get_identifiers(
    row: pd.Series,
    alias_to_identifier: dict[str, tuple[str, str]],
    dat_alias: pd.DataFrame,
) -> identifiers.Identifiers:
    """Build an Identifiers object for one species row from the STRING alias table.

    Args:
        row (pd.Series): one row of the species frame (needs an `s_name` field)
        alias_to_identifier (dict[str, tuple[str, str]]):
            map from an alias source to an ontology and a qualifier
        dat_alias (pd.DataFrame): Helper dataframe with index=string_protein_id
            and columns=source (the source name), alias (the identifier)

    Returns:
        identifiers.Identifiers: An Identifiers object containing all identifiers
    """
    if row.shape[0] == 0:
        return identifiers.Identifiers([])
    # index with a list so a protein with exactly one alias row still yields
    # a DataFrame; `.loc[scalar]` would return a Series and break the
    # column filtering below
    d = dat_alias.loc[[row.s_name]]
    ids = []
    for source_name, (ontology, qualifier) in alias_to_identifier.items():
        # boolean mask instead of a string-built .query(): same selection
        # without interpolating source_name into an expression string
        for alias in d.loc[d["source"] == source_name, "alias"]:
            # Here we are creating an uri
            uri = identifiers.create_uri_url(ontology=ontology, identifier=alias)
            # This is exactly the output format from: identifiers.format_uri
            # We are doing it manually here to avoid the overhead of parsing
            # the uri again
            ids.append(
                {
                    "ontology": ontology,
                    "identifier": alias,
                    "bqb": qualifier,
                    "url": uri,
                }
            )
    return identifiers.Identifiers(ids)
255
+
256
+
257
def _build_species_df(
    edgelist: pd.DataFrame,
    aliases: pd.DataFrame,
    alias_to_identifier: dict,
    source_col: str = STRING_SOURCE,
    target_col: str = STRING_TARGET,
) -> pd.DataFrame:
    """Construct the species table for every protein seen in the edgelist.

    Args:
        edgelist (pd.DataFrame): edgelist
        aliases (pd.DataFrame): aliases
        alias_to_identifier (dict[str, tuple[str, str]]):
            map from an alias source to an ontology and a qualifier
        source_col (str): column holding the upstream protein id
        target_col (str): column holding the downstream protein id

    Returns:
        pd.DataFrame: species dataframe
    """
    # every protein appearing on either side of an edge becomes a species
    all_proteins = set(edgelist[source_col]).union(edgelist[target_col])
    names = pd.Series(list(all_proteins), name=SBML_SPECIES_DICT_NAME)

    # index by name as well so _get_identifiers can look up aliases per protein
    species_frame = names.to_frame().set_index(SBML_SPECIES_DICT_NAME, drop=False)

    # one Identifiers object per species, assembled from its aliases
    identifier_col = species_frame.apply(
        _get_identifiers,
        alias_to_identifier=alias_to_identifier,
        dat_alias=aliases,
        axis=1,
    ).rename(SBML_SPECIES_DICT_IDENTIFIERS)

    return identifier_col.reset_index()
292
+
293
+
294
def _build_interactor_edgelist(
    edgelist: pd.DataFrame,
    upstream_col_name: str = STRING_SOURCE,
    downstream_col_name: str = STRING_TARGET,
    add_reverse_interactions: bool = False,
    sbo_term: str = "interactor",
    compartment: str = COMPARTMENTS["CELLULAR_COMPONENT"],
) -> pd.DataFrame:
    """Format STRING interactions as reactions.

    Args:
        edgelist (pd.DataFrame): deduplicated STRING edges
        upstream_col_name (str): source column in `edgelist`
        downstream_col_name (str): target column in `edgelist`
        add_reverse_interactions (bool): if True, also emit each edge reversed
        sbo_term (str): SBO role name applied to the interactions
        compartment (str): compartment assigned to both interactors

    Returns:
        pd.DataFrame: reaction-style interaction edgelist
    """
    sbo_code = MINI_SBO_FROM_NAME[sbo_term]

    renamed = edgelist.rename(
        columns={
            upstream_col_name: STRING_UPSTREAM_NAME,
            downstream_col_name: STRING_DOWNSTREAM_NAME,
        }
    )
    dat = renamed.assign(
        **{
            STRING_UPSTREAM_COMPARTMENT: compartment,
            STRING_DOWNSTREAM_COMPARTMENT: compartment,
            SMBL_REACTION_SPEC_SBO_TERM: sbo_code,
            # the lambda yields a single empty Identifiers object which
            # pandas broadcasts (shared) across all rows
            SMBL_REACTION_DICT_IDENTIFIERS: lambda x: identifiers.Identifiers([]),
        }
    )

    if add_reverse_interactions:
        # append a copy of each edge with upstream/downstream swapped
        flipped = dat.rename(
            columns={
                STRING_UPSTREAM_NAME: STRING_DOWNSTREAM_NAME,
                STRING_DOWNSTREAM_NAME: STRING_UPSTREAM_NAME,
            }
        )
        dat = pd.concat([dat, flipped])

    dat[SMBL_REACTION_DICT_NAME] = _build_string_reaction_name(
        dat[STRING_UPSTREAM_NAME], dat[STRING_DOWNSTREAM_NAME]
    )
    # STRING edges are undirected, so reactions are marked reversible
    dat[SMBL_REACTION_DICT_IS_REVERSIBLE] = True

    return dat
344
+
345
+
346
+ def _build_string_reaction_name(from_col: pd.Series, to_col: pd.Series) -> pd.Series:
347
+ """Helper to build the reaction name for string reactions
348
+
349
+ Args:
350
+ from_col (pd.Series): from species
351
+ to_col (pd.Series): to species
352
+
353
+ Returns:
354
+ pd.Series: new name column
355
+ """
356
+ return from_col + " - " + to_col
@@ -0,0 +1,285 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from itertools import chain
5
+
6
+ import pandas as pd
7
+ from napistu import identifiers
8
+ from napistu import sbml_dfs_core
9
+ from napistu import source
10
+ from napistu import utils
11
+ from napistu.constants import MINI_SBO_FROM_NAME
12
+ from napistu.constants import SBOTERM_NAMES
13
+ from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_IDENTIFIERS
14
+ from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_NAME
15
+ from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
16
+ from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
17
+ from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
18
+ from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
19
+ from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
20
+ from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
21
+ from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
22
+ from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
23
+ from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
24
+ from napistu.ingestion.constants import STRING_UPSTREAM_COMPARTMENT
25
+ from napistu.ingestion.constants import STRING_UPSTREAM_NAME
26
+ from napistu.ingestion.constants import TRRUST_COMPARTMENT_NUCLEOPLASM
27
+ from napistu.ingestion.constants import TRRUST_COMPARTMENT_NUCLEOPLASM_GO_ID
28
+ from napistu.ingestion.constants import TRRUST_SYMBOL
29
+ from napistu.ingestion.constants import TRRUST_UNIPROT
30
+ from napistu.ingestion.constants import TRRUST_UNIPROT_ID
31
+ from napistu.ingestion.constants import TTRUST_URL_RAW_DATA_HUMAN
32
+ from napistu.ingestion.constants import TRRUST_SIGNS
33
+ from napistu.rpy2 import callr
34
+ from fs import open_fs
35
+
36
+
37
def download_trrust(target_uri: str) -> None:
    """Fetch the raw human TRRUST table and store it at `target_uri`.

    Args:
        target_uri (str): target url

    Returns:
        None
    """
    utils.download_wget(TTRUST_URL_RAW_DATA_HUMAN, target_uri)
    return None
49
+
50
+
51
def convert_trrust_to_sbml_dfs(
    trrust_uri: str,
) -> sbml_dfs_core.SBML_dfs:
    """Ingests trrust to sbml dfs

    Builds species (gene symbols annotated with uniprot identifiers), a single
    nucleoplasm compartment, and signed TF->target interactions from the raw
    TRRUST edgelist, then assembles and validates an SBML_dfs model.

    Args:
        trrust_uri (str): trrust uri

    Returns:
        sbml_dfs_core.SBML_dfs: the assembled (and validated) model
    """
    # Read trrust raw data
    trrust_edgelist = _read_trrust(trrust_uri)

    # Get uniprot to symbol mapping (via bioconductor annotations through rpy2)
    uniprot_2_symbol = _get_uniprot_2_symbol_mapping()

    # Start building new sbml dfs
    # Per convention unaggregated models receive an empty source
    interaction_source = source.Source(init=True)

    # Summarize edges: collapse duplicated TF->target rows into one row
    # carrying a consensus sign and the union of pubmed references
    edge_summaries_df = (
        trrust_edgelist.groupby(["from", "to"], as_index=True)
        .apply(_summarize_trrust_pairs)
        .reset_index(drop=False)
    )

    # define distinct species: every gene appearing as regulator or target;
    # the left merge can yield multiple rows per symbol (one per uniprot id)
    species_df = (
        pd.DataFrame(
            {
                SBML_SPECIES_DICT_NAME: list(
                    {*edge_summaries_df["from"], *edge_summaries_df["to"]}
                )
            }
        )
        .merge(
            uniprot_2_symbol.rename({TRRUST_SYMBOL: SBML_SPECIES_DICT_NAME}, axis=1),
            how="left",
        )
        .set_index(SBML_SPECIES_DICT_NAME)
    )

    # create Identifiers objects for all species with uniprot IDs
    species_w_ids = species_df[~species_df[TRRUST_UNIPROT_ID].isnull()].sort_index()
    species_w_ids["url"] = [
        identifiers.create_uri_url(ontology=TRRUST_UNIPROT, identifier=x)
        for x in species_w_ids[TRRUST_UNIPROT_ID]
    ]

    # create a series where each row is a gene with 1+ uniprot ids and the value is an
    # identifiers objects with all uniprot ids
    species_w_ids_series = pd.Series(
        [
            identifiers.Identifiers(
                [
                    identifiers.format_uri(uri=x, biological_qualifier_type="BQB_IS")
                    # list-indexer keeps the per-symbol selection a DataFrame
                    for x in species_w_ids.loc[[ind]]["url"].tolist()
                ]
            )
            for ind in species_w_ids.index.unique()
        ],
        index=species_w_ids.index.unique(),
    ).rename(SBML_SPECIES_DICT_IDENTIFIERS)

    # just retain s_name and s_Identifiers
    # this just needs a source object which will be added later
    species_df = (
        species_df.reset_index()
        .drop(TRRUST_UNIPROT_ID, axis=1)
        .drop_duplicates()
        .merge(
            species_w_ids_series,
            how="left",
            left_on=SBML_SPECIES_DICT_NAME,
            right_index=True,
        )
        .reset_index(drop=True)
    )
    # stub genes with missing IDs with a shared empty Identifiers object
    species_df[SBML_SPECIES_DICT_IDENTIFIERS] = species_df[SBML_SPECIES_DICT_IDENTIFIERS].fillna(  # type: ignore
        value=identifiers.Identifiers([])
    )

    # define distinct compartments: a single nucleoplasm compartment,
    # identified by its GO accession
    compartments_df = pd.DataFrame(
        {
            SBML_COMPARTMENT_DICT_NAME: TRRUST_COMPARTMENT_NUCLEOPLASM,
            SBML_COMPARTMENT_DICT_IDENTIFIERS: identifiers.Identifiers(
                [
                    identifiers.format_uri(
                        uri=identifiers.create_uri_url(
                            ontology="go",
                            identifier=TRRUST_COMPARTMENT_NUCLEOPLASM_GO_ID,
                        ),
                        biological_qualifier_type="BQB_IS",
                    )
                ]
            ),
        },
        index=[0],
    )

    # place both interaction partners in the nucleoplasm
    # NOTE(review): the literal kwargs below assume
    # STRING_UPSTREAM_COMPARTMENT == "upstream_compartment" and
    # STRING_DOWNSTREAM_COMPARTMENT == "downstream_compartment",
    # since those constants are used to select the columns later — confirm
    gene_gene_identifier_edgelist = edge_summaries_df.rename(
        {"from": STRING_UPSTREAM_NAME, "to": STRING_DOWNSTREAM_NAME}, axis=1
    ).assign(
        upstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
        downstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
    )
    # human-readable reaction names, e.g. "TF activation of target"
    gene_gene_identifier_edgelist[SMBL_REACTION_DICT_NAME] = [
        f"{x} {y} of {z}"
        for x, y, z in zip(
            gene_gene_identifier_edgelist[STRING_UPSTREAM_NAME],
            gene_gene_identifier_edgelist["sign"],
            gene_gene_identifier_edgelist[STRING_DOWNSTREAM_NAME],
        )
    ]

    # convert relationships to SBO terms
    interaction_edgelist = gene_gene_identifier_edgelist.replace(
        {"sign": MINI_SBO_FROM_NAME}
    ).rename({"sign": SMBL_REACTION_SPEC_SBO_TERM}, axis=1)

    # format pubmed identifiers of interactions
    interaction_edgelist[SMBL_REACTION_DICT_IDENTIFIERS] = [
        _format_pubmed_for_interactions(x) for x in interaction_edgelist["reference"]
    ]

    # directionality: by default, set r_isreversible to False for TRRUST data
    interaction_edgelist[SMBL_REACTION_DICT_IS_REVERSIBLE] = False

    # reduce to essential variables
    interaction_edgelist = interaction_edgelist[
        [
            STRING_UPSTREAM_NAME,
            STRING_DOWNSTREAM_NAME,
            STRING_UPSTREAM_COMPARTMENT,
            STRING_DOWNSTREAM_COMPARTMENT,
            SMBL_REACTION_DICT_NAME,
            SMBL_REACTION_SPEC_SBO_TERM,
            SMBL_REACTION_DICT_IDENTIFIERS,
            SMBL_REACTION_DICT_IS_REVERSIBLE,
        ]
    ]

    # Build sbml dfs
    sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
    )
    # raises if the assembled model is internally inconsistent
    sbml_dfs.validate()
    return sbml_dfs
207
+
208
+
209
def _read_trrust(trrust_uri: str) -> pd.DataFrame:
    """Read the raw (header-less) TRRUST tsv into an edgelist frame.

    Args:
        trrust_uri (str): uri to the trrust csv

    Returns:
        pd.DataFrame: columns from, to, sign, reference
    """
    base_path = os.path.dirname(trrust_uri)
    file_name = os.path.basename(trrust_uri)
    with open_fs(base_path) as base_fs, base_fs.open(file_name) as fh:
        edges = pd.read_csv(
            fh, sep="\t", names=["from", "to", "sign", "reference"]
        )
    return edges.drop_duplicates()
226
+
227
+
228
def _summarize_trrust_pairs(pair_data: pd.DataFrame) -> pd.Series:
    """Summarize a TF->target relationship based on the sign and source of the interaction."""

    observed = set(pair_data["sign"])
    has_activation = TRRUST_SIGNS.ACTIVATION in observed
    has_repression = TRRUST_SIGNS.REPRESSION in observed

    if has_activation and not has_repression:
        consensus = SBOTERM_NAMES.STIMULATOR
    elif has_repression and not has_activation:
        consensus = SBOTERM_NAMES.INHIBITOR
    else:
        # conflicting (both signs) or unknown evidence -> generic modifier
        consensus = SBOTERM_NAMES.MODIFIER

    # union of all pubmed ids across the collapsed rows (';'-delimited lists)
    refs = {ref for entry in pair_data["reference"] for ref in entry.split(";")}
    return pd.Series({"sign": consensus, "reference": refs})
243
+
244
+
245
def _get_uniprot_2_symbol_mapping() -> pd.DataFrame:
    """Create a mapping from Uniprot IDs to human gene symbols."""

    symbols = callr.r_dataframe_to_pandas(
        callr.bioconductor_org_r_function(
            TRRUST_SYMBOL.upper(), species=SPECIES_FULL_NAME_HUMAN
        )
    )
    # only look at symbols which uniquely map to a single gene
    counts = symbols.value_counts(TRRUST_SYMBOL)
    unique_symbols = counts[counts == 1].index.tolist()
    symbols = symbols[symbols[TRRUST_SYMBOL].isin(unique_symbols)]

    # one entrez -> multiple uniprot IDs is okay
    uniprots = callr.r_dataframe_to_pandas(
        callr.bioconductor_org_r_function(
            TRRUST_UNIPROT.upper(), species=SPECIES_FULL_NAME_HUMAN
        )
    )

    # join on the shared entrez gene_id column, then drop it
    return symbols.merge(uniprots).drop("gene_id", axis=1)
269
+
270
+
271
def _format_pubmed_for_interactions(pubmed_set):
    """Format a set of pubmed ids as an Identifiers object."""

    formatted = []
    for pubmed_id in pubmed_set:
        # some pubmed IDs are bogus; strict=False makes create_uri_url
        # return None for those instead of raising
        url = identifiers.create_uri_url(
            ontology="pubmed", identifier=pubmed_id, strict=False
        )
        if url is None:
            continue
        formatted.append(
            identifiers.format_uri(
                uri=url, biological_qualifier_type="BQB_IS_DESCRIBED_BY"
            )
        )
    return identifiers.Identifiers(formatted)