napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,276 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import xml.etree.ElementTree as ET
6
+ from typing import Any
7
+
8
+ from napistu import utils
9
+ from napistu.ingestion.constants import PSI_MI_INTACT_DEFAULT_OUTPUT_DIR
10
+ from napistu.ingestion.constants import PSI_MI_INTACT_FTP_URL
11
+ from napistu.ingestion.constants import PSI_MI_INTACT_SPECIES_TO_BASENAME
12
+ from napistu.ingestion.constants import PSI_MI_INTACT_XML_NAMESPACE
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def format_psi(
    xml_path: str, xml_namespace: str = PSI_MI_INTACT_XML_NAMESPACE
) -> list[dict[str, Any]]:
    """
    Format PSI 3.0

    Format an .xml file containing molecular interactions following the PSI 3.0 format.

    Args:
        xml_path (str): path to a .xml file
        xml_namespace (str): Namespace prefix which is prepended to tag names
            when validating the root element and locating its entries.
            NOTE(review): the per-entry helpers in this module still compare
            tags against PSI_MI_INTACT_XML_NAMESPACE, so a non-default value
            is only honoured at this top level.

    Returns:
        entry_list (list): a list containing molecular interaction entry dicts of the format:
            - source : dict containing the database that interactions were drawn from.
            - experiment : a simple summary of the experimental design and the publication.
            - interactor_list : list containing dictionaries annotating the molecules
                (defined by their "interactor_id") involved in interactions.
            - interactions_list : list containing dictionaries annotating molecular
                interactions involving a set of "interactor_id"s.

    Raises:
        FileNotFoundError: if xml_path does not point to an existing file.
        AssertionError: if the root element is not an "entrySet" in xml_namespace.
    """

    if not os.path.isfile(xml_path):
        raise FileNotFoundError(f"{xml_path} was not found")

    # the root should be an entrySet if this is a PSI 3.0 file
    entry_set = ET.parse(xml_path).getroot()
    assert entry_set.tag == xml_namespace + "entrySet"

    # use the caller-supplied namespace rather than the module default so the
    # documented parameter actually takes effect
    entry_nodes = entry_set.findall(f"./{xml_namespace}entry")

    logger.info(f"Processing {len(entry_nodes)} entries from {xml_path}")

    return [_format_entry(an_entry) for an_entry in entry_nodes]
56
+
57
+
58
def _download_intact_species(
    species: str,
    output_dir_path: str = PSI_MI_INTACT_DEFAULT_OUTPUT_DIR,
    overwrite: bool = False,
):
    """
    Download IntAct Species

    Download the PSI 3.0 XML files from IntAct for a species of interest.

    Args:
        species (str): The species name (Genus species) to work with
        output_dir_path (str): Local directory to create and unzip files into
        overwrite (bool): Overwrite an existing output directory. Default: False

    Returns:
        None

    Raises:
        ValueError: if species is not a key of PSI_MI_INTACT_SPECIES_TO_BASENAME.
    """
    if species not in PSI_MI_INTACT_SPECIES_TO_BASENAME:
        raise ValueError(
            f"The provided species {species} did not match any of the species in INTACT_SPECIES_TO_BASENAME: "
            f"{', '.join(PSI_MI_INTACT_SPECIES_TO_BASENAME.keys())}"
        )

    # URLs always use "/" as separator; os.path.join would insert "\" on
    # Windows, so the URL is assembled explicitly instead.
    intact_species_url = (
        f"{PSI_MI_INTACT_FTP_URL.rstrip('/')}/"
        f"{PSI_MI_INTACT_SPECIES_TO_BASENAME[species]}.zip"
    )

    logger.info(f"Downloading and unzipping {intact_species_url}")

    utils.download_and_extract(
        intact_species_url,
        output_dir_path=output_dir_path,
        download_method="ftp",
        overwrite=overwrite,
    )
95
+
96
+
97
def _format_entry(an_entry) -> dict[str, Any]:
    """
    Extract a single XML entry of interactors and interactions.

    Args:
        an_entry: an XML element whose tag is the namespaced "entry"

    Returns:
        dict with the keys "source", "experiment", "interactor_list" and
        "interactions_list", each produced by the matching module helper.
    """
    expected_tag = PSI_MI_INTACT_XML_NAMESPACE + "entry"
    assert an_entry.tag == expected_tag

    # (output key, formatter) pairs; evaluated in this order
    section_formatters = (
        ("source", _format_entry_source),
        ("experiment", _format_entry_experiment),
        ("interactor_list", _format_entry_interactor_list),
        ("interactions_list", _format_entry_interactions),
    )

    return {key: formatter(an_entry) for key, formatter in section_formatters}
110
+
111
+
112
def _format_entry_source(an_entry) -> dict[str, str]:
    """
    Format the source describing the provenance of an XML entry.

    Args:
        an_entry: an XML element whose tag is the namespaced "entry"

    Returns:
        dict with "short_label" and "full_name" taken from the text of the
        shortLabel and fullName nodes found under the entry's source names.
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert an_entry.tag == ns + "entry"

    names_node = an_entry.find(f".{ns}source/.{ns}names")

    short_label = names_node.find(f".{ns}shortLabel").text
    full_name = names_node.find(f".{ns}fullName").text

    return {"short_label": short_label, "full_name": full_name}
129
+
130
+
131
def _format_entry_experiment(an_entry) -> dict[str, str]:
    """
    Format experiment-level information in an XML entry.

    Only the first experimentDescription matched under the entry's
    experimentList is read (``find`` returns the first match).

    Args:
        an_entry: an XML element whose tag is the namespaced "entry"

    Returns:
        dict with "experiment_name", "interaction_method", "primary_ref_db"
        and "primary_ref_id".
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert an_entry.tag == ns + "entry"

    description = an_entry.find(f".{ns}experimentList/.{ns}experimentDescription")
    bib_primary_ref = description.find(f".{ns}bibref/{ns}xref/{ns}primaryRef")

    experiment_name = description.find(f".{ns}names/{ns}fullName").text
    interaction_method = description.find(
        f".{ns}interactionDetectionMethod/{ns}names/{ns}fullName"
    ).text

    return {
        "experiment_name": experiment_name,
        "interaction_method": interaction_method,
        "primary_ref_db": bib_primary_ref.attrib["db"],
        "primary_ref_id": bib_primary_ref.attrib["id"],
    }
157
+
158
+
159
def _format_entry_interactor_list(an_entry) -> list[dict[str, Any]]:
    """
    Format the molecular interactors in an XML entry.

    Args:
        an_entry: an XML element whose tag is the namespaced "entry"

    Returns:
        list with one formatted dict per child of the entry's interactorList.
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert an_entry.tag == ns + "entry"

    interactor_nodes = an_entry.find(f"./{ns}interactorList")

    return list(map(_format_entry_interactor, interactor_nodes))
167
+
168
+
169
def _format_entry_interactor(interactor) -> dict[str, Any]:
    """
    Format a single molecular interactor in an interaction list XML node.

    Args:
        interactor: an XML element whose tag is the namespaced "interactor"

    Returns:
        dict with "interactor_id", "interactor_label", "interactor_name"
        (empty string when no fullName node exists), "interactor_aliases"
        and "interactor_xrefs".
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert interactor.tag == ns + "interactor"

    # the full name is optional; fall back to an empty string
    full_name_node = interactor.find(f"./{ns}names/{ns}fullName")
    full_name = "" if full_name_node is None else full_name_node.text

    aliases = []
    for alias_node in interactor.findall(f"./{ns}names/{ns}alias"):
        aliases.append(
            {"alias_type": alias_node.attrib["type"], "alias_value": alias_node.text}
        )

    return {
        "interactor_id": interactor.attrib["id"],
        "interactor_label": interactor.find(f"./{ns}names/{ns}shortLabel").text,
        "interactor_name": full_name,
        "interactor_aliases": aliases,
        "interactor_xrefs": _format_entry_interactor_xrefs(interactor),
    }
201
+
202
+
203
def _format_entry_interactor_xrefs(interactor) -> list[dict[str, str]]:
    """
    Format the cross-references of a single interactor.

    Args:
        interactor: an XML element whose tag is the namespaced "interactor"

    Returns:
        list of dicts with "tag", "db" and "id": the primaryRef first (when
        the interactor has one) followed by every secondaryRef. An interactor
        without any cross-reference yields an empty list.
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert interactor.tag == ns + "interactor"

    primary_ref = interactor.find(f"./{ns}xref/{ns}primaryRef")
    secondary_refs = interactor.findall(f"./{ns}xref/{ns}secondaryRef")

    # find() returns None when no primaryRef exists; skip it instead of
    # failing on `None.tag` below. Compare with `is None` explicitly because
    # an Element without children is falsy.
    xref_nodes = [] if primary_ref is None else [primary_ref]
    xref_nodes.extend(secondary_refs)

    return [
        {"tag": x.tag, "db": x.attrib["db"], "id": x.attrib["id"]} for x in xref_nodes
    ]
224
+
225
+
226
def _format_entry_interactions(an_entry) -> list[dict[str, Any]]:
    """
    Format the molecular interactions in an XML entry.

    Args:
        an_entry: an XML element whose tag is the namespaced "entry"

    Returns:
        list with one formatted dict per child of the entry's interactionList.
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert an_entry.tag == ns + "entry"

    interaction_nodes = an_entry.find(f"./{ns}interactionList")

    return list(map(_format_entry_interaction, interaction_nodes))
236
+
237
+
238
def _format_entry_interaction(interaction) -> dict[str, Any]:
    """
    Format a single interaction in an XML interaction list.

    Args:
        interaction: an XML element whose tag is the namespaced "interaction"

    Returns:
        dict with "interaction_name" (text of the shortLabel node) and
        "interactors" (one formatted dict per participant).
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert interaction.tag == ns + "interaction"

    short_label_node = interaction.find(f"./{ns}names/{ns}shortLabel")
    name = short_label_node.text

    participant_nodes = interaction.findall(f"./{ns}participantList/{ns}participant")

    # format every participant as a dict
    formatted_participants = []
    for participant_node in participant_nodes:
        formatted_participants.append(
            _format_entry_interaction_participants(participant_node)
        )

    return {"interaction_name": name, "interactors": formatted_participants}
258
+
259
+
260
def _format_entry_interaction_participants(interaction_participant) -> dict[str, str]:
    """
    Format the participants in an XML interaction.

    Args:
        interaction_participant: an XML element whose tag is the namespaced
            "participant"

    Returns:
        dict with "interactor_id", "biological_role" and "experimental_role".
        NOTE(review): "interactor_id" is read from the participant node's own
        "id" attribute - confirm this matches the ids used for interactors.
    """
    ns = PSI_MI_INTACT_XML_NAMESPACE
    assert interaction_participant.tag == ns + "participant"

    # both role lookups end in the same names/fullName tail
    full_name_tail = f"{ns}names/{ns}fullName"

    participant_id = interaction_participant.attrib["id"]
    biological_role = interaction_participant.find(
        f"./{ns}biologicalRole/{full_name_tail}"
    ).text
    experimental_role = interaction_participant.find(
        f"./{ns}experimentalRoleList/{ns}experimentalRole/{full_name_tail}"
    ).text

    return {
        "interactor_id": participant_id,
        "biological_role": biological_role,
        "experimental_role": experimental_role,
    }
@@ -0,0 +1,218 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import logging
5
+ import os
6
+ import random
7
+ from io import StringIO
8
+ from typing import Iterable
9
+
10
+ import pandas as pd
11
+ import requests
12
+ from napistu import indices
13
+ from napistu import sbml_dfs_core
14
+ from napistu import utils
15
+ from napistu.consensus import construct_consensus_model
16
+ from napistu.consensus import construct_sbml_dfs_dict
17
+ from napistu.ingestion.constants import REACTOME_PATHWAY_INDEX_COLUMNS
18
+ from napistu.ingestion.constants import REACTOME_PATHWAY_LIST_COLUMNS
19
+ from napistu.ingestion.constants import REACTOME_PATHWAYS_URL
20
+ from napistu.ingestion.constants import REACTOME_SBGN_URL
21
+ from napistu.ingestion.constants import REACTOME_SMBL_URL
22
+ from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
23
+ from fs import open_fs
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def reactome_sbgn_download(output_dir_path: str, overwrite: bool = False):
    """
    Reactome SBGN Download

    Download all human Reactome SBGN (systems biology graphical notation) files
    and write a pathway index ("pw_index.tsv") next to them.

    Args:
        output_dir_path (str): Path to a directory where .sbgn files should be saved.
        overwrite (bool): Overwrite an existing output directory. Default: False

    Returns:
        None
    """
    utils.download_and_extract(
        REACTOME_SBGN_URL,
        output_dir_path=output_dir_path,
        overwrite=overwrite,
    )

    # create the pathway index
    pw_index = _build_reactome_pw_index(
        output_dir_path,
        file_ext="sbgn",
        # For sbgn only homo sapiens files are available
        species_filter=(SPECIES_FULL_NAME_HUMAN,),
    )

    # save as tsv; the filesystem is used as a context manager so it is
    # closed once the index has been written
    with open_fs(output_dir_path) as out_fs:
        with out_fs.open("pw_index.tsv", "wb") as index_path:
            pw_index.to_csv(index_path, sep="\t", index=False)
54
+
55
+
56
def reactome_sbml_download(output_dir_path: str, overwrite: bool = False):
    """
    Reactome SBML Download

    Download Reactome SBML (systems biology markup language) for all reactome
    species and write a pathway index ("pw_index.tsv") next to the files.

    Args:
        output_dir_path (str): Path to a directory where .sbml files should be saved.
        overwrite (bool): Overwrite an existing output directory. Default: False

    Returns:
        None
    """
    utils.download_and_extract(
        REACTOME_SMBL_URL,
        output_dir_path=output_dir_path,
        overwrite=overwrite,
    )

    # create the pathway index
    pw_index = _build_reactome_pw_index(output_dir_path, file_ext="sbml")

    # save as tsv; the filesystem is used as a context manager so it is
    # closed once the index has been written
    with open_fs(output_dir_path) as out_fs:
        with out_fs.open("pw_index.tsv", "wb") as index_path:
            pw_index.to_csv(index_path, sep="\t", index=False)
78
+
79
+
80
+ # Functions useful to integrate reactome pathways into a consensus
81
def construct_reactome_consensus(
    pw_index_inp: str | indices.PWIndex,
    species: str | Iterable[str] | None = None,
    outdir: str | None = None,
    strict: bool = True,
) -> sbml_dfs_core.SBML_dfs:
    """Constructs a basic consensus model by merging all models from a pw_index

    Args:
        pw_index_inp (str | indices.PWIndex): PWIndex or uri pointing to PWIndex
        species (str | Iterable[str] | None): one or more species to filter by.
            Only forwarded when pw_index_inp is a str; an already constructed
            PWIndex is used as provided. Default: no filtering
        outdir (str | None, optional): output directory used to cache results
            ("model_pool.pkl" and "consensus.pkl"). Defaults to None.
        strict (bool): should failure of loading any given model throw an exception? If False a warning is thrown.

    Returns:
        sbml_dfs_core.SBML_dfs: A consensus SBML

    Raises:
        ValueError: if pw_index_inp is neither a str nor a PWIndex.
    """
    # reject unsupported inputs up front
    if not isinstance(pw_index_inp, (str, indices.PWIndex)):
        raise ValueError("pw_index_inp needs to be a PWIndex or a str to a location.")

    if isinstance(pw_index_inp, str):
        pw_index = indices.adapt_pw_index(pw_index_inp, species=species, outdir=outdir)
    else:
        pw_index = pw_index_inp

    # start from the plain builders and wrap them with a pickle cache only
    # when an output directory was provided
    build_model_pool = construct_sbml_dfs_dict
    build_consensus = construct_consensus_model
    if outdir is not None:
        model_pool_cache = os.path.join(outdir, "model_pool.pkl")
        build_model_pool = utils.pickle_cache(model_pool_cache)(build_model_pool)
        consensus_cache = os.path.join(outdir, "consensus.pkl")
        build_consensus = utils.pickle_cache(consensus_cache)(build_consensus)

    sbml_dfs_dict = build_model_pool(pw_index, strict)
    return build_consensus(sbml_dfs_dict, pw_index)
118
+
119
+
120
def _build_reactome_pw_index(
    output_dir: str,
    file_ext: str,
    species_filter: Iterable[str] | None = None,
) -> pd.DataFrame:
    """Build a reactome pathway index

    Builds the index based on available files and cross-checks it with the
    expected reactome pathway list.

    Args:
        output_dir (str): File directory
        file_ext (str): File extension (without the leading dot)
        species_filter (Optional[Iterable[str]], optional): Filter the expected
            pathway list based on a list of species. Eg in cases only one species available. Defaults to None.

    Returns:
        pd.DataFrame: pathway index restricted to REACTOME_PATHWAY_INDEX_COLUMNS
            plus a "date" column holding today's date as YYYYMMDD.

    Raises:
        ValueError: if no file with the requested extension exists in output_dir.
    """
    # list the matching files; the filesystem is closed once they are collected
    with open_fs(output_dir) as out_fs:
        all_files = [os.path.basename(f.path) for f in out_fs.glob(f"**/*.{file_ext}")]

    if not all_files:
        raise ValueError(f"Zero files in {output_dir} have the {file_ext} extension")

    pw_index = pd.DataFrame({"file": all_files}).assign(source="Reactome")
    pw_index["pathway_id"] = [os.path.splitext(x)[0] for x in pw_index["file"]]

    # test before merging
    pathway_list = _get_reactome_pathway_list()
    if species_filter is not None:
        pathway_list = pathway_list.loc[pathway_list["species"].isin(species_filter)]

    _check_reactome_pw_index(pw_index, pathway_list)

    # merge on the shared columns; files without a listed pathway are dropped
    # by the (default) inner join
    pw_index = pw_index.merge(pathway_list)

    # assign() returns a new frame, so the date is never written into a
    # column subset (which can raise pandas' SettingWithCopyWarning)
    return pw_index[REACTOME_PATHWAY_INDEX_COLUMNS].assign(
        date=datetime.date.today().strftime("%Y%m%d")
    )
160
+
161
+
162
+ def _check_reactome_pw_index(pw_index: indices.PWIndex, reactome_pathway_list: list):
163
+ """Compare local files defined in the pathway index to a list of Reactome's pathways."""
164
+
165
+ # check extension in pw_index
166
+ extn = set([os.path.splitext(x)[1] for x in pw_index["file"]])
167
+ assert len(extn) == 1
168
+ assert len(extn.intersection(set([".sbgn", ".sbml"]))) == 1
169
+ extn_string = extn.pop()
170
+
171
+ local_reactome_pws = set(pw_index["pathway_id"])
172
+ remote_reactome_pws = set(reactome_pathway_list["pathway_id"])
173
+
174
+ extra_local = local_reactome_pws.difference(remote_reactome_pws)
175
+ if len(extra_local) != 0:
176
+ n_samples = min(5, len(extra_local))
177
+ local_str = ", ".join(random.sample(list(extra_local), n_samples))
178
+
179
+ logger.warning(
180
+ f"{len(extra_local)} Reactome {extn_string} files were detected "
181
+ "which are not found in reactome.get_reactome_pathway_list(). "
182
+ f"The include {local_str}. "
183
+ "These files will be excluded from the pathway index"
184
+ )
185
+
186
+ extra_remote = remote_reactome_pws.difference(local_reactome_pws)
187
+
188
+ if len(extra_remote) != 0:
189
+ n_samples = min(5, len(extra_remote))
190
+ remote_str = ", ".join(random.sample(list(extra_remote), n_samples))
191
+
192
+ logger.warning(
193
+ f"{len(extra_remote)} Reactome {extn_string} files were missing "
194
+ "which should be present based on reactome.get_reactome_pathway_list(). "
195
+ f"These include {remote_str}."
196
+ )
197
+ return None
198
+
199
+
200
def _get_reactome_pathway_list():
    """Reactome Pathway List

    Produce a pd.DataFrame listing all pathways in reactome and their internal ids

    Parameters:
        None

    Returns:
        pd.DataFrame with the columns named by REACTOME_PATHWAY_LIST_COLUMNS

    Raises:
        ValueError: if the request does not return HTTP status 200.
        requests.exceptions.RequestException: if the request itself fails,
            e.g. on a connection error or when the timeout elapses.
    """
    # without a timeout requests waits indefinitely on an unresponsive server
    page = requests.get(REACTOME_PATHWAYS_URL, timeout=60)
    if page.status_code != 200:
        raise ValueError(
            f"Reactome data could not be accessed at {REACTOME_PATHWAYS_URL}"
        )

    # the payload is tab-separated text without a header row
    pathway_table = StringIO(page.content.decode())
    return pd.read_csv(pathway_table, sep="\t", names=REACTOME_PATHWAY_LIST_COLUMNS)