napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/utils.py
ADDED
@@ -0,0 +1,943 @@
from __future__ import annotations

import gzip
import io
import json
import logging
import os
import pickle
import re
import shutil
import urllib.request as request
import zipfile
from contextlib import closing
from itertools import starmap
from typing import Any
from typing import Union
from urllib.parse import urlparse

import igraph as ig
import pandas as pd
import requests
from napistu.constants import FILE_EXT_GZ
from napistu.constants import FILE_EXT_ZIP
from fs import open_fs
from fs.copy import copy_dir
from fs.copy import copy_file
from fs.copy import copy_fs
from fs.errors import CreateFailed
from fs.errors import ResourceNotFound
from fs.tarfs import TarFS
from fs.tempfs import TempFS
from fs.zipfs import ZipFS
from requests.adapters import HTTPAdapter
from requests.adapters import Retry

logger = logging.getLogger(__name__)

def initialize_dir(output_dir_path: str, overwrite: bool):
    """Initializes a filesystem directory

    Args:
        output_dir_path (str): path to the new directory
        overwrite (bool): if True, an existing directory will be
            deleted and recreated

    Raises:
        FileExistsError: if the directory exists and overwrite is False
    """
    output_dir_path = str(output_dir_path)
    try:
        with open_fs(output_dir_path) as out_fs:
            if overwrite:
                out_fs.removetree("/")
            else:
                raise FileExistsError(
                    f"{output_dir_path} already exists and overwrite is False"
                )
    except CreateFailed:
        # if the GCS bucket did not exist yet, create it
        with open_fs(output_dir_path, create=True):
            pass

def download_and_extract(
    url: str,
    output_dir_path: str = ".",
    download_method: str = "wget",
    overwrite: bool = False,
) -> None:
    """
    Download and Unpack

    Download an archive and then extract it into a new folder.

    Args:
        url (str): URL of the archive.
        output_dir_path (str): Path to the output directory.
        download_method (str): Download method; "wget" or "ftp".
        overwrite (bool): Overwrite an existing output directory.

    Returns:
        None
    """

    # initialize output directory
    output_dir_path = str(output_dir_path)
    initialize_dir(output_dir_path, overwrite)

    out_fs = open_fs(output_dir_path)
    extn = get_extn_from_url(url)

    # download archive file
    tmp_fs = TempFS()
    tmp_file = os.path.join(tmp_fs.root_path, f"cpr_tmp{extn}")

    if download_method == "wget":
        download_wget(url, tmp_file)
    elif download_method == "ftp":
        download_ftp(url, tmp_file)
    else:
        raise ValueError("undefined download_method, defined methods are wget and ftp")

    if re.search("\\.tar\\.gz$", extn) or re.search("\\.tgz$", extn):
        # untar .tar.gz into individual files
        with TarFS(tmp_file) as tar_fs:
            copy_fs(tar_fs, out_fs)
        logger.info(f"Archive downloaded and untarred to {output_dir_path}")
    elif re.search("\\.zip$", extn):
        with ZipFS(tmp_file) as zip_fs:
            copy_fs(zip_fs, out_fs)
        logger.info(f"Archive downloaded and unzipped to {output_dir_path}")
    elif re.search("\\.gz$", extn):
        outfile = url.split("/")[-1].replace(".gz", "")
        # gunzip file
        with gzip.open(tmp_file, "rb") as f_in:
            with out_fs.open(outfile, "wb") as f_out:
                f_out.write(f_in.read())
    else:
        raise ValueError(f"{extn} is not supported")

    # close filesystems
    tmp_fs.close()
    out_fs.close()

    return None

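# Usage sketch for download_and_extract (hypothetical URL and paths):
#
#     >>> download_and_extract(
#     ...     "https://example.org/data/archive.tar.gz",
#     ...     output_dir_path="/tmp/archive",
#     ...     download_method="wget",
#     ...     overwrite=True,
#     ... )
#
# The archive is staged in a TempFS and unpacked into /tmp/archive;
# .tar.gz/.tgz, .zip, and .gz extensions are supported.
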
def extract(file: str):
    """
    Extract

    Untar, unzip, or gunzip an existing local archive.

    Args:
        file (str): Path to the compressed file

    Returns:
        None
    """

    extn = get_extn_from_url(file)
    if re.search("\\.tar\\.gz$", extn) or re.search("\\.tgz$", extn):
        output_dir_path = os.path.join(
            os.path.dirname(file), os.path.basename(file).replace(extn, "")
        )
    else:
        output_dir_path = os.path.dirname(file)

    try:
        initialize_dir(output_dir_path, overwrite=False)
    except FileExistsError:
        pass

    out_fs = open_fs(output_dir_path)

    if re.search("\\.tar\\.gz$", extn) or re.search("\\.tgz$", extn):
        # untar .tar.gz into individual files
        with TarFS(file) as tar_fs:
            copy_fs(tar_fs, out_fs)
        logger.info(f"Archive untarred to {output_dir_path}")
    elif re.search("\\.zip$", extn):
        with ZipFS(file) as zip_fs:
            copy_fs(zip_fs, out_fs)
        logger.info(f"Archive unzipped to {output_dir_path}")
    elif re.search("\\.gz$", extn):
        outfile = file.split("/")[-1].replace(".gz", "")
        # gunzip file
        with gzip.open(file, "rb") as f_in:
            with out_fs.open(outfile, "wb") as f_out:
                f_out.write(f_in.read())
    else:
        raise ValueError(f"{extn} is not supported")

    # close filesystem
    out_fs.close()

    return None

def gunzip(gzipped_path: str, outpath: str | None = None) -> None:
    """Gunzip a file to an output path."""

    if not os.path.exists(gzipped_path):
        raise FileNotFoundError(f"{gzipped_path} not found")

    if not re.search("\\.gz$", gzipped_path):
        logger.warning(f"{gzipped_path} does not have the .gz extension")

    if outpath is None:
        # determine outfile name automatically if not provided
        outpath = os.path.join(
            os.path.dirname(gzipped_path),
            gzipped_path.split("/")[-1].replace(".gz", ""),
        )
    outfile = os.path.basename(outpath)

    out_fs = open_fs(os.path.dirname(outpath))
    # gunzip file
    with gzip.open(gzipped_path, "rb") as f_in:
        with out_fs.open(outfile, "wb") as f_out:
            f_out.write(f_in.read())
    out_fs.close()

    return None

def get_extn_from_url(url: str) -> str:
    """Retrieves the file extension from a URL

    Args:
        url (str): url

    Raises:
        ValueError: Raised when no extension is identified

    Returns:
        str: the identified extension

    Examples:
        >>> get_extn_from_url('https://test/test.gz')
        '.gz'
        >>> get_extn_from_url('https://test/test.tar.gz')
        '.tar.gz'
        >>> get_extn_from_url('https://test/test.tar.gz/bla')
        Traceback (most recent call last):
        ...
        ValueError: File extension not identifiable: https://test/test.tar.gz/bla
    """
    match = re.search("\\..+$", os.path.split(url)[1])
    if match is None:
        raise ValueError(f"File extension not identifiable: {url}")
    else:
        extn = match.group(0)
    return extn

def write_file_contents_to_path(path: str, contents) -> None:
    """Helper function to write file contents to a path or writable buffer.

    Args:
        path (str | WriteBuffer): destination path or writable buffer
        contents (Any): file contents

    Returns:
        None
    """
    if hasattr(path, "write") and hasattr(path, "__iter__"):
        path.write(contents)  # type: ignore
    else:
        base, filename = get_target_base_and_path(path)
        with open_fs(base, create=True) as fs:
            with fs.open(filename, "wb") as f:
                f.write(contents)  # type: ignore

    return None

def download_wget(
    url: str, path, target_filename: str | None = None, verify: bool = True
) -> None:
    """Downloads a file or archive, wget-style (implemented with requests)

    Args:
        url (str): url
        path (FilePath | WriteBuffer): file path or buffer
        target_filename (str): specific file to extract from the ZIP if the URL is a ZIP file
        verify (bool): verify argument to pass to requests.get

    Returns:
        None
    """
    r = requests.get(url, allow_redirects=True, verify=verify)
    # throw an exception if one was generated
    r.raise_for_status()

    # check if the content is a ZIP file
    if (
        r.headers.get("Content-Type") == "application/zip"
        or url.endswith(f".{FILE_EXT_ZIP}")
    ) and target_filename:
        # load the ZIP file in memory
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            # check if the target file exists in the ZIP archive
            if target_filename in z.namelist():
                with z.open(target_filename) as target_file:
                    # apply the same logic as below to the target file
                    return write_file_contents_to_path(path, target_file.read())
            else:
                raise FileNotFoundError(
                    f"{target_filename} not found in the ZIP archive"
                )
    # check if the content is a GZIP (single-file compression)
    elif url.endswith(f".{FILE_EXT_GZ}"):
        with gzip.GzipFile(fileobj=io.BytesIO(r.content)) as gz:
            return write_file_contents_to_path(path, gz.read())
    else:
        # not an archive -> default case -> write file directly
        return write_file_contents_to_path(path, r.content)

def download_ftp(url, path):
    """Download a file from an FTP url to a local path."""
    with closing(request.urlopen(url)) as r:
        with open(path, "wb") as f:
            shutil.copyfileobj(r, f)

    return None

def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 503, 504),
    session: requests.Session | None = None,
    **kwargs,
) -> requests.Session:
    """Requests session with retry logic

    This should help to combat flaky APIs, e.g. Brenda.
    From: https://stackoverflow.com/a/58687549

    Args:
        retries (int, optional): Number of retries. Defaults to 5.
        backoff_factor (float, optional): backoff. Defaults to 0.3.
        status_forcelist (tuple, optional): status codes to retry. Defaults to (500, 502, 503, 504).
        session (Optional[requests.Session], optional): existing session. Defaults to None.

    Returns:
        requests.Session: new requests session
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        **kwargs,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

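# Usage sketch for requests_retry_session (hypothetical URL):
#
#     >>> session = requests_retry_session(retries=3, backoff_factor=0.5)
#     >>> response = session.get("https://example.org/api")  # retried on 5xx
#     >>> response.raise_for_status()
#
# Retries apply to connect/read failures and the status codes in
# status_forcelist, with exponential backoff between attempts.
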
def pickle_cache(path: str, overwrite: bool = False):
    """A decorator to cache a function call's result to a pickle

    Attention: this does not take the function arguments into account.
    All function calls will be served by the same pickle file.

    Args:
        path (str): path to the cache pickle file
        overwrite (bool): should an existing cache be overwritten even
            if it exists?

    Returns:
        A function whose output will be cached to pickle.
    """

    if overwrite:
        if path_exists(path):
            if not os.path.isfile(path):
                logger.warning(
                    f"{path} is a GCS URI and cannot be deleted using overwrite = True"
                )
            else:
                logger.info(
                    f"Deleting {path} because file exists and overwrite is True"
                )
                os.remove(path)

    def decorator(fkt):
        def wrapper(*args, **kwargs):
            if path_exists(path):
                logger.info(
                    "Not running function %s but using cache file '%s' instead.",
                    fkt.__name__,
                    path,
                )
                dat = load_pickle(path)
            else:
                dat = fkt(*args, **kwargs)
                save_pickle(path, dat)
            return dat

        return wrapper

    return decorator

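# Usage sketch for pickle_cache (hypothetical path and function):
#
#     >>> @pickle_cache("/tmp/expensive_result.pkl")
#     ... def expensive_computation():
#     ...     return {"answer": 42}
#     >>> dat = expensive_computation()  # computed and written to the pickle
#     >>> dat = expensive_computation()  # served from the pickle
#
# Because the cache key is the path alone, calls with different arguments
# still share one pickle file.
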
def path_exists(path: str) -> bool:
    """Checks whether a path or uri exists

    Args:
        path (str): path/uri

    Returns:
        bool: exists?
    """
    dir, file = os.path.split(path)
    try:
        with open_fs(dir) as f:
            return f.exists(file)
    except CreateFailed:
        # If the path is on GCSFS, it could be that the
        # parent does not exist, but the path does
        pass

    # If the path is a directory, it is enough
    # that it itself exists
    try:
        with open_fs(path):
            return True
    except CreateFailed:
        return False

def save_pickle(path: str, dat: object):
    """Saves an object to a path as a pickle

    Args:
        path (str): target path
        dat (object): object
    """
    dir, file = get_target_base_and_path(path)
    with open_fs(dir, create=True) as fs:
        with fs.open(file, "wb") as f:
            pickle.dump(dat, f)

def load_pickle(path: str):
    """Loads a pickled object from a path

    Args:
        path (str): path to the pickle

    Returns:
        Any: Object
    """
    dir, file = get_source_base_and_path(path)
    with open_fs(dir) as source_fs:
        try:
            with source_fs.open(file, "rb") as f:
                return pickle.load(f)
        except ResourceNotFound as e:
            if hasattr(source_fs, "fix_storage"):
                logger.info(
                    "File could not be opened. Trying to fix storage for FS-GCFS. "
                    "This is required because of: https://fs-gcsfs.readthedocs.io/en/latest/#limitations "
                    "and will add empty blobs to indicate directories."
                )
                source_fs.fix_storage()
                with source_fs.open(file, "rb") as f:
                    return pickle.load(f)
            else:
                raise e

read_pickle = load_pickle
write_pickle = save_pickle


def get_source_base_and_path(uri: str) -> tuple[str, str]:
    """Get the base of a bucket or folder and the path to the file

    Args:
        uri (str): uri

    Returns:
        tuple[str, str]: base: the base folder of the bucket
            path: the path to the file within it

    Example:
        >>> get_source_base_and_path("gs://bucket/folder/file")
        ('gs://bucket', 'folder/file')
        >>> get_source_base_and_path("/bucket/folder/file")
        ('/bucket/folder', 'file')
    """
    uri = str(uri)
    urlelements = urlparse(uri)
    if len(urlelements.scheme) > 0:
        base = urlelements.scheme + "://" + urlelements.netloc
        path = urlelements.path[1:]
    else:
        base, path = os.path.split(uri)
    return base, path

def get_target_base_and_path(uri):
    """Get the base of a bucket + directory and the file

    Args:
        uri (str): uri

    Returns:
        tuple[str, str]: base: the base folder + path of the bucket
            file: the file

    Example:
        >>> get_target_base_and_path("gs://bucket/folder/file")
        ('gs://bucket/folder', 'file')
        >>> get_target_base_and_path("bucket/folder/file")
        ('bucket/folder', 'file')
        >>> get_target_base_and_path("/bucket/folder/file")
        ('/bucket/folder', 'file')
    """
    base, path = os.path.split(uri)
    return base, path

def copy_uri(input_uri: str, output_uri: str, is_file: bool = True):
    """Copy a file or folder from one uri to another

    Args:
        input_uri (str): input file uri (gcs, http, ...)
        output_uri (str): path to output file (gcs, local)
        is_file (bool, optional): Is this a file or a folder? Defaults to True.
    """
    logger.info("Copy uri from %s to %s", input_uri, output_uri)
    source_base, source_path = get_source_base_and_path(input_uri)
    target_base, target_path = get_target_base_and_path(output_uri)
    if is_file:
        copy_fun = copy_file
    else:
        copy_fun = copy_dir
    with open_fs(source_base) as source_fs:
        with open_fs(target_base, create=True) as target_fs:
            try:
                copy_fun(source_fs, source_path, target_fs, target_path)
            except ResourceNotFound as e:
                if hasattr(source_fs, "fix_storage"):
                    logger.info(
                        "File could not be opened. Trying to fix storage for FS-GCFS. "
                        "This is required because of: https://fs-gcsfs.readthedocs.io/en/latest/#limitations "
                        "and will add empty blobs to indicate directories."
                    )
                    source_fs.fix_storage()
                    copy_fun(source_fs, source_path, target_fs, target_path)
                else:
                    raise e

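# Usage sketch for copy_uri (hypothetical local path and GCS bucket):
#
#     >>> copy_uri("/tmp/results.tsv", "gs://my-bucket/results/results.tsv")
#     >>> copy_uri("/tmp/results_dir", "gs://my-bucket/results", is_file=False)
#
# Copies to or from gs:// bases require the fs-gcsfs backend to be installed.
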
def save_json(uri: str, object: Any) -> None:
    """Write an object to a json file at a uri

    Args:
        uri (str): path to the json file
        object (Any): object to write
    """
    target_base, target_path = get_target_base_and_path(uri)
    with open_fs(target_base, create=True) as target_fs:
        target_fs.writetext(target_path, json.dumps(object))

def load_json(uri: str) -> Any:
    """Read json from a uri

    Args:
        uri (str): path to the json file
    """
    logger.info("Read json from %s", uri)
    source_base, source_path = get_source_base_and_path(uri)
    with open_fs(source_base) as source_fs:
        try:
            txt = source_fs.readtext(source_path)
        except ResourceNotFound as e:
            if hasattr(source_fs, "fix_storage"):
                logger.info(
                    "File could not be opened. Trying to fix storage for FS-GCFS. "
                    "This is required because of: https://fs-gcsfs.readthedocs.io/en/latest/#limitations "
                    "and will add empty blobs to indicate directories."
                )
                source_fs.fix_storage()
                txt = source_fs.readtext(source_path)
            else:
                raise e
    return json.loads(txt)

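# Usage sketch: save_json/load_json round-trip (hypothetical path and data):
#
#     >>> save_json("/tmp/pathways.json", {"reactome": ["R-HSA-1640170"]})
#     >>> load_json("/tmp/pathways.json")
#     {'reactome': ['R-HSA-1640170']}
#
# Both helpers accept any PyFilesystem-supported base (local or gs://).
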
def extract_regex_search(regex: str, query: str, index_value: int = 0) -> str:
    """
    Match an identifier substring and otherwise throw an error

    Args:
        regex (str): regular expression to search
        query (str): string to search against
        index_value (int): index of the match group to return

    Returns:
        match (str): a character string match
    """

    if m := re.search(regex, query):
        match = m[index_value]
    else:
        raise ValueError(
            f"{query} does not match the identifier regular expression: {regex}"
        )

    return match

def extract_regex_match(regex: str, query: str) -> str:
    """
    Extract the first capture group of a regex match and otherwise throw an error

    Args:
        regex (str): regular expression to match
        query (str): string to match against

    Returns:
        match (str): a character string match
    """

    if m := re.match(regex, query):
        if len(m.groups()) > 0:
            match = m.groups()[0]
        else:
            raise ValueError(
                f"{query} does not match a subgroup in the regular expression: {regex}"
            )
    else:
        raise ValueError(f"{query} does not match the regular expression: {regex}")

    return match

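# Behavior sketch contrasting the two regex helpers (hypothetical identifiers):
#
#     >>> extract_regex_search("[0-9]+$", "chr12")  # substring match anywhere
#     '12'
#     >>> extract_regex_match(".*[0-9]+-(ENSG[0-9]+)", "9606-ENSG00000000003")
#     'ENSG00000000003'
#
# extract_regex_search returns the requested match index from re.search,
# while extract_regex_match requires a capture group in re.match.
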
class match_pd_vars:
    """
    Match Pandas Variables.

    Attributes
    ----------
    req_vars:
        A set of variables which should exist in df
    missing_vars:
        Required variables which are not present in df
    extra_vars:
        Non-required variables which are present in df
    are_present:
        True if req_vars are present and False otherwise

    Methods
    -------
    assert_present()
        Raise an exception if req_vars are absent

    """

    def __init__(
        self, df: pd.DataFrame | pd.Series, req_vars: set, allow_series: bool = True
    ) -> None:
        """
        Compare the variables in df to a set of required variables

        Parameters
        ----------
        df
            A pd.DataFrame or pd.Series
        req_vars
            A set of variables which should exist in df
        allow_series:
            Can a pd.Series be provided as df?

        Returns
        -------
        None.
        """

        if isinstance(df, pd.Series):
            if not allow_series:
                raise TypeError("df was a pd.Series and must be a pd.DataFrame")
            vars_present = set(df.index.tolist())
        elif isinstance(df, pd.DataFrame):
            vars_present = set(df.columns.tolist())
        else:
            raise TypeError(
                f"df was a {type(df).__name__} and must be a pd.DataFrame or pd.Series"
            )

        self.req_vars = req_vars
        self.missing_vars = req_vars.difference(vars_present)
        self.extra_vars = vars_present.difference(req_vars)
        self.are_present = len(self.missing_vars) == 0

    def assert_present(self) -> None:
        """
        Raise an error if required variables are missing
        """

        if not self.are_present:
            raise ValueError(
                f"{len(self.missing_vars)} required variables were "
                "missing from the provided pd.DataFrame or pd.Series: "
                f"{', '.join(self.missing_vars)}"
            )

        return None

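# Usage sketch for match_pd_vars (hypothetical columns):
#
#     >>> df = pd.DataFrame({"s_id": ["s1"], "name": ["ATP"]})
#     >>> match_pd_vars(df, req_vars={"s_id", "name"}).are_present
#     True
#     >>> match_pd_vars(df, req_vars={"s_id", "smiles"}).assert_present()
#     Traceback (most recent call last):
#     ...
#     ValueError: 1 required variables were missing from the provided pd.DataFrame or pd.Series: smiles
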
def ensure_pd_df(pd_df_or_series: pd.DataFrame | pd.Series) -> pd.DataFrame:
    """
    Ensure Pandas DataFrame

    Convert a pd.Series to a DataFrame if needed.

    Args:
        pd_df_or_series (pd.Series | pd.DataFrame):
            a pandas df or series

    Returns:
        pd_df converted to a pd.DataFrame if needed

    """

    if isinstance(pd_df_or_series, pd.DataFrame):
        return pd_df_or_series
    elif isinstance(pd_df_or_series, pd.Series):
        return pd_df_or_series.to_frame().T
    else:
        raise TypeError(
            "ensure_pd_df expects either a pandas DataFrame or Series but received"
            f" a {type(pd_df_or_series)}"
        )

def format_identifiers_as_edgelist(
    df: pd.DataFrame, defining_vars: list[str]
) -> pd.DataFrame:
    """
    Format Identifiers as Edgelist

    Collapse a multiindex to an index (if needed), and similarly collapse multiple variables to a single entry.
    The resulting index-to-id pairs can be treated as an edgelist for greedy clustering.

    Args:
        df (pd.DataFrame):
            Any pd.DataFrame
        defining_vars (list(str)):
            A set of attributes which define a distinct entry in df

    Returns:
        df (pd.DataFrame):
            A pd.DataFrame with "ind" and "id" variables added, indicating rolled-up
            values of the index and defining_vars
    """

    assert isinstance(df, pd.DataFrame)
    # requires a named index by convention
    if None in df.index.names:
        raise ValueError(
            "df did not have a named index. A named index or multiindex is expected"
        )

    assert isinstance(defining_vars, list)

    logger.info(
        f"creating an edgelist linking index levels {', '.join(df.index.names)} and linking it "
        f"to levels defined by {', '.join(defining_vars)}"
    )

    # df is a pd.DataFrame and contains defining_vars
    match_pd_vars(df, req_vars=set(defining_vars), allow_series=False).assert_present()

    # combine all components of a multiindex into a single index value
    if df.index.nlevels == 1:
        df.loc[:, "ind"] = ["ind_" + x for x in df.index]
    else:
        # handle a multiindex
        fstr = "ind_" + "_".join(["{}"] * df.index.nlevels)
        df.loc[:, "ind"] = list(starmap(fstr.format, df.index))

    # aggregate defining variables
    df.loc[:, "id"] = df[defining_vars].apply(
        lambda x: "id_" + "_".join(x.dropna().astype(str)), axis=1
    )

    return df

def find_weakly_connected_subgraphs(edgelist):
    """Find the weakly connected components of an ind-id edgelist."""

    assert isinstance(edgelist, pd.DataFrame)
    assert edgelist.shape[1] == 2
    assert edgelist.columns.tolist() == ["ind", "id"]
    # at least some entries in ind should start with ind because this is how we'll pull them out
    assert any(edgelist["ind"].str.startswith("ind"))

    id_graph = ig.Graph.TupleList(edgelist.itertuples(index=False))

    id_graph_names = [v.attributes()["name"] for v in id_graph.vs]
    id_graphs_clusters = id_graph.connected_components().membership
    id_graph_df = pd.DataFrame({"name": id_graph_names, "cluster": id_graphs_clusters})
    # clusters based on index or identifiers will be the same when joined to the id table
    ind_clusters = id_graph_df[id_graph_df.name.str.startswith("ind")].rename(
        columns={"name": "ind"}
    )

    return ind_clusters

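# Worked sketch (hypothetical data; output shown schematically): features
# sharing any defining id are merged into one weakly connected component.
#
#     >>> df = pd.DataFrame(
#     ...     {"feature": ["a", "b", "c"], "id1": ["x", "x", "y"]}
#     ... ).set_index("feature")
#     >>> edgelist = format_identifiers_as_edgelist(df, ["id1"])
#     >>> find_weakly_connected_subgraphs(edgelist[["ind", "id"]])
#          ind  cluster
#     0  ind_a        0
#     2  ind_b        0
#     3  ind_c        1
#
# Rows a and b share id1 == "x", so they land in the same cluster,
# while c forms its own.
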
def style_df(
    df: pd.DataFrame,
    headers: Union[str, list[str], None] = "keys",
    hide_index: bool = False,
) -> pd.io.formats.style.Styler:
    """
    Style DataFrame

    Provide some simple options for styling a pd.DataFrame

    Args:
        df: pd.DataFrame
            A table to style
        headers:
            - "keys" to use the current column names
            - None to suppress column names
            - list[str] to overwrite and show column names
        hide_index: bool
            Should the index be hidden?

    Returns:
        styled_df: pd.io.formats.style.Styler
            `df` with styles updated
    """

    if isinstance(headers, list):
        if len(headers) != df.shape[1]:
            raise ValueError(
                f"headers was a list with {len(headers)} entries, but df has {df.shape[1]} "
                "columns. These dimensions should match"
            )

        df.columns = headers  # type: ignore

    styled_df = df.style.format(precision=3).set_table_styles(
        [{"selector": "th", "props": "color: limegreen;"}]
    )

    if hide_index:
        styled_df = styled_df.hide(axis="index")

    if headers is None:
        return styled_df.hide(axis="columns")
    elif isinstance(headers, str):
        if headers == "keys":
            # just plot with the index as headers
            return styled_df
        else:
            raise ValueError(
                f"headers was a string: {headers} but this option is not recognized. "
                'The only defined value is "keys".'
            )
    else:
        assert isinstance(headers, list)
        return styled_df

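# Usage sketch for style_df (hypothetical table):
#
#     >>> df = pd.DataFrame({"x": [1.23456], "y": [2.0]})
#     >>> style_df(df, headers=["pretty x", "pretty y"], hide_index=True)
#
# Returns a Styler with 3-digit precision, green headers, and the index hidden.
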
def safe_series_tolist(x):
    """Convert either a str or pd.Series to a list."""

    if isinstance(x, str):
        return [x]
    elif isinstance(x, pd.Series):
        return x.tolist()
    else:
        raise TypeError(f"x was a {type(x)} but only str and pd.Series are supported")

def check_unique_index(df, label=""):
    """Validate that each index value only maps to a single row."""

    if len(df.index) != len(df.index.unique()):
        raise ValueError(f"{label} index entries are not unique")

    return None

def score_nameness(string: str) -> int:
    """
    Score Nameness

    This utility assigns a numeric score to a string reflecting how likely it is to be
    a human-readable name. This helps to prioritize readable entries when we are
    trying to pick out a single name to display from a set of values which may also
    include entries like systematic ids.

    Args:
        string (str):
            An alphanumeric string

    Returns:
        score (int):
            An integer score indicating how name-like the string is (low is more name-like)
    """

    return (
        # string length
        len(string)
        # no-space penalty
        + (sum(c.isspace() for c in string) == 0) * 10
        # penalty for each number
        + sum(c.isdigit() for c in string) * 5
    )

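# Behavior sketch: lower scores are more name-like.
#
#     >>> score_nameness("calcium binding protein")  # 23 chars, has spaces
#     23
#     >>> score_nameness("ENSG00000000003")  # 15 chars + 10 (no spaces) + 55 (11 digits)
#     80
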
def click_str_to_list(string: str) -> list[str]:
    """Convert a string-based representation of a list passed from the CLI into a list of strings."""

    var_extract_regex = re.compile("\\'?([a-zA-Z_]+)\\'?")

    re_search = re.search("^\\[(.*)\\]$", string)
    if re_search:
        return var_extract_regex.findall(re_search.group(0))
    else:
        raise ValueError(
            f"The provided string, {string}, could not be reformatted as a list. An example string which can be formatted is: \"['weights', 'upstream_weights']\""
        )

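# Behavior sketch for click_str_to_list:
#
#     >>> click_str_to_list("['weights', 'upstream_weights']")
#     ['weights', 'upstream_weights']
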
def _add_nameness_score_wrapper(df, name_var, table_schema):
    """Call _add_nameness_score, falling back to a constant score when name_var is undefined."""

    if name_var in table_schema.keys():
        return _add_nameness_score(df, table_schema[name_var])
    else:
        logger.debug(
            f"{name_var} is not defined in table_schema; adding a constant (1)"
        )
        return df.assign(nameness_score=1)

def _add_nameness_score(df, name_var):
    """Add a nameness_score variable which reflects how name-like each entry is."""

    df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
    return df