napistu-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/utils.py ADDED
@@ -0,0 +1,943 @@
from __future__ import annotations

import gzip
import io
import json
import logging
import os
import pickle
import re
import shutil
import urllib.request as request
import zipfile
from contextlib import closing
from itertools import starmap
from typing import Any
from typing import Union
from urllib.parse import urlparse

import igraph as ig
import pandas as pd
import requests
from napistu.constants import FILE_EXT_GZ
from napistu.constants import FILE_EXT_ZIP
from fs import open_fs
from fs.copy import copy_dir
from fs.copy import copy_file
from fs.copy import copy_fs
from fs.errors import CreateFailed
from fs.errors import ResourceNotFound
from fs.tarfs import TarFS
from fs.tempfs import TempFS
from fs.zipfs import ZipFS
from requests.adapters import HTTPAdapter
from requests.adapters import Retry

logger = logging.getLogger(__name__)


def initialize_dir(output_dir_path: str, overwrite: bool):
    """Initializes a filesystem directory

    Args:
        output_dir_path (str): path to new directory
        overwrite (bool): if True, an existing directory will be
            deleted and recreated

    Raises:
        FileExistsError
    """
    output_dir_path = str(output_dir_path)
    try:
        with open_fs(output_dir_path) as out_fs:
            if overwrite:
                out_fs.removetree("/")
            else:
                raise FileExistsError(
                    f"{output_dir_path} already exists and overwrite is False"
                )
    except CreateFailed:
        # if the GCS bucket did not exist yet, create it
        with open_fs(output_dir_path, create=True):
            pass


def download_and_extract(
    url: str,
    output_dir_path: str = ".",
    download_method: str = "wget",
    overwrite: bool = False,
) -> None:
    """
    Download and Unpack

    Download an archive and then extract it to a new folder

    Args:
        url (str): URL of archive.
        output_dir_path (str): Path to output directory.
        download_method (str): Download method; either "wget" or "ftp".
        overwrite (bool): Overwrite an existing output directory.

    Returns:
        None
    """

    # initialize output directory
    output_dir_path = str(output_dir_path)
    initialize_dir(output_dir_path, overwrite)

    out_fs = open_fs(output_dir_path)
    extn = get_extn_from_url(url)

    # download archive file
    tmp_fs = TempFS()
    tmp_file = os.path.join(tmp_fs.root_path, f"cpr_tmp{extn}")

    if download_method == "wget":
        download_wget(url, tmp_file)
    elif download_method == "ftp":
        download_ftp(url, tmp_file)
    else:
        raise ValueError("undefined download_method, defined methods are wget and ftp")

    if re.search("\\.tar\\.gz$", extn) or re.search("\\.tgz$", extn):
        # untar .tar.gz into individual files
        with TarFS(tmp_file) as tar_fs:
            copy_fs(tar_fs, out_fs)
        logger.info(f"Archive downloaded and untarred to {output_dir_path}")
    elif re.search("\\.zip$", extn):
        with ZipFS(tmp_file) as zip_fs:
            copy_fs(zip_fs, out_fs)
        logger.info(f"Archive downloaded and unzipped to {output_dir_path}")
    elif re.search("\\.gz$", extn):
        outfile = url.split("/")[-1].replace(".gz", "")
        # gunzip file
        with gzip.open(tmp_file, "rb") as f_in:
            with out_fs.open(outfile, "wb") as f_out:
                f_out.write(f_in.read())
    else:
        raise ValueError(f"{extn} is not supported")

    # close filesystems
    tmp_fs.close()
    out_fs.close()

    return None

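# Usage sketch (the URL and output directory below are hypothetical, for
# illustration only):
#
# >>> download_and_extract(
# ...     "https://example.org/data/archive.tar.gz",
# ...     output_dir_path="/tmp/archive",
# ...     overwrite=True,
# ... )
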
def extract(file: str):
    """
    Extract

    Untar, unzip, or gunzip a local archive

    Args:
        file (str): Path to compressed file

    Returns:
        None
    """

    extn = get_extn_from_url(file)
    if re.search("\\.tar\\.gz$", extn) or re.search("\\.tgz$", extn):
        output_dir_path = os.path.join(
            os.path.dirname(file), os.path.basename(file).replace(extn, "")
        )
    else:
        output_dir_path = os.path.dirname(file)

    try:
        initialize_dir(output_dir_path, overwrite=False)
    except FileExistsError:
        pass

    out_fs = open_fs(output_dir_path)

    if re.search("\\.tar\\.gz$", extn) or re.search("\\.tgz$", extn):
        # untar .tar.gz into individual files
        with TarFS(file) as tar_fs:
            copy_fs(tar_fs, out_fs)
        logger.info(f"Archive untarred to {output_dir_path}")
    elif re.search("\\.zip$", extn):
        with ZipFS(file) as zip_fs:
            copy_fs(zip_fs, out_fs)
        logger.info(f"Archive unzipped to {output_dir_path}")
    elif re.search("\\.gz$", extn):
        outfile = file.split("/")[-1].replace(".gz", "")
        # gunzip file
        with gzip.open(file, "rb") as f_in:
            with out_fs.open(outfile, "wb") as f_out:
                f_out.write(f_in.read())
    else:
        raise ValueError(f"{extn} is not supported")

    # close filesystem
    out_fs.close()

    return None


def gunzip(gzipped_path: str, outpath: str | None = None) -> None:
    """Gunzip a file to an output path."""

    if not os.path.exists(gzipped_path):
        raise FileNotFoundError(f"{gzipped_path} not found")

    if not re.search("\\.gz$", gzipped_path):
        logger.warning(f"{gzipped_path} does not have the .gz extension")

    if outpath is None:
        # determine outfile name automatically if not provided
        outpath = os.path.join(
            os.path.dirname(gzipped_path),
            gzipped_path.split("/")[-1].replace(".gz", ""),
        )
    outfile = os.path.basename(outpath)

    out_fs = open_fs(os.path.dirname(outpath))
    # gunzip file
    with gzip.open(gzipped_path, "rb") as f_in:
        with out_fs.open(outfile, "wb") as f_out:
            f_out.write(f_in.read())
    out_fs.close()

    return None


def get_extn_from_url(url: str) -> str:
    """Retrieves the file extension from a URL

    Args:
        url (str): url

    Raises:
        ValueError: Raised when no extension is identified

    Returns:
        str: the identified extension

    Examples:
        >>> get_extn_from_url('https://test/test.gz')
        '.gz'
        >>> get_extn_from_url('https://test/test.tar.gz')
        '.tar.gz'
        >>> get_extn_from_url('https://test/test.tar.gz/bla')
        Traceback (most recent call last):
        ...
        ValueError: File extension not identifiable: https://test/test.tar.gz/bla
    """
    match = re.search("\\..+$", os.path.split(url)[1])
    if match is None:
        raise ValueError(f"File extension not identifiable: {url}")
    else:
        extn = match.group(0)
    return extn


def write_file_contents_to_path(path: str, contents) -> None:
    """Helper function to write file contents to the path.

    Args:
        path (str): destination
        contents (Any): file contents

    Returns:
        None
    """
    if hasattr(path, "write") and hasattr(path, "__iter__"):
        path.write(contents)  # type: ignore
    else:
        base, filename = get_target_base_and_path(path)
        with open_fs(base, create=True) as fs:
            with fs.open(filename, "wb") as f:
                f.write(contents)  # type: ignore

    return None


def download_wget(
    url: str, path, target_filename: str | None = None, verify: bool = True
) -> None:
    """Downloads a file or archive (wget-like, implemented with requests)

    Args:
        url (str): url
        path (FilePath | WriteBuffer): file path or buffer
        target_filename (str): specific file to extract from the ZIP if the URL is a ZIP file
        verify (bool): verify argument to pass to requests.get

    Returns:
        None
    """
    r = requests.get(url, allow_redirects=True, verify=verify)
    # throw an exception if one was generated
    r.raise_for_status()

    # check if the content is a ZIP file
    if (
        r.headers.get("Content-Type") == "application/zip"
        or url.endswith(f".{FILE_EXT_ZIP}")
    ) and target_filename:
        # load the ZIP file in memory
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            # check if the target file exists in the ZIP archive
            if target_filename in z.namelist():
                with z.open(target_filename) as target_file:
                    # apply the same logic as below to the target file
                    return write_file_contents_to_path(path, target_file.read())
            else:
                raise FileNotFoundError(
                    f"{target_filename} not found in the ZIP archive"
                )
    # check if the content is a GZIP (single-file compression)
    elif url.endswith(f".{FILE_EXT_GZ}"):
        with gzip.GzipFile(fileobj=io.BytesIO(r.content)) as gz:
            return write_file_contents_to_path(path, gz.read())
    else:
        # not an archive -> default case -> write file directly
        return write_file_contents_to_path(path, r.content)


def download_ftp(url, path):
    with closing(request.urlopen(url)) as r:
        with open(path, "wb") as f:
            shutil.copyfileobj(r, f)

    return None


def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 503, 504),
    session: requests.Session | None = None,
    **kwargs,
) -> requests.Session:
    """Requests session with retry logic

    This should help to combat flaky APIs, e.g., Brenda.
    From: https://stackoverflow.com/a/58687549

    Args:
        retries (int, optional): Number of retries. Defaults to 5.
        backoff_factor (float, optional): backoff. Defaults to 0.3.
        status_forcelist (tuple, optional): errors to retry. Defaults to (500, 502, 503, 504).
        session (Optional[requests.Session], optional): existing session. Defaults to None.

    Returns:
        requests.Session: new requests session
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        **kwargs,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

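# Usage sketch (the endpoint is illustrative): the returned session retries
# requests that fail with the listed 5xx statuses, with exponential backoff.
#
# >>> session = requests_retry_session(retries=3, backoff_factor=0.5)
# >>> response = session.get("https://example.org/flaky-endpoint")
# >>> response.raise_for_status()
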
def pickle_cache(path: str, overwrite: bool = False):
    """A decorator to cache a function call's result to pickle

    Attention: this does not take the function arguments into account;
    all function calls will be served by the same pickle file.

    Args:
        path (str): path to the cache pickle file
        overwrite (bool): should an existing cache file be deleted
            when the decorator is created?

    Returns:
        A decorator whose wrapped function's output will be cached to pickle.
    """

    if overwrite:
        if path_exists(path):
            if not os.path.isfile(path):
                logger.warning(
                    f"{path} is a GCS URI and cannot be deleted using overwrite = True"
                )
            else:
                logger.info(
                    f"Deleting {path} because file exists and overwrite is True"
                )
                os.remove(path)

    def decorator(fkt):
        def wrapper(*args, **kwargs):
            if path_exists(path):
                logger.info(
                    "Not running function %s but using cache file '%s' instead.",
                    fkt.__name__,
                    path,
                )
                dat = load_pickle(path)
            else:
                dat = fkt(*args, **kwargs)
                save_pickle(path, dat)
            return dat

        return wrapper

    return decorator

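# Usage sketch (cache path and function are hypothetical). Because arguments
# are ignored, every call is served from the same pickle once it exists:
#
# >>> @pickle_cache("/tmp/expensive_result.pkl")
# ... def expensive_computation():
# ...     return {"answer": 42}
# >>> dat = expensive_computation()  # first call runs and caches
# >>> dat = expensive_computation()  # subsequent calls load the pickle
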
def path_exists(path: str) -> bool:
    """Checks if a path or uri exists

    Args:
        path (str): path/uri

    Returns:
        bool: exists?
    """
    dir, file = os.path.split(path)
    try:
        with open_fs(dir) as f:
            return f.exists(file)
    except CreateFailed:
        # if the path is on GCS, the parent may not
        # exist even though the path itself does
        pass

    # if the path is a directory, it is enough
    # that it itself exists
    try:
        with open_fs(path):
            return True
    except CreateFailed:
        return False


def save_pickle(path: str, dat: object):
    """Saves object to path as pickle

    Args:
        path (str): target path
        dat (object): object
    """
    dir, file = get_target_base_and_path(path)
    with open_fs(dir, create=True) as target_fs:
        with target_fs.open(file, "wb") as f:
            pickle.dump(dat, f)


def load_pickle(path: str):
    """Loads a pickled object from path

    Args:
        path (str): path to pickle

    Returns:
        Any: Object
    """
    dir, file = get_source_base_and_path(path)
    with open_fs(dir) as source_fs:
        try:
            with source_fs.open(file, "rb") as f:
                return pickle.load(f)
        except ResourceNotFound as e:
            if hasattr(source_fs, "fix_storage"):
                logger.info(
                    "File could not be opened. Trying to fix storage for FS-GCFS. "
                    "This is required because of: https://fs-gcsfs.readthedocs.io/en/latest/#limitations "
                    "and will add empty blobs to indicate directories."
                )
                source_fs.fix_storage()
                # retry the read after fixing storage, mirroring copy_uri and load_json
                with source_fs.open(file, "rb") as f:
                    return pickle.load(f)
            else:
                raise e


read_pickle = load_pickle
write_pickle = save_pickle


def get_source_base_and_path(uri: str) -> tuple[str, str]:
    """Get the base of a bucket or folder and the path to the file

    Args:
        uri (str): uri

    Returns:
        tuple[str, str]: base: the base folder of the bucket
            path: the path to the file within the bucket

    Example:
        >>> get_source_base_and_path("gs://bucket/folder/file")
        ('gs://bucket', 'folder/file')
        >>> get_source_base_and_path("/bucket/folder/file")
        ('/bucket/folder', 'file')
    """
    uri = str(uri)
    urlelements = urlparse(uri)
    if len(urlelements.scheme) > 0:
        base = urlelements.scheme + "://" + urlelements.netloc
        path = urlelements.path[1:]
    else:
        base, path = os.path.split(uri)
    return base, path


def get_target_base_and_path(uri):
    """Get the base of a bucket + directory and the file

    Args:
        uri (str): uri

    Returns:
        tuple[str, str]: base: the base folder + path of the bucket
            file: the file

    Example:
        >>> get_target_base_and_path("gs://bucket/folder/file")
        ('gs://bucket/folder', 'file')
        >>> get_target_base_and_path("bucket/folder/file")
        ('bucket/folder', 'file')
        >>> get_target_base_and_path("/bucket/folder/file")
        ('/bucket/folder', 'file')
    """
    base, path = os.path.split(uri)
    return base, path


def copy_uri(input_uri: str, output_uri: str, is_file=True):
    """Copy a file or folder from one uri to another

    Args:
        input_uri (str): input file uri (gcs, http, ...)
        output_uri (str): path to output file (gcs, local)
        is_file (bool, optional): Is this a file or a folder? Defaults to True.
    """
    logger.info("Copy uri from %s to %s", input_uri, output_uri)
    source_base, source_path = get_source_base_and_path(input_uri)
    target_base, target_path = get_target_base_and_path(output_uri)
    if is_file:
        copy_fun = copy_file
    else:
        copy_fun = copy_dir
    with open_fs(source_base) as source_fs:
        with open_fs(target_base, create=True) as target_fs:
            try:
                copy_fun(source_fs, source_path, target_fs, target_path)
            except ResourceNotFound as e:
                if hasattr(source_fs, "fix_storage"):
                    logger.info(
                        "File could not be opened. Trying to fix storage for FS-GCFS. "
                        "This is required because of: https://fs-gcsfs.readthedocs.io/en/latest/#limitations "
                        "and will add empty blobs to indicate directories."
                    )
                    source_fs.fix_storage()
                    copy_fun(source_fs, source_path, target_fs, target_path)
                else:
                    raise e


def save_json(uri: str, object: Any) -> None:
    """Write an object to a json file at uri

    Args:
        uri (str): path to json file
        object (Any): object to write
    """
    target_base, target_path = get_target_base_and_path(uri)
    with open_fs(target_base, create=True) as target_fs:
        target_fs.writetext(target_path, json.dumps(object))


def load_json(uri: str) -> Any:
    """Read json from uri

    Args:
        uri (str): path to json file
    """
    logger.info("Read json from %s", uri)
    source_base, source_path = get_source_base_and_path(uri)
    with open_fs(source_base) as source_fs:
        try:
            txt = source_fs.readtext(source_path)
        except ResourceNotFound as e:
            if hasattr(source_fs, "fix_storage"):
                logger.info(
                    "File could not be opened. Trying to fix storage for FS-GCFS. "
                    "This is required because of: https://fs-gcsfs.readthedocs.io/en/latest/#limitations "
                    "and will add empty blobs to indicate directories."
                )
                source_fs.fix_storage()
                txt = source_fs.readtext(source_path)
            else:
                raise e
    return json.loads(txt)


def extract_regex_search(regex: str, query: str, index_value: int = 0) -> str:
    """
    Search for a regex match within a string and raise an error if none is found

    Args:
        regex (str): regular expression to search
        query (str): string to search against
        index_value (int): index of the match group to return (0 is the full match)

    Returns:
        match (str): a character string match
    """

    if m := re.search(regex, query):
        match = m[index_value]
    else:
        raise ValueError(
            f"{query} does not match the identifier regular expression: {regex}"
        )

    return match


def extract_regex_match(regex: str, query: str) -> str:
    """
    Match a regex at the start of a string and return its first capture group

    Args:
        regex (str): regular expression to search
        query (str): string to search against

    Returns:
        match (str): a character string match
    """

    if m := re.match(regex, query):
        if len(m.groups()) > 0:
            match = m.groups()[0]
        else:
            raise ValueError(
                f"{query} does not match a subgroup in the regular expression: {regex}"
            )
    else:
        raise ValueError(f"{query} does not match the regular expression: {regex}")

    return match

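# Illustrative contrast between the two regex helpers: extract_regex_search
# returns a match (or numbered group) found anywhere in the query, while
# extract_regex_match anchors at the start and requires a capture group.
#
# >>> extract_regex_search("[0-9]+", "chr7")
# '7'
# >>> extract_regex_match("chr([0-9]+)", "chr7")
# '7'
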
class match_pd_vars:
    """
    Match Pandas Variables.

    Attributes
    ----------
    req_vars:
        A set of variables which should exist in df
    missing_vars:
        Required variables which are not present in df
    extra_vars:
        Non-required variables which are present in df
    are_present:
        True if req_vars are present and False otherwise

    Methods
    -------
    assert_present()
        Raise an exception if req_vars are absent

    """

    def __init__(
        self, df: pd.DataFrame | pd.Series, req_vars: set, allow_series: bool = True
    ) -> None:
        """
        Compares the variables present in df to a set of required variables

        Parameters
        ----------
        df
            A pd.DataFrame or pd.Series
        req_vars
            A set of variables which should exist in df
        allow_series:
            Can a pd.Series be provided as df?

        Returns
        -------
        None.
        """

        if isinstance(df, pd.Series):
            if not allow_series:
                raise TypeError("df was a pd.Series and must be a pd.DataFrame")
            vars_present = set(df.index.tolist())
        elif isinstance(df, pd.DataFrame):
            vars_present = set(df.columns.tolist())
        else:
            raise TypeError(
                f"df was a {type(df).__name__} and must be a pd.DataFrame or pd.Series"
            )

        self.req_vars = req_vars
        self.missing_vars = req_vars.difference(vars_present)
        self.extra_vars = vars_present.difference(req_vars)

        if len(self.missing_vars) == 0:
            self.are_present = True
        else:
            self.are_present = False

    def assert_present(self) -> None:
        """
        Raise an error if required variables are missing
        """

        if not self.are_present:
            raise ValueError(
                f"{len(self.missing_vars)} required variables were "
                "missing from the provided pd.DataFrame or pd.Series: "
                f"{', '.join(self.missing_vars)}"
            )

        return None

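# Usage sketch (column names are illustrative):
#
# >>> df = pd.DataFrame({"s_id": ["s1"], "s_name": ["glucose"]})
# >>> match_pd_vars(df, req_vars={"s_id", "s_name"}).are_present
# True
# >>> match_pd_vars(df, req_vars={"s_id", "missing_var"}).assert_present()
# Traceback (most recent call last):
# ...
# ValueError: 1 required variables were missing from the provided pd.DataFrame or pd.Series: missing_var
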
def ensure_pd_df(pd_df_or_series: pd.DataFrame | pd.Series) -> pd.DataFrame:
    """
    Ensure Pandas DataFrame

    Convert a pd.Series to a DataFrame if needed.

    Args:
        pd_df_or_series (pd.Series | pd.DataFrame):
            a pandas df or series

    Returns:
        pd_df converted to a pd.DataFrame if needed

    """

    if isinstance(pd_df_or_series, pd.DataFrame):
        return pd_df_or_series
    elif isinstance(pd_df_or_series, pd.Series):
        return pd_df_or_series.to_frame().T
    else:
        raise TypeError(
            "ensure_pd_df expects either a pandas DataFrame or Series but received"
            f" a {type(pd_df_or_series)}"
        )


def format_identifiers_as_edgelist(
    df: pd.DataFrame, defining_vars: list[str]
) -> pd.DataFrame:
    """
    Format Identifiers as Edgelist

    Collapse a multiindex to an index (if needed), and similarly collapse multiple
    variables to a single entry. The resulting index-to-id pairs can be treated as
    an edgelist for greedy clustering.

    Args:
        df (pd.DataFrame):
            Any pd.DataFrame
        defining_vars (list(str)):
            A set of attributes which define a distinct entry in df

    Returns:
        df (pd.DataFrame):
            A pd.DataFrame with "ind" and "id" variables added indicating rolled-up
            values of the index and defining_vars
    """

    assert isinstance(df, pd.DataFrame)
    # requires a named index by convention
    if None in df.index.names:
        raise ValueError(
            "df did not have a named index. A named index or multiindex is expected"
        )

    assert isinstance(defining_vars, list)

    logger.info(
        f"creating an edgelist linking index levels {', '.join(df.index.names)} and linking it "
        f"to levels defined by {', '.join(defining_vars)}"
    )

    # df is a pd.DataFrame and contains defining_vars
    match_pd_vars(df, req_vars=set(defining_vars), allow_series=False).assert_present()

    # combine all components of a multiindex into a single index value
    if df.index.nlevels == 1:
        df.loc[:, "ind"] = ["ind_" + x for x in df.index]
    else:
        # handle a multiindex
        fstr = "ind_" + "_".join(["{}"] * df.index.nlevels)
        df.loc[:, "ind"] = list(starmap(fstr.format, df.index))

    # aggregate defining variables
    df.loc[:, "id"] = df[defining_vars].apply(
        lambda x: "id_" + "_".join(x.dropna().astype(str)), axis=1
    )

    return df


def find_weakly_connected_subgraphs(edgelist):
    """Find the weakly connected components of an ind-id edgelist."""

    assert isinstance(edgelist, pd.DataFrame)
    assert edgelist.shape[1] == 2
    assert edgelist.columns.tolist() == ["ind", "id"]
    # at least some entries in ind should start with "ind" because this is how we'll pull them out
    assert any(edgelist["ind"].str.startswith("ind"))

    id_graph = ig.Graph.TupleList(edgelist.itertuples(index=False))

    id_graph_names = [v.attributes()["name"] for v in id_graph.vs]
    id_graphs_clusters = id_graph.connected_components().membership
    id_graph_df = pd.DataFrame({"name": id_graph_names, "cluster": id_graphs_clusters})
    # clusters based on index or identifiers will be the same when joined to the id table
    ind_clusters = id_graph_df[id_graph_df.name.str.startswith("ind")].rename(
        columns={"name": "ind"}
    )

    return ind_clusters

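# Usage sketch: rows that share any defining identifier land in the same
# cluster. The toy DataFrame is illustrative; in practice defining_vars are
# identifier columns.
#
# >>> df = pd.DataFrame(
# ...     {"uniprot": ["P1", "P1", "P2"]},
# ...     index=pd.Index(["a", "b", "c"], name="row"),
# ... )
# >>> edgelist = format_identifiers_as_edgelist(df, ["uniprot"])
# >>> find_weakly_connected_subgraphs(edgelist[["ind", "id"]])
#      ind  cluster
# 0  ind_a        0
# 2  ind_b        0
# 3  ind_c        1
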
def style_df(
    df: pd.DataFrame,
    headers: Union[str, list[str], None] = "keys",
    hide_index: bool = False,
) -> pd.io.formats.style.Styler:
    """
    Style DataFrame

    Provide some simple options for styling a pd.DataFrame

    Args:
        df: pd.DataFrame
            A table to style
        headers:
            - "keys" to use the current column names
            - None to suppress column names
            - list[str] to overwrite and show column names
        hide_index: bool
            Should the index (row labels) be hidden?

    Returns:
        styled_df: pd.io.formats.style.Styler
            `df` with styles updated
    """

    if isinstance(headers, list):
        if len(headers) != df.shape[1]:
            raise ValueError(
                f"headers was a list with {len(headers)} entries, but df has {df.shape[1]} "
                "columns. These dimensions should match"
            )

        df.columns = headers  # type: ignore

    styled_df = df.style.format(precision=3).set_table_styles(
        [{"selector": "th", "props": "color: limegreen;"}]
    )

    if hide_index:
        styled_df = styled_df.hide(axis="index")

    if headers is None:
        return styled_df.hide(axis="columns")
    elif isinstance(headers, str):
        if headers == "keys":
            # just plot with the index as headers
            return styled_df
        else:
            raise ValueError(
                f"headers was a string: {headers} but this option is not recognized. "
                'The only defined value is "keys".'
            )
    else:
        assert isinstance(headers, list)
        return styled_df


def safe_series_tolist(x):
    """Convert either a str or a pd.Series to a list."""

    if isinstance(x, str):
        return [x]
    elif isinstance(x, pd.Series):
        return x.tolist()
    else:
        raise TypeError(f"x was a {type(x)} but only str and pd.Series are supported")


def check_unique_index(df, label=""):
    """Validate that each index value only maps to a single row."""

    if len(df.index) != len(df.index.unique()):
        raise ValueError(f"{label} index entries are not unique")

    return None


def score_nameness(string: str):
    """
    Score Nameness

    This utility assigns a numeric score to a string reflecting how likely it is to be
    a human-readable name. This will help to prioritize readable entries when we are
    trying to pick out a single name to display from a set of values which may also
    include entries like systematic ids.

    Args:
        string (str):
            An alphanumeric string

    Returns:
        score (int):
            An integer score indicating how name-like the string is (low is more name-like)
    """

    return (
        # string length
        len(string)
        # no-space penalty
        + (sum(c.isspace() for c in string) == 0) * 10
        # penalty for each number
        + sum(c.isdigit() for c in string) * 5
    )

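# Illustrative scores (lower is more name-like): length, a no-space penalty
# of 10, and 5 per digit all add up.
#
# >>> score_nameness("hexokinase 1")  # 12 chars + 1 digit * 5
# 17
# >>> score_nameness("HK1")  # 3 chars + no-space 10 + 1 digit * 5
# 18
# >>> score_nameness("ENSG00000156515")  # 15 chars + no-space 10 + 11 digits * 5
# 80
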
def click_str_to_list(string: str) -> list[str]:
    """Convert a string-based representation of a list inputted from the CLI into a list of strings."""

    var_extract_regex = re.compile("\\'?([a-zA-Z_]+)\\'?")

    re_search = re.search("^\\[(.*)\\]$", string)
    if re_search:
        return var_extract_regex.findall(re_search.group(0))
    else:
        raise ValueError(
            f"The provided string, {string}, could not be reformatted as a list. An example "
            "string which can be formatted is: \"['weights', 'upstream_weights']\""
        )


def _add_nameness_score_wrapper(df, name_var, table_schema):
    """Call _add_nameness_score if name_var is defined in the schema; otherwise add a constant score."""

    if name_var in table_schema.keys():
        return _add_nameness_score(df, table_schema[name_var])
    else:
        logger.debug(
            f"{name_var} is not defined in table_schema; adding a constant (1)"
        )
        return df.assign(nameness_score=1)


def _add_nameness_score(df, name_var):
    """Add a nameness_score variable which reflects how name-like each entry is."""

    df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
    return df