biocypher 0.9.2__py3-none-any.whl → 0.12.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biocypher/__init__.py CHANGED
@@ -11,6 +11,12 @@ __all__ = [
11
11
  "log",
12
12
  "Driver",
13
13
  "BioCypher",
14
+ "BioCypherWorkflow",
15
+ "Graph",
16
+ "Node",
17
+ "Edge",
18
+ "HyperEdge",
19
+ "create_workflow",
14
20
  "FileDownload",
15
21
  "APIRequest",
16
22
  ]
@@ -18,8 +24,10 @@ __all__ = [
18
24
  from ._config import config, module_data
19
25
  from ._core import BioCypher
20
26
  from ._get import APIRequest, FileDownload
27
+ from ._graph import Edge, Graph, HyperEdge, Node
21
28
  from ._logger import log, logfile, logger
22
29
  from ._metadata import __author__, __version__
30
+ from ._workflow import BioCypherWorkflow, create_workflow
23
31
 
24
32
 
25
33
  class Driver(BioCypher):
biocypher/_core.py CHANGED
@@ -301,7 +301,7 @@ class BioCypher:
301
301
 
302
302
  return self._translator
303
303
 
304
- def _get_writer(self):
304
+ def _initialize_writer(self) -> None:
305
305
  """Create writer if not online.
306
306
 
307
307
  Set as instance variable `self._writer`.
@@ -328,8 +328,6 @@ class BioCypher:
328
328
  msg = "Cannot get writer in online mode."
329
329
  raise NotImplementedError(msg)
330
330
 
331
- return self._writer
332
-
333
331
  def _get_driver(self):
334
332
  """Create driver if not exists.
335
333
 
@@ -385,7 +383,9 @@ class BioCypher:
385
383
  translated_nodes = self._translator.translate_entities(nodes)
386
384
 
387
385
  if self._offline:
388
- passed = self._get_writer().write_nodes(
386
+ if not self._writer:
387
+ self._initialize_writer()
388
+ passed = self._writer.write_nodes(
389
389
  translated_nodes,
390
390
  batch_size=batch_size,
391
391
  force=force,
@@ -688,6 +688,12 @@ class BioCypher:
688
688
  if not self._offline:
689
689
  msg = "Cannot write import call in online mode."
690
690
  raise NotImplementedError(msg)
691
+ else:
692
+ if not self._writer:
693
+ logger.warning(
694
+ "No edges or nodes were added, I'll try to continue, but you may want to double-check your data."
695
+ )
696
+ self._initialize_writer()
691
697
 
692
698
  return self._writer.write_import_call()
693
699
 
biocypher/_get.py CHANGED
@@ -1,5 +1,6 @@
1
- """
2
- BioCypher get module. Used to download and cache data from external sources.
1
+ """BioCypher get module.
2
+
3
+ Used to download and cache data from external sources.
3
4
  """
4
5
 
5
6
  from __future__ import annotations
@@ -30,19 +31,22 @@ class Resource(ABC):
30
31
  url_s: str | list[str],
31
32
  lifetime: int = 0,
32
33
  ):
33
- """
34
+ """Initialize a Resource.
35
+
34
36
  A Resource is a file, a list of files, an API request, or a list of API
35
37
  requests, any of which can be downloaded from the given URL(s) and
36
38
  cached locally. This class implements checks of the minimum requirements
37
39
  for a resource, to be implemented by a biocypher adapter.
38
40
 
39
41
  Args:
42
+ ----
40
43
  name (str): The name of the resource.
41
44
 
42
45
  url_s (str | list[str]): The URL or URLs of the resource.
43
46
 
44
47
  lifetime (int): The lifetime of the resource in days. If 0, the
45
48
  resource is considered to be permanent.
49
+
46
50
  """
47
51
  self.name = name
48
52
  self.url_s = url_s
@@ -57,10 +61,12 @@ class FileDownload(Resource):
57
61
  lifetime: int = 0,
58
62
  is_dir: bool = False,
59
63
  ):
60
- """
64
+ """Initialize a FileDownload object.
65
+
61
66
  Represents basic information for a File Download.
62
67
 
63
68
  Args:
69
+ ----
64
70
  name(str): The name of the File Download.
65
71
 
66
72
  url_s(str|list[str]): The URL(s) of the File Download.
@@ -69,18 +75,20 @@ class FileDownload(Resource):
69
75
  File Download is cached indefinitely.
70
76
 
71
77
  is_dir (bool): Whether the URL points to a directory or not.
72
- """
73
78
 
79
+ """
74
80
  super().__init__(name, url_s, lifetime)
75
81
  self.is_dir = is_dir
76
82
 
77
83
 
78
84
  class APIRequest(Resource):
79
85
  def __init__(self, name: str, url_s: str | list[str], lifetime: int = 0):
80
- """
86
+ """Initialize an APIRequest object.
87
+
81
88
  Represents basic information for an API Request.
82
89
 
83
90
  Args:
91
+ ----
84
92
  name(str): The name of the API Request.
85
93
 
86
94
  url_s(str|list): The URL of the API endpoint.
@@ -94,29 +102,35 @@ class APIRequest(Resource):
94
102
 
95
103
  class Downloader:
96
104
  def __init__(self, cache_dir: Optional[str] = None) -> None:
97
- """
105
+ """Initialize the Downloader.
106
+
98
107
  The Downloader is a class that manages resources that can be downloaded
99
108
  and cached locally. It manages the lifetime of downloaded resources by
100
109
  keeping a JSON record of the download date of each resource.
101
110
 
102
111
  Args:
112
+ ----
103
113
  cache_dir (str): The directory where the resources are cached. If
104
114
  not given, a temporary directory is created.
115
+
105
116
  """
106
117
  self.cache_dir = cache_dir or TemporaryDirectory().name
107
118
  self.cache_file = os.path.join(self.cache_dir, "cache.json")
108
119
  self.cache_dict = self._load_cache_dict()
109
120
 
110
121
  def download(self, *resources: Resource):
111
- """
112
- Download one or multiple resources. Load from cache if the resource is
113
- already downloaded and the cache is not expired.
122
+ """Download one or multiple resources.
123
+
124
+ Load from cache if the resource is already downloaded and the cache is
125
+ not expired.
114
126
 
115
127
  Args:
128
+ ----
116
129
  resources (Resource): The resource(s) to download or load from
117
130
  cache.
118
131
 
119
132
  Returns:
133
+ -------
120
134
  list[str]: The path or paths to the resource(s) that were downloaded
121
135
  or loaded from cache.
122
136
 
@@ -132,12 +146,14 @@ class Downloader:
132
146
  return paths
133
147
 
134
148
  def _download_or_cache(self, resource: Resource, cache: bool = True):
135
- """
136
- Download a resource if it is not cached or exceeded its lifetime.
149
+ """Download a resource if it is not cached or exceeded its lifetime.
137
150
 
138
151
  Args:
152
+ ----
139
153
  resource (Resource): The resource to download.
154
+
140
155
  Returns:
156
+ -------
141
157
  list[str]: The path or paths to the downloaded resource(s).
142
158
 
143
159
  """
@@ -159,14 +175,16 @@ class Downloader:
159
175
  return paths
160
176
 
161
177
  def _is_cache_expired(self, resource: Resource) -> bool:
162
- """
163
- Check if resource or API request cache is expired.
178
+ """Check if resource or API request cache is expired.
164
179
 
165
180
  Args:
181
+ ----
166
182
  resource (Resource): The resource to download.
167
183
 
168
184
  Returns:
185
+ -------
169
186
  bool: cache is expired or not.
187
+
170
188
  """
171
189
  cache_record = self._get_cache_record(resource)
172
190
  if cache_record:
@@ -182,17 +200,21 @@ class Downloader:
182
200
  if os.path.exists(cache_resource_path) and os.path.isdir(cache_resource_path):
183
201
  shutil.rmtree(cache_resource_path)
184
202
 
185
- def _download_files(self, cache, file_download: FileDownload):
186
- """
187
- Download a resource given it is a file or a directory and return the
188
- path.
203
+ def _download_files(self, cache, file_download: FileDownload) -> list[str]:
204
+ """Download a resource given it is a file or a directory.
205
+
206
+ Upon downloading, return the path(s).
189
207
 
190
208
  Args:
209
+ ----
191
210
  cache (bool): Whether to cache the resource or not.
211
+
192
212
  file_download (FileDownload): The resource to download.
193
213
 
194
214
  Returns:
215
+ -------
195
216
  list[str]: The path or paths to the downloaded resource(s).
217
+
196
218
  """
197
219
  if file_download.is_dir:
198
220
  files = self._get_files(file_download)
@@ -202,7 +224,7 @@ class Downloader:
202
224
  elif isinstance(file_download.url_s, list):
203
225
  paths = []
204
226
  for url in file_download.url_s:
205
- fname = url[url.rfind("/") + 1 :].split("?")[0]
227
+ fname = self._trim_filename(url)
206
228
  path = self._retrieve(
207
229
  url=url,
208
230
  fname=fname,
@@ -211,7 +233,7 @@ class Downloader:
211
233
  paths.append(path)
212
234
  else:
213
235
  paths = []
214
- fname = file_download.url_s[file_download.url_s.rfind("/") + 1 :].split("?")[0]
236
+ fname = self._trim_filename(file_download.url_s)
215
237
  results = self._retrieve(
216
238
  url=file_download.url_s,
217
239
  fname=fname,
@@ -227,20 +249,23 @@ class Downloader:
227
249
  # adapter
228
250
  return paths
229
251
 
230
- def _download_api_request(self, api_request: APIRequest):
231
- """
232
- Download an API request and return the path.
252
+ def _download_api_request(self, api_request: APIRequest) -> list[str]:
253
+ """Download an API request and return the path.
233
254
 
234
255
  Args:
235
- api_request(APIRequest): The API request result that is being cached.
256
+ ----
257
+ api_request(APIRequest): The API request result that is being
258
+ cached.
259
+
236
260
  Returns:
261
+ -------
237
262
  list[str]: The path to the cached API request.
238
263
 
239
264
  """
240
265
  urls = api_request.url_s if isinstance(api_request.url_s, list) else [api_request.url_s]
241
266
  paths = []
242
267
  for url in urls:
243
- fname = url[url.rfind("/") + 1 :].rsplit(".", 1)[0]
268
+ fname = self._trim_filename(url)
244
269
  logger.info(f"Asking for caching API of {api_request.name} {fname}.")
245
270
  response = requests.get(url=url)
246
271
 
@@ -260,10 +285,13 @@ class Downloader:
260
285
  """Get the cached version of a resource.
261
286
 
262
287
  Args:
288
+ ----
263
289
  resource(Resource): The resource to get the cached version of.
264
290
 
265
291
  Returns:
292
+ -------
266
293
  list[str]: The paths to the cached resource(s).
294
+
267
295
  """
268
296
  cached_location = os.path.join(self.cache_dir, resource.name)
269
297
  logger.info(f"Use cached version from {cached_location}.")
@@ -278,17 +306,25 @@ class Downloader:
278
306
  fname: str,
279
307
  path: str,
280
308
  known_hash: str = None,
281
- ):
282
- """
283
- Retrieve a file from a URL using Pooch. Infer type of file from
284
- extension and use appropriate processor.
309
+ ) -> str:
310
+ """Retrieve a file from a URL using Pooch.
311
+
312
+ Infer type of file from extension and use appropriate processor.
285
313
 
286
314
  Args:
315
+ ----
287
316
  url (str): The URL to retrieve the file from.
288
317
 
289
318
  fname (str): The name of the file.
290
319
 
291
320
  path (str): The path to the file.
321
+
322
+ known_hash (str): The known hash of the file.
323
+
324
+ Returns:
325
+ -------
326
+ str: The path to the file.
327
+
292
328
  """
293
329
  if fname.endswith(".zip"):
294
330
  return pooch.retrieve(
@@ -329,15 +365,17 @@ class Downloader:
329
365
  progressbar=True,
330
366
  )
331
367
 
332
- def _get_files(self, file_download: FileDownload):
333
- """
334
- Get the files contained in a directory file.
368
+ def _get_files(self, file_download: FileDownload) -> list[str]:
369
+ """Get the files contained in a directory file.
335
370
 
336
371
  Args:
372
+ ----
337
373
  file_download (FileDownload): The directory file.
338
374
 
339
375
  Returns:
340
- list: The files contained in the directory.
376
+ -------
377
+ list[str]: The files contained in the directory.
378
+
341
379
  """
342
380
  if file_download.url_s.startswith("ftp://"):
343
381
  # remove protocol
@@ -353,14 +391,25 @@ class Downloader:
353
391
  files = ftp.nlst()
354
392
  ftp.quit()
355
393
  else:
356
- raise NotImplementedError("Only FTP directories are supported at the moment.")
394
+ msg = "Only FTP directories are supported at the moment."
395
+ logger.error(msg)
396
+ raise NotImplementedError(msg)
357
397
 
358
398
  return files
359
399
 
360
- def _load_cache_dict(self):
361
- """
362
- Load the cache dictionary from the cache file. Create an empty cache
363
- file if it does not exist.
400
+ def _load_cache_dict(self) -> dict:
401
+ """Load the cache dictionary from the cache file.
402
+
403
+ Create an empty cache file if it does not exist.
404
+
405
+ Args:
406
+ ----
407
+ None.
408
+
409
+ Returns:
410
+ -------
411
+ dict: The cache dictionary.
412
+
364
413
  """
365
414
  if not os.path.exists(self.cache_dir):
366
415
  logger.info(f"Creating cache directory {self.cache_dir}.")
@@ -371,28 +420,31 @@ class Downloader:
371
420
  with open(self.cache_file, "w") as f:
372
421
  json.dump({}, f)
373
422
 
374
- with open(self.cache_file, "r") as f:
423
+ with open(self.cache_file) as f:
375
424
  logger.info(f"Loading cache file {self.cache_file}.")
376
425
  return json.load(f)
377
426
 
378
- def _get_cache_record(self, resource: Resource):
379
- """
380
- Get the cache record of a resource.
427
+ def _get_cache_record(self, resource: Resource) -> dict:
428
+ """Get the cache record of a resource.
381
429
 
382
430
  Args:
431
+ ----
383
432
  resource (Resource): The resource to get the cache record of.
384
433
 
385
434
  Returns:
386
- The cache record of the resource.
435
+ -------
436
+ dict: The cache record of the resource.
437
+
387
438
  """
388
439
  return self.cache_dict.get(resource.name, {})
389
440
 
390
- def _update_cache_record(self, resource: Resource):
391
- """
392
- Update the cache record of a resource.
441
+ def _update_cache_record(self, resource: Resource) -> None:
442
+ """Update the cache record of a resource.
393
443
 
394
444
  Args:
445
+ ----
395
446
  resource (Resource): The resource to update the cache record of.
447
+
396
448
  """
397
449
  cache_record = {}
398
450
  cache_record["url"] = to_list(resource.url_s)
@@ -401,3 +453,34 @@ class Downloader:
401
453
  self.cache_dict[resource.name] = cache_record
402
454
  with open(self.cache_file, "w") as f:
403
455
  json.dump(self.cache_dict, f, default=str)
456
+
457
+ def _trim_filename(self, url: str, max_length: int = 150) -> str:
458
+ """Create a trimmed filename from a URL.
459
+
460
+ If the URL exceeds max_length, create a hash of the filename.
461
+
462
+ Args:
463
+ ----
464
+ url (str): The URL to generate a filename from
465
+ max_length (int): Maximum filename length (default: 150)
466
+
467
+ Returns:
468
+ -------
469
+ str: A valid filename derived from the URL, trimmed if necessary
470
+
471
+ """
472
+ # Extract the filename from the URL
473
+ fname = url[url.rfind("/") + 1 :]
474
+
475
+ # Remove query parameters if present
476
+ if "?" in fname:
477
+ fname = fname.split("?")[0]
478
+
479
+ if len(fname) > max_length:
480
+ import hashlib
481
+
482
+ fname_trimmed = hashlib.md5(fname.encode()).hexdigest()
483
+ else:
484
+ fname_trimmed = fname
485
+
486
+ return fname_trimmed