biocypher 0.5.20__py3-none-any.whl → 0.5.21__py3-none-any.whl

This diff shows the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

This version of biocypher has been flagged as a potentially problematic release.

biocypher/_core.py CHANGED
@@ -24,6 +24,7 @@ from ._logger import logger
 
 logger.debug(f"Loading module {__name__}.")
 
+from ._get import Downloader
 from ._write import get_writer
 from ._config import config as _config
 from ._config import update_from_file as _file_update
@@ -307,12 +308,20 @@ class BioCypher:
 
         return self._pd.dfs
 
-    def add(self, entities):
+    def add(self, entities) -> None:
         """
         Function to add entities to the in-memory database. Accepts an iterable
         of tuples (if given, translates to ``BioCypherNode`` or
         ``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
         ``BioCypherEdge`` objects.
+
+        Args:
+            entities (iterable): An iterable of entities to add to the database.
+                Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
+                4-tuples for edges (deprecated).
+
+        Returns:
+            None
         """
         if not self._pd:
             self._pd = Pandas(
@@ -335,10 +344,28 @@ class BioCypher:
 
         self._pd.add_tables(tentities)
 
-    def add_nodes(self, nodes):
+    def add_nodes(self, nodes) -> None:
+        """
+        Wrapper for ``add()`` to add nodes to the in-memory database.
+
+        Args:
+            nodes (iterable): An iterable of node tuples to add to the database.
+
+        Returns:
+            None
+        """
         self.add(nodes)
 
-    def add_edges(self, edges):
+    def add_edges(self, edges) -> None:
+        """
+        Wrapper for ``add()`` to add edges to the in-memory database.
+
+        Args:
+            edges (iterable): An iterable of edge tuples to add to the database.
+
+        Returns:
+            None
+        """
         self.add(edges)
 
     def merge_nodes(self, nodes) -> bool:
@@ -389,6 +416,24 @@ class BioCypher:
         # write edge files
         return self._driver.add_biocypher_edges(tedges)
 
+    # DOWNLOAD AND CACHE MANAGEMENT METHODS ###
+
+    def _get_downloader(self):
+        """
+        Create downloader if not exists.
+        """
+
+        if not self._downloader:
+            self._downloader = Downloader()
+
+    def download(self, force: bool = False) -> None:
+        """
+        Use the :class:`Downloader` class to download or load from cache the
+        resources given by the adapter.
+        """
+
+        self._get_downloader()
+
     # OVERVIEW AND CONVENIENCE METHODS ###
 
     def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
biocypher/_get.py ADDED
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2021, Heidelberg University Clinic
+#
+# File author(s): Sebastian Lobentanzer
+# ...
+#
+# Distributed under MIT licence, see the file `LICENSE`.
+#
+"""
+BioCypher get module. Used to download and cache data from external sources.
+"""
+
+from __future__ import annotations
+
+from ._logger import logger
+
+logger.debug(f"Loading module {__name__}.")
+
+from datetime import datetime, timedelta
+from tempfile import TemporaryDirectory
+import os
+import json
+import ftplib
+
+import pooch
+
+from ._misc import to_list
+
+
+class Resource:
+    def __init__(
+        self,
+        name: str,
+        url_s: str | list[str],
+        lifetime: int = 0,
+        is_dir: bool = False,
+    ):
+        """
+        A resource is a file that can be downloaded from a URL and cached
+        locally. This class implements checks of the minimum requirements for
+        a resource, to be implemented by a biocypher adapter.
+
+        Args:
+            name (str): The name of the resource.
+
+            url_s (str | list[str]): The URL or URLs of the resource.
+
+            lifetime (int): The lifetime of the resource in days. If 0, the
+                resource is considered to be permanent.
+        """
+        self.name = name
+        self.url_s = url_s
+        self.lifetime = lifetime
+        self.is_dir = is_dir
+
+
+class Downloader:
+    def __init__(self, cache_dir: str):
+        """
+        A downloader is a collection of resources that can be downloaded
+        and cached locally. It manages the lifetime of downloaded resources by
+        keeping a JSON record of the download date of each resource.
+
+        Args:
+            cache_dir (str): The directory where the resources are cached. If
+                not given, a temporary directory is created.
+        """
+        self.cache_dir = cache_dir or TemporaryDirectory().name
+        self.cache_file = os.path.join(self.cache_dir, "cache.json")
+        self.cache_dict = self._load_cache_dict()
+
+    # download function that accepts a resource or a list of resources
+    def download(self, *resources: Resource):
+        """
+        Download one or multiple resources.
+
+        Args:
+            resources (Resource): The resource or resources to download.
+
+        Returns:
+            str or list: The path or paths to the downloaded resource(s).
+        """
+        paths = []
+        for resource in resources:
+            paths.append(self._download_or_cache(resource))
+
+        # flatten list if it is nested
+        if is_nested(paths):
+            paths = [path for sublist in paths for path in sublist]
+
+        return paths
+
+    def _download_or_cache(self, resource: Resource, cache: bool = True):
+        """
+        Download a resource if it is not cached or exceeded its lifetime.
+
+        Args:
+            resource (Resource): The resource to download.
+
+        Returns:
+            str or list: The path or paths to the downloaded resource(s).
+        """
+        # check if resource is cached
+        cache_record = self._get_cache_record(resource)
+
+        if cache_record:
+            # check if resource is expired (formatted in days)
+            dl = cache_record.get("date_downloaded")
+            lt = timedelta(days=resource.lifetime)
+            expired = dl + lt < datetime.now()
+        else:
+            expired = True
+
+        # download resource
+        if expired or not cache:
+            logger.info(f"Downloading resource {resource.name}.")
+
+            if resource.is_dir:
+                files = self._get_files(resource)
+                resource.url_s = [resource.url_s + "/" + file for file in files]
+                resource.is_dir = False
+                paths = self._download_or_cache(resource, cache)
+            elif isinstance(resource.url_s, list):
+                paths = []
+                for url in resource.url_s:
+                    fname = url[url.rfind("/") + 1 :]
+                    paths.append(
+                        self._retrieve(
+                            url=url,
+                            fname=fname,
+                            path=os.path.join(self.cache_dir, resource.name),
+                        )
+                    )
+            else:
+                fname = resource.url_s[resource.url_s.rfind("/") + 1 :]
+                paths = self._retrieve(
+                    url=resource.url_s,
+                    fname=fname,
+                    path=os.path.join(self.cache_dir, resource.name),
+                )
+
+            # sometimes a compressed file contains multiple files
+            # TODO ask for a list of files in the archive to be used from the
+            # adapter
+
+        # update cache record
+        self._update_cache_record(resource)
+
+        return paths
+
+    def _retrieve(
+        self,
+        url: str,
+        fname: str,
+        path: str,
+        known_hash: str = None,
+    ):
+        """
+        Retrieve a file from a URL using Pooch. Infer type of file from
+        extension and use appropriate processor.
+
+        Args:
+            url (str): The URL to retrieve the file from.
+
+            fname (str): The name of the file.
+
+            path (str): The path to the file.
+        """
+        if fname.endswith(".zip"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Unzip(),
+                progressbar=True,
+            )
+
+        elif fname.endswith(".tar.gz"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Untar(),
+                progressbar=True,
+            )
+
+        elif fname.endswith(".gz"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Decompress(),
+                progressbar=True,
+            )
+
+        else:
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                progressbar=True,
+            )
+
+    def _get_files(self, resource: Resource):
+        """
+        Get the files contained in a directory resource.
+
+        Args:
+            resource (Resource): The directory resource.
+
+        Returns:
+            list: The files contained in the directory.
+        """
+        if resource.url_s.startswith("ftp://"):
+            # remove protocol
+            url = resource.url_s[6:]
+            # get base url
+            url = url[: url.find("/")]
+            # get directory (remove initial slash as well)
+            dir = resource.url_s[7 + len(url) :]
+            # get files
+            ftp = ftplib.FTP(url)
+            ftp.login()
+            ftp.cwd(dir)
+            files = ftp.nlst()
+            ftp.quit()
+        else:
+            raise NotImplementedError(
+                "Only FTP directories are supported at the moment."
+            )
+
+        return files
+
+    def _load_cache_dict(self):
+        """
+        Load the cache dictionary from the cache file. Create an empty cache
+        file if it does not exist.
+        """
+        if not os.path.exists(self.cache_dir):
+            logger.info(f"Creating cache directory {self.cache_dir}.")
+            os.makedirs(self.cache_dir)
+
+        if not os.path.exists(self.cache_file):
+            logger.info(f"Creating cache file {self.cache_file}.")
+            with open(self.cache_file, "w") as f:
+                json.dump({}, f)
+
+        with open(self.cache_file, "r") as f:
+            logger.info(f"Loading cache file {self.cache_file}.")
+            return json.load(f)
+
+    def _get_cache_record(self, resource: Resource):
+        """
+        Get the cache record of a resource.
+
+        Args:
+            resource (Resource): The resource to get the cache record of.
+
+        Returns:
+            The cache record of the resource.
+        """
+        return self.cache_dict.get(resource.name, {})
+
+    def _update_cache_record(self, resource: Resource):
+        """
+        Update the cache record of a resource.
+
+        Args:
+            resource (Resource): The resource to update the cache record of.
+        """
+        cache_record = {}
+        cache_record["url"] = to_list(resource.url_s)
+        cache_record["date_downloaded"] = datetime.now()
+        cache_record["lifetime"] = resource.lifetime
+        self.cache_dict[resource.name] = cache_record
+        with open(self.cache_file, "w") as f:
+            json.dump(self.cache_dict, f, default=str)
+
+
+def is_nested(lst):
+    """
+    Check if a list is nested.
+
+    Args:
+        lst (list): The list to check.
+
+    Returns:
+        bool: True if the list is nested, False otherwise.
+    """
+    for item in lst:
+        if isinstance(item, list):
+            return True
+    return False
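A short usage sketch of the new Resource/Downloader pair added above; the resource name, URL, and cache directory are placeholders rather than real BioCypher resources:

from biocypher._get import Downloader, Resource

# The cache directory is a placeholder; passing None would fall back to a
# temporary directory (see Downloader.__init__ above).
downloader = Downloader(cache_dir="./.cache/biocypher")

resource = Resource(
    name="example_resource",                          # placeholder name
    url_s="https://example.org/data/example.tsv.gz",  # placeholder URL
    lifetime=7,  # re-download after 7 days; 0 means the cache never expires
)

# Returns the local path(s); the .gz extension makes _retrieve() use
# pooch.Decompress(), and the download date is recorded in cache.json.
paths = downloader.download(resource)
print(paths)

The intent, per the docstrings, is that adapters declare their inputs as Resource objects so that BioCypher can manage downloads and cache lifetimes centrally (see the download() stub added to _core.py above).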
biocypher/_metadata.py CHANGED
@@ -19,7 +19,7 @@ import importlib.metadata
 
 import toml
 
-_VERSION = "0.5.20"
+_VERSION = "0.5.21"
 
 
 def get_metadata():
biocypher-0.5.20.dist-info/METADATA → biocypher-0.5.21.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: biocypher
-Version: 0.5.20
+Version: 0.5.21
 Summary: A unifying framework for biomedical research knowledge graphs
 Home-page: https://github.com/biocypher/biocypher
 License: MIT
@@ -25,8 +25,10 @@ Requires-Dist: more_itertools
 Requires-Dist: neo4j-utils (==0.0.7)
 Requires-Dist: networkx (>=3.0,<4.0)
 Requires-Dist: pandas (>=2.0.1,<3.0.0)
+Requires-Dist: pooch (>=1.7.0,<2.0.0)
 Requires-Dist: rdflib (>=6.2.0,<7.0.0)
 Requires-Dist: stringcase (>=1.2.0,<2.0.0)
+Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: treelib (>=1.6.1,<2.0.0)
 Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues
 Project-URL: Repository, https://github.com/biocypher/biocypher
biocypher-0.5.20.dist-info/RECORD → biocypher-0.5.21.dist-info/RECORD RENAMED
@@ -6,18 +6,19 @@ biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36ia
 biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
 biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
 biocypher/_connect.py,sha256=0oSyO6CEIlKL8rHo-HHE7y0FzGfSi4vnEXSDy1TnIUE,12456
-biocypher/_core.py,sha256=fA0tRorzy3R1mgzzT77mFk-l6oQ01ZAfjg8l6KbPQYM,19882
+biocypher/_core.py,sha256=cc8iOOAhaByobN6zOwdUm1hZFAJ5CpGpKmQnBIIQrbY,21090
 biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
 biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
+biocypher/_get.py,sha256=MHjHEqvPr4Z7Ud05qBcUJkR--iZ1SgUvUoft8MfwUic,8996
 biocypher/_logger.py,sha256=soYtz1DiduLFw3XrMnphWWUxeuJqvSof4AYhlafxl08,2933
 biocypher/_mapping.py,sha256=XJZjmXTPnXVkyub1ZU0h3EKXQ2YROaGaJOaGyPMqgy4,9338
-biocypher/_metadata.py,sha256=Hmz4g_CSuqikUJ6EtLEq2GS7Z0BawtAsL0Wk-7AiE8c,1658
+biocypher/_metadata.py,sha256=CHGBWJ8qYrb7QNQO-Fk0ROkRDXHvtFECSoex9GytJ4A,1658
 biocypher/_misc.py,sha256=wsjGVOqBDVM5hxbE_TEaZ69u1kJc8HXwRAtQHUgE8XQ,4545
 biocypher/_ontology.py,sha256=pHc4hO8iZx-yg9gzqfBR9khoIni-lKAxWgnRFyNP91E,21530
 biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
 biocypher/_translate.py,sha256=nj4Y60F0U3JBH36N2dh5pFcC8Ot86rskJ2ChJwje9dI,16494
 biocypher/_write.py,sha256=2ynF-VkvTr8WT2qPt2wji3iupP3WON94TlT6NpfDvCs,67738
-biocypher-0.5.20.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
-biocypher-0.5.20.dist-info/WHEEL,sha256=vxFmldFsRN_Hx10GDvsdv1wroKq8r5Lzvjp6GZ4OO8c,88
-biocypher-0.5.20.dist-info/METADATA,sha256=B3VOakjkLgCjusCElMML-neoPoc869g4jNI45Bchibo,9429
-biocypher-0.5.20.dist-info/RECORD,,
+biocypher-0.5.21.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
+biocypher-0.5.21.dist-info/WHEEL,sha256=vxFmldFsRN_Hx10GDvsdv1wroKq8r5Lzvjp6GZ4OO8c,88
+biocypher-0.5.21.dist-info/METADATA,sha256=wJ1Hnuq_erwEJRMCKA3e7VeUF7cLibnZdcnSCryynx0,9505
+biocypher-0.5.21.dist-info/RECORD,,
+ biocypher-0.5.21.dist-info/RECORD,,