biocypher 0.5.19__py3-none-any.whl → 0.5.21__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their registry. It is provided for informational purposes only.


biocypher/_get.py ADDED
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2021, Heidelberg University Clinic
+#
+# File author(s): Sebastian Lobentanzer
+#                 ...
+#
+# Distributed under MIT licence, see the file `LICENSE`.
+#
+"""
+BioCypher get module. Used to download and cache data from external sources.
+"""
+
+from __future__ import annotations
+
+from ._logger import logger
+
+logger.debug(f"Loading module {__name__}.")
+
+from datetime import datetime, timedelta
+from tempfile import TemporaryDirectory
+import os
+import json
+import ftplib
+
+import pooch
+
+from ._misc import to_list
+
+
+class Resource:
+    def __init__(
+        self,
+        name: str,
+        url_s: str | list[str],
+        lifetime: int = 0,
+        is_dir: bool = False,
+    ):
+        """
+        A resource is a file that can be downloaded from a URL and cached
+        locally. This class implements checks of the minimum requirements for
+        a resource, to be implemented by a biocypher adapter.
+
+        Args:
+            name (str): The name of the resource.
+
+            url_s (str | list[str]): The URL or URLs of the resource.
+
+            lifetime (int): The lifetime of the resource in days. If 0, the
+                resource is considered to be permanent.
+        """
+        self.name = name
+        self.url_s = url_s
+        self.lifetime = lifetime
+        self.is_dir = is_dir
+
+
+class Downloader:
+    def __init__(self, cache_dir: str):
+        """
+        A downloader is a collection of resources that can be downloaded
+        and cached locally. It manages the lifetime of downloaded resources by
+        keeping a JSON record of the download date of each resource.
+
+        Args:
+            cache_dir (str): The directory where the resources are cached. If
+                not given, a temporary directory is created.
+        """
+        self.cache_dir = cache_dir or TemporaryDirectory().name
+        self.cache_file = os.path.join(self.cache_dir, "cache.json")
+        self.cache_dict = self._load_cache_dict()
+
+    # download function that accepts a resource or a list of resources
+    def download(self, *resources: Resource):
+        """
+        Download one or multiple resources.
+
+        Args:
+            resources (Resource): The resource or resources to download.
+
+        Returns:
+            str or list: The path or paths to the downloaded resource(s).
+        """
+        paths = []
+        for resource in resources:
+            paths.append(self._download_or_cache(resource))
+
+        # flatten list if it is nested
+        if is_nested(paths):
+            paths = [path for sublist in paths for path in sublist]
+
+        return paths
+
+    def _download_or_cache(self, resource: Resource, cache: bool = True):
+        """
+        Download a resource if it is not cached or exceeded its lifetime.
+
+        Args:
+            resource (Resource): The resource to download.
+
+        Returns:
+            str or list: The path or paths to the downloaded resource(s).
+        """
+        # check if resource is cached
+        cache_record = self._get_cache_record(resource)
+
+        if cache_record:
+            # check if resource is expired (formatted in days)
+            dl = cache_record.get("date_downloaded")
+            lt = timedelta(days=resource.lifetime)
+            expired = dl + lt < datetime.now()
+        else:
+            expired = True
+
+        # download resource
+        if expired or not cache:
+            logger.info(f"Downloading resource {resource.name}.")
+
+            if resource.is_dir:
+                files = self._get_files(resource)
+                resource.url_s = [resource.url_s + "/" + file for file in files]
+                resource.is_dir = False
+                paths = self._download_or_cache(resource, cache)
+            elif isinstance(resource.url_s, list):
+                paths = []
+                for url in resource.url_s:
+                    fname = url[url.rfind("/") + 1 :]
+                    paths.append(
+                        self._retrieve(
+                            url=url,
+                            fname=fname,
+                            path=os.path.join(self.cache_dir, resource.name),
+                        )
+                    )
+            else:
+                fname = resource.url_s[resource.url_s.rfind("/") + 1 :]
+                paths = self._retrieve(
+                    url=resource.url_s,
+                    fname=fname,
+                    path=os.path.join(self.cache_dir, resource.name),
+                )
+
+        # sometimes a compressed file contains multiple files
+        # TODO ask for a list of files in the archive to be used from the
+        # adapter
+
+        # update cache record
+        self._update_cache_record(resource)
+
+        return paths
+
+    def _retrieve(
+        self,
+        url: str,
+        fname: str,
+        path: str,
+        known_hash: str = None,
+    ):
+        """
+        Retrieve a file from a URL using Pooch. Infer type of file from
+        extension and use appropriate processor.
+
+        Args:
+            url (str): The URL to retrieve the file from.
+
+            fname (str): The name of the file.
+
+            path (str): The path to the file.
+        """
+        if fname.endswith(".zip"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Unzip(),
+                progressbar=True,
+            )
+
+        elif fname.endswith(".tar.gz"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Untar(),
+                progressbar=True,
+            )
+
+        elif fname.endswith(".gz"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Decompress(),
+                progressbar=True,
+            )
+
+        else:
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                progressbar=True,
+            )
+
+    def _get_files(self, resource: Resource):
+        """
+        Get the files contained in a directory resource.
+
+        Args:
+            resource (Resource): The directory resource.
+
+        Returns:
+            list: The files contained in the directory.
+        """
+        if resource.url_s.startswith("ftp://"):
+            # remove protocol
+            url = resource.url_s[6:]
+            # get base url
+            url = url[: url.find("/")]
+            # get directory (remove initial slash as well)
+            dir = resource.url_s[7 + len(url) :]
+            # get files
+            ftp = ftplib.FTP(url)
+            ftp.login()
+            ftp.cwd(dir)
+            files = ftp.nlst()
+            ftp.quit()
+        else:
+            raise NotImplementedError(
+                "Only FTP directories are supported at the moment."
+            )
+
+        return files
+
+    def _load_cache_dict(self):
+        """
+        Load the cache dictionary from the cache file. Create an empty cache
+        file if it does not exist.
+        """
+        if not os.path.exists(self.cache_dir):
+            logger.info(f"Creating cache directory {self.cache_dir}.")
+            os.makedirs(self.cache_dir)
+
+        if not os.path.exists(self.cache_file):
+            logger.info(f"Creating cache file {self.cache_file}.")
+            with open(self.cache_file, "w") as f:
+                json.dump({}, f)
+
+        with open(self.cache_file, "r") as f:
+            logger.info(f"Loading cache file {self.cache_file}.")
+            return json.load(f)
+
+    def _get_cache_record(self, resource: Resource):
+        """
+        Get the cache record of a resource.
+
+        Args:
+            resource (Resource): The resource to get the cache record of.
+
+        Returns:
+            The cache record of the resource.
+        """
+        return self.cache_dict.get(resource.name, {})
+
+    def _update_cache_record(self, resource: Resource):
+        """
+        Update the cache record of a resource.
+
+        Args:
+            resource (Resource): The resource to update the cache record of.
+        """
+        cache_record = {}
+        cache_record["url"] = to_list(resource.url_s)
+        cache_record["date_downloaded"] = datetime.now()
+        cache_record["lifetime"] = resource.lifetime
+        self.cache_dict[resource.name] = cache_record
+        with open(self.cache_file, "w") as f:
+            json.dump(self.cache_dict, f, default=str)
+
+
+def is_nested(lst):
+    """
+    Check if a list is nested.
+
+    Args:
+        lst (list): The list to check.
+
+    Returns:
+        bool: True if the list is nested, False otherwise.
+    """
+    for item in lst:
+        if isinstance(item, list):
+            return True
+    return False
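
The new module's public surface is small: a `Resource` describes what to fetch, and a `Downloader` handles caching and lifetime checks. A minimal usage sketch follows; the resource name, URL, and cache directory are illustrative placeholders, not taken from the package:

    from biocypher._get import Downloader, Resource

    # illustrative resource; name and URL are placeholders
    resource = Resource(
        name="example_data",
        url_s="https://example.org/files/data.csv.gz",  # .gz selects pooch.Decompress
        lifetime=7,  # treat the cached copy as stale after seven days
    )

    downloader = Downloader(cache_dir="./cache")
    paths = downloader.download(resource)  # path(s) to the cached file(s)

`download` accepts any number of resources and flattens nested results via the module-level `is_nested` helper; a `Resource` created with `is_dir=True` and an `ftp://` URL is first expanded into its member files by `_get_files`.
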
biocypher/_metadata.py CHANGED
@@ -19,7 +19,7 @@ import importlib.metadata
 
 import toml
 
-_VERSION = "0.5.19"
+_VERSION = "0.5.21"
 
 
 def get_metadata():
biocypher/_ontology.py CHANGED
@@ -269,7 +269,7 @@ class Ontology:
         """
 
         self._head_ontology_meta = head_ontology
-        self.extended_schema = ontology_mapping.extended_schema
+        self.mapping = ontology_mapping
         self._tail_ontology_meta = tail_ontologies
 
         self._tail_ontologies = None
@@ -403,7 +403,7 @@ class Ontology:
         if not self._nx_graph:
            self._nx_graph = self._head_ontology.get_nx_graph().copy()
 
-        for key, value in self.extended_schema.items():
+        for key, value in self.mapping.extended_schema.items():
            if not value.get("is_a"):
                if self._nx_graph.has_node(value.get("synonym_for")):
                    continue
@@ -485,7 +485,7 @@ class Ontology:
        setting the synonym as the primary node label.
        """
 
-        for key, value in self.extended_schema.items():
+        for key, value in self.mapping.extended_schema.items():
            if key in self._nx_graph.nodes:
                self._nx_graph.nodes[key].update(value)
 
@@ -541,9 +541,9 @@ class Ontology:
 
        if not full:
            # set of leaves and their intermediate parents up to the root
-            filter_nodes = set(self.extended_schema.keys())
+            filter_nodes = set(self.mapping.extended_schema.keys())
 
-            for node in self.extended_schema.keys():
+            for node in self.mapping.extended_schema.keys():
                filter_nodes.update(self.get_ancestors(node).nodes)
 
            # filter graph
@@ -557,11 +557,13 @@ class Ontology:
        tree = _misc.create_tree_visualisation(G)
 
        # add synonym information
-        for node in self.extended_schema:
-            if self.extended_schema[node].get("synonym_for"):
+        for node in self.mapping.extended_schema:
+            if not isinstance(self.mapping.extended_schema[node], dict):
+                continue
+            if self.mapping.extended_schema[node].get("synonym_for"):
                tree.nodes[node].tag = (
                    f"{node} = "
-                    f"{self.extended_schema[node].get('synonym_for')}"
+                    f"{self.mapping.extended_schema[node].get('synonym_for')}"
                )
 
        tree.show()
@@ -602,7 +604,7 @@ class Ontology:
            "node_id": self._get_current_id(),
            "node_label": "BioCypher",
            "properties": {
-                "schema": "self.extended_schema",
+                "schema": "self.ontology_mapping.extended_schema",
            },
        }
 
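
For downstream code, the practical effect of the `_ontology.py` change is that the schema dictionary is no longer copied onto the `Ontology` instance but reached through the retained mapping object. A before/after sketch, assuming an existing `Ontology` instance named `ontology`:

    schema = ontology.extended_schema          # 0.5.19
    schema = ontology.mapping.extended_schema  # 0.5.21

The added `isinstance` guard in the tree visualisation also means non-dict entries in `extended_schema` are now skipped instead of raising when `.get("synonym_for")` is called on them.
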
biocypher/_pandas.py CHANGED
@@ -1,11 +1,10 @@
 import pandas as pd
 
-from ._create import BioCypherEdge, BioCypherNode
+from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
 
 
 class Pandas:
-    def __init__(self, ontology, translator, deduplicator):
-        self.ontology = ontology
+    def __init__(self, translator, deduplicator):
         self.translator = translator
         self.deduplicator = deduplicator
 
@@ -18,22 +17,48 @@ class Pandas:
         """
         lists = {}
         for entity in entities:
-            if not isinstance(entity, BioCypherNode) and not isinstance(
-                entity, BioCypherEdge
+            if (
+                not isinstance(entity, BioCypherNode)
+                and not isinstance(entity, BioCypherEdge)
+                and not isinstance(entity, BioCypherRelAsNode)
             ):
                 raise TypeError(
-                    f"Expected a BioCypherNode or BioCypherEdge, got {type(entity)}."
+                    "Expected a BioCypherNode / BioCypherEdge / "
+                    f"BioCypherRelAsNode, got {type(entity)}."
                 )
 
             if isinstance(entity, BioCypherNode):
                 seen = self.deduplicator.node_seen(entity)
             elif isinstance(entity, BioCypherEdge):
                 seen = self.deduplicator.edge_seen(entity)
+            elif isinstance(entity, BioCypherRelAsNode):
+                seen = self.deduplicator.rel_as_node_seen(entity)
 
             if seen:
                 continue
 
-            _type = entity.get_label()
+            if isinstance(entity, BioCypherRelAsNode):
+                node = entity.get_node()
+                source_edge = entity.get_source_edge()
+                target_edge = entity.get_target_edge()
+
+                _type = node.get_type()
+                if not _type in lists:
+                    lists[_type] = []
+                lists[_type].append(node)
+
+                _source_type = source_edge.get_type()
+                if not _source_type in lists:
+                    lists[_source_type] = []
+                lists[_source_type].append(source_edge)
+
+                _target_type = target_edge.get_type()
+                if not _target_type in lists:
+                    lists[_target_type] = []
+                lists[_target_type].append(target_edge)
+                continue
+
+            _type = entity.get_type()
             if not _type in lists:
                 lists[_type] = []
             lists[_type].append(entity)
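
The in-memory Pandas backend now accepts `BioCypherRelAsNode` entities and splits each one into its intermediary node plus the two connecting edges, bucketed under their own types before frame creation. The same bucketing, extracted into a hypothetical standalone helper (`split_rel_as_node` is not part of the package):

    from collections import defaultdict

    def split_rel_as_node(rel, lists=None):
        # a BioCypherRelAsNode contributes one node and two edges,
        # each filed under its own type, mirroring the diff above
        lists = lists if lists is not None else defaultdict(list)
        for part in (
            rel.get_node(),
            rel.get_source_edge(),
            rel.get_target_edge(),
        ):
            lists[part.get_type()].append(part)
        return lists

Note also that plain nodes and edges are now keyed by `get_type()` rather than `get_label()`, and deduplication gains a dedicated `rel_as_node_seen` check.
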
biocypher/_translate.py CHANGED
@@ -23,7 +23,7 @@ from more_itertools import peekable
 
 from . import _misc
 from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
-from ._mapping import OntologyMapping
+from ._ontology import Ontology
 
 __all__ = ["BiolinkAdapter", "Translator"]
 
@@ -41,9 +41,7 @@ class Translator:
     and cypher queries.
     """
 
-    def __init__(
-        self, ontology_mapping: "OntologyMapping", strict_mode: bool = False
-    ):
+    def __init__(self, ontology: "Ontology", strict_mode: bool = False):
         """
         Args:
             leaves:
@@ -57,7 +55,7 @@
                 carry source, licence, and version information.
         """
 
-        self.extended_schema = ontology_mapping.extended_schema
+        self.ontology = ontology
         self.strict_mode = strict_mode
 
         # record nodes without biolink type configured in schema_config.yaml
@@ -71,7 +69,7 @@
 
     def translate_nodes(
         self,
-        id_type_prop_tuples: Iterable,
+        node_tuples: Iterable,
     ) -> Generator[BioCypherNode, None, None]:
         """
         Translates input node representation to a representation that
@@ -79,16 +77,16 @@
         requires explicit statement of node type on pass.
 
         Args:
-            id_type_tuples (list of tuples): collection of tuples
+            node_tuples (list of tuples): collection of tuples
                 representing individual nodes by their unique id and a type
                 that is translated from the original database notation to
                 the corresponding BioCypher notation.
 
         """
 
-        self._log_begin_translate(id_type_prop_tuples, "nodes")
+        self._log_begin_translate(node_tuples, "nodes")
 
-        for _id, _type, _props in id_type_prop_tuples:
+        for _id, _type, _props in node_tuples:
            # check for strict mode requirements
            required_props = ["source", "licence", "version"]
 
@@ -132,8 +130,9 @@
         """
 
         return (
-            self.extended_schema[_bl_type]["preferred_id"]
-            if "preferred_id" in self.extended_schema.get(_bl_type, {})
+            self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
+            if "preferred_id"
+            in self.ontology.mapping.extended_schema.get(_bl_type, {})
            else "id"
        )
 
@@ -142,7 +141,9 @@
        Filters properties for those specified in schema_config if any.
        """
 
-        filter_props = self.extended_schema[bl_type].get("properties", {})
+        filter_props = self.ontology.mapping.extended_schema[bl_type].get(
+            "properties", {}
+        )
 
        # strict mode: add required properties (only if there is a whitelist)
        if self.strict_mode and filter_props:
@@ -150,7 +151,7 @@
            {"source": "str", "licence": "str", "version": "str"},
        )
 
-        exclude_props = self.extended_schema[bl_type].get(
+        exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
            "exclude_properties", []
        )
 
@@ -188,7 +189,7 @@
 
    def translate_edges(
        self,
-        id_src_tar_type_prop_tuples: Iterable,
+        edge_tuples: Iterable,
    ) -> Generator[Union[BioCypherEdge, BioCypherRelAsNode], None, None]:
        """
        Translates input edge representation to a representation that
@@ -197,7 +198,7 @@
 
        Args:
 
-            id_src_tar_type_prop_tuples (list of tuples):
+            edge_tuples (list of tuples):
 
                collection of tuples representing source and target of
                an interaction via their unique ids as well as the type
@@ -206,18 +207,18 @@
            Can optionally possess its own ID.
        """
 
-        self._log_begin_translate(id_src_tar_type_prop_tuples, "edges")
+        self._log_begin_translate(edge_tuples, "edges")
 
        # legacy: deal with 4-tuples (no edge id)
        # TODO remove for performance reasons once safe
-        id_src_tar_type_prop_tuples = peekable(id_src_tar_type_prop_tuples)
-        if len(id_src_tar_type_prop_tuples.peek()) == 4:
-            id_src_tar_type_prop_tuples = [
+        edge_tuples = peekable(edge_tuples)
+        if len(edge_tuples.peek()) == 4:
+            edge_tuples = [
                (None, src, tar, typ, props)
-                for src, tar, typ, props in id_src_tar_type_prop_tuples
+                for src, tar, typ, props in edge_tuples
            ]
 
-        for _id, _src, _tar, _type, _props in id_src_tar_type_prop_tuples:
+        for _id, _src, _tar, _type, _props in edge_tuples:
            # check for strict mode requirements
            if self.strict_mode:
                if not "source" in _props:
@@ -239,7 +240,9 @@
            # filter properties for those specified in schema_config if any
            _filtered_props = self._filter_props(bl_type, _props)
 
-            rep = self.extended_schema[bl_type]["represented_as"]
+            rep = self.ontology.mapping.extended_schema[bl_type][
+                "represented_as"
+            ]
 
            if rep == "node":
                if _id:
@@ -295,9 +298,9 @@
                    yield BioCypherRelAsNode(n, e_s, e_t)
 
                else:
-                    edge_label = self.extended_schema[bl_type].get(
-                        "label_as_edge"
-                    )
+                    edge_label = self.ontology.mapping.extended_schema[
+                        bl_type
+                    ].get("label_as_edge")
 
                    if edge_label is None:
                        edge_label = bl_type
@@ -356,7 +359,7 @@
 
        self._ontology_mapping = {}
 
-        for key, value in self.extended_schema.items():
+        for key, value in self.ontology.mapping.extended_schema.items():
            labels = value.get("input_label") or value.get("label_in_input")
 
            if isinstance(labels, str):
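
Taken together, the `_translate.py` changes mean `Translator` is now constructed from a full `Ontology` rather than an `OntologyMapping`, with the schema reached through `translator.ontology.mapping.extended_schema`; the long tuple-iterable parameter names are shortened at the same time. A hedged migration sketch, assuming existing `ontology_mapping` and `ontology` objects:

    translator = Translator(ontology_mapping)  # 0.5.19
    translator = Translator(ontology)          # 0.5.21

    # parameter renames only; the tuple shapes are unchanged
    nodes = translator.translate_nodes(node_tuples)  # (id, type, props)
    edges = translator.translate_edges(edge_tuples)  # (id, src, tar, type, props),
                                                     # or legacy 4-tuples without id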