biocypher 0.5.41__tar.gz → 0.5.42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

Files changed (33) hide show
  1. {biocypher-0.5.41 → biocypher-0.5.42}/PKG-INFO +1 -1
  2. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_config/biocypher_config.yaml +7 -0
  3. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_metadata.py +1 -1
  4. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_misc.py +6 -1
  5. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_ontology.py +133 -53
  6. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/_batch_writer.py +11 -0
  7. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/_write.py +5 -0
  8. biocypher-0.5.42/biocypher/write/graph/_rdf.py +516 -0
  9. {biocypher-0.5.41 → biocypher-0.5.42}/pyproject.toml +1 -1
  10. {biocypher-0.5.41 → biocypher-0.5.42}/LICENSE +0 -0
  11. {biocypher-0.5.41 → biocypher-0.5.42}/README.md +0 -0
  12. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/__init__.py +0 -0
  13. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_config/__init__.py +0 -0
  14. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_config/test_config.yaml +0 -0
  15. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_config/test_schema_config.yaml +0 -0
  16. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_config/test_schema_config_disconnected.yaml +0 -0
  17. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_config/test_schema_config_extended.yaml +0 -0
  18. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_connect.py +0 -0
  19. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_core.py +0 -0
  20. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_create.py +0 -0
  21. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_deduplicate.py +0 -0
  22. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_get.py +0 -0
  23. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_logger.py +0 -0
  24. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_mapping.py +0 -0
  25. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_pandas.py +0 -0
  26. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/_translate.py +0 -0
  27. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/__init__.py +0 -0
  28. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/graph/__init__.py +0 -0
  29. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/graph/_arangodb.py +0 -0
  30. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/graph/_neo4j.py +0 -0
  31. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/relational/__init__.py +0 -0
  32. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/relational/_postgresql.py +0 -0
  33. {biocypher-0.5.41 → biocypher-0.5.42}/biocypher/write/relational/_sqlite.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: biocypher
3
- Version: 0.5.41
3
+ Version: 0.5.42
4
4
  Summary: A unifying framework for biomedical research knowledge graphs
5
5
  Home-page: https://github.com/biocypher/biocypher
6
6
  License: MIT
@@ -27,6 +27,7 @@ biocypher:
27
27
  head_ontology:
28
28
  url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
29
29
  root_node: entity
30
+ # switch_label_and_id: true
30
31
 
31
32
  ### Optional parameters ###
32
33
 
@@ -53,10 +54,12 @@ biocypher:
53
54
  # url: test/ontologies/so.owl
54
55
  # head_join_node: sequence variant
55
56
  # tail_join_node: sequence_variant
57
+ # switch_label_and_id: true
56
58
  # mondo:
57
59
  # url: test/ontologies/mondo.owl
58
60
  # head_join_node: disease
59
61
  # tail_join_node: disease
62
+ # switch_label_and_id: true
60
63
 
61
64
  ### DBMS configuration ###
62
65
 
@@ -113,6 +116,10 @@ postgresql:
113
116
  # import_call_bin_prefix: '' # path to "psql"
114
117
  # import_call_file_prefix: '/path/to/files'
115
118
 
119
+ rdf:
120
+ ### RDF configuration ###
121
+ rdf_format: turtle
122
+
116
123
  sqlite:
117
124
  ### SQLite configuration ###
118
125
 
@@ -19,7 +19,7 @@ import importlib.metadata
19
19
 
20
20
  import toml
21
21
 
22
- _VERSION = "0.5.41"
22
+ _VERSION = "0.5.42"
23
23
 
24
24
 
25
25
  def get_metadata():
@@ -115,7 +115,12 @@ def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
115
115
  )
116
116
  if multiple_parents_present:
117
117
  logger.warning(
118
- "The ontology contains multiple inheritance (one child node has multiple parent nodes). This is not visualized in the following hierarchy tree (the child node is only added once). If you want to browse all relationships of the parsed ontology write a graphml file to disk and view this file."
118
+ "The ontology contains multiple inheritance (one child node "
119
+ "has multiple parent nodes). This is not visualized in the "
120
+ "following hierarchy tree (the child node is only added once). "
121
+ "If you wish to browse all relationships of the parsed "
122
+ "ontologies, write a graphml file to disk using "
123
+ "`to_disk = <directory>` and view this file."
119
124
  )
120
125
 
121
126
  # unlist values
@@ -43,19 +43,19 @@ class OntologyAdapter:
43
43
  ontology is represented by a networkx.DiGraph object; an RDFlib graph is
44
44
  also kept. By default, the DiGraph reverses the label and identifier of the
45
45
  nodes, such that the node name in the graph is the human-readable label. The
46
- edges are oriented from child to parent. Going from the Biolink example,
47
- labels are formatted in lower sentence case. In some cases, this means that
48
- we replace underscores with spaces.
46
+ edges are oriented from child to parent.
47
+ Labels are formatted in lower sentence case and underscores are replaced by spaces.
48
+ Identifiers are taken as defined and the prefixes are removed by default.
49
49
  """
50
50
 
51
51
  def __init__(
52
52
  self,
53
53
  ontology_file: str,
54
54
  root_label: str,
55
- format: Optional[str] = None,
56
- head_join_node: Optional[str] = None,
55
+ ontology_file_format: Optional[str] = None,
56
+ head_join_node_label: Optional[str] = None,
57
57
  merge_nodes: Optional[bool] = True,
58
- reverse_labels: bool = True,
58
+ switch_label_and_id: bool = True,
59
59
  remove_prefixes: bool = True,
60
60
  ):
61
61
  """
@@ -68,7 +68,10 @@ class OntologyAdapter:
68
68
  root_label (str): The label of the root node in the ontology. In
69
69
  case of a tail ontology, this is the tail join node.
70
70
 
71
- head_join_node (str): Optional variable to store the label of the
71
+ ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml")
72
+ If format is not passed, it is determined automatically.
73
+
74
+ head_join_node_label (str): Optional variable to store the label of the
72
75
  node in the head ontology that should be used to join to the
73
76
  root node of the tail ontology. Defaults to None.
74
77
 
@@ -77,7 +80,7 @@ class OntologyAdapter:
77
80
  tail join node will be attached as a child of the head join
78
81
  node.
79
82
 
80
- reverse_labels (bool): If True, the node names in the graph will be
83
+ switch_label_and_id (bool): If True, the node names in the graph will be
81
84
  the human-readable labels. If False, the node names will be the
82
85
  identifiers. Defaults to True.
83
86
 
@@ -89,33 +92,37 @@ class OntologyAdapter:
89
92
 
90
93
  self._ontology_file = ontology_file
91
94
  self._root_label = root_label
92
- self._format = format
95
+ self._format = ontology_file_format
93
96
  self._merge_nodes = merge_nodes
94
- self._head_join_node = head_join_node
95
- self._reverse_labels = reverse_labels
97
+ self._head_join_node = head_join_node_label
98
+ self._switch_label_and_id = switch_label_and_id
96
99
  self._remove_prefixes = remove_prefixes
97
100
 
98
101
  self._rdf_graph = self._load_rdf_graph(ontology_file)
99
102
 
100
103
  self._nx_graph = self._rdf_to_nx(
101
- self._rdf_graph, root_label, reverse_labels
104
+ self._rdf_graph, root_label, switch_label_and_id
102
105
  )
103
106
 
104
107
  def _rdf_to_nx(
105
- self, _rdf_graph: rdflib.Graph, root_label: str, reverse_labels: bool
108
+ self,
109
+ _rdf_graph: rdflib.Graph,
110
+ root_label: str,
111
+ switch_label_and_id: bool,
112
+ rename_nodes: bool = True,
106
113
  ) -> nx.DiGraph:
107
114
  one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
108
115
  _rdf_graph
109
116
  )
110
117
  nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
111
- nx_graph_with_labels = self._add_labels_to_nodes(
112
- nx_graph, reverse_labels
118
+ nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
119
+ nx_graph = self._change_nodes_to_biocypher_format(
120
+ nx_graph, switch_label_and_id, rename_nodes
113
121
  )
114
- renamed_graph = self._rename_nodes(nx_graph_with_labels, reverse_labels)
115
- filtered_graph = self._get_all_ancestors(
116
- renamed_graph, root_label, reverse_labels
122
+ nx_graph = self._get_all_ancestors(
123
+ nx_graph, root_label, switch_label_and_id, rename_nodes
117
124
  )
118
- return nx.DiGraph(filtered_graph)
125
+ return nx.DiGraph(nx_graph)
119
126
 
120
127
  def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
121
128
  one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
@@ -239,19 +246,21 @@ class OntologyAdapter:
239
246
  return nx_graph
240
247
 
241
248
  def _add_labels_to_nodes(
242
- self, nx_graph: nx.DiGraph, reverse_labels: bool
249
+ self, nx_graph: nx.DiGraph, switch_label_and_id: bool
243
250
  ) -> nx.DiGraph:
244
251
  """Add labels to the nodes in the networkx graph.
245
252
 
246
253
  Args:
247
254
  nx_graph (nx.DiGraph): The networkx graph
248
- reverse_labels (bool): If True, id and label are switched
255
+ switch_label_and_id (bool): If True, id and label are switched
249
256
 
250
257
  Returns:
251
258
  nx.DiGraph: The networkx graph with labels
252
259
  """
253
260
  for node in list(nx_graph.nodes):
254
- nx_id, nx_label = self._get_nx_id_and_label(node, reverse_labels)
261
+ nx_id, nx_label = self._get_nx_id_and_label(
262
+ node, switch_label_and_id
263
+ )
255
264
  if nx_id == "none":
256
265
  # remove node if it has no id
257
266
  nx_graph.remove_node(node)
@@ -260,39 +269,56 @@ class OntologyAdapter:
260
269
  nx_graph.nodes[node]["label"] = nx_label
261
270
  return nx_graph
262
271
 
263
- def _rename_nodes(
264
- self, nx_graph: nx.DiGraph, reverse_labels: bool
272
+ def _change_nodes_to_biocypher_format(
273
+ self,
274
+ nx_graph: nx.DiGraph,
275
+ switch_label_and_id: bool,
276
+ rename_nodes: bool = True,
265
277
  ) -> nx.DiGraph:
266
- """Rename the nodes in the networkx graph (remove prefix and switch id and label).
278
+ """Change the nodes in the networkx graph to BioCypher format:
279
+ - remove the prefix of the identifier
280
+ - switch id and label
281
+ - adapt the labels (replace _ with space and convert to lower sentence case)
267
282
 
268
283
  Args:
269
284
  nx_graph (nx.DiGraph): The networkx graph
270
- reverse_labels (bool): If True, id and label are switched
285
+ switch_label_and_id (bool): If True, id and label are switched
286
+ rename_nodes (bool): If True, the nodes are renamed
271
287
 
272
288
  Returns:
273
- nx.DiGraph: The renamed networkx graph
289
+ nx.DiGraph: The networkx ontology graph in BioCypher format
274
290
  """
275
291
  mapping = {
276
- node: self._get_nx_id_and_label(node, reverse_labels)[0]
292
+ node: self._get_nx_id_and_label(
293
+ node, switch_label_and_id, rename_nodes
294
+ )[0]
277
295
  for node in nx_graph.nodes
278
296
  }
279
297
  renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
280
298
  return renamed
281
299
 
282
300
  def _get_all_ancestors(
283
- self, renamed: nx.DiGraph, root_label: str, reverse_labels: bool
301
+ self,
302
+ renamed: nx.DiGraph,
303
+ root_label: str,
304
+ switch_label_and_id: bool,
305
+ rename_nodes: bool = True,
284
306
  ) -> nx.DiGraph:
285
307
  """Get all ancestors of the root node in the networkx graph.
286
308
 
287
309
  Args:
288
310
  renamed (nx.DiGraph): The renamed networkx graph
289
311
  root_label (str): The label of the root node in the ontology
312
+ switch_label_and_id (bool): If True, id and label are switched
313
+ rename_nodes (bool): If True, the nodes are renamed
290
314
 
291
315
  Returns:
292
316
  nx.DiGraph: The filtered networkx graph
293
317
  """
294
318
  root = self._get_nx_id_and_label(
295
- self._find_root_label(self._rdf_graph, root_label), reverse_labels
319
+ self._find_root_label(self._rdf_graph, root_label),
320
+ switch_label_and_id,
321
+ rename_nodes,
296
322
  )[0]
297
323
  ancestors = nx.ancestors(renamed, root)
298
324
  ancestors.add(root)
@@ -300,7 +326,7 @@ class OntologyAdapter:
300
326
  return filtered_graph
301
327
 
302
328
  def _get_nx_id_and_label(
303
- self, node, switch_id_and_label: bool
329
+ self, node, switch_id_and_label: bool, rename_nodes: bool = True
304
330
  ) -> tuple[str, str]:
305
331
  """Rename node id and label for nx graph.
306
332
 
@@ -312,10 +338,10 @@ class OntologyAdapter:
312
338
  tuple[str, str]: The renamed node id and label
313
339
  """
314
340
  node_id_str = self._remove_prefix(str(node))
315
- node_label_str = str(
316
- self._rdf_graph.value(node, rdflib.RDFS.label)
317
- ).replace("_", " ")
318
- node_label_str = to_lower_sentence_case(node_label_str)
341
+ node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
342
+ if rename_nodes:
343
+ node_label_str = node_label_str.replace("_", " ")
344
+ node_label_str = to_lower_sentence_case(node_label_str)
319
345
  nx_id = node_label_str if switch_id_and_label else node_id_str
320
346
  nx_label = node_id_str if switch_id_and_label else node_label_str
321
347
  return nx_id, nx_label
@@ -330,8 +356,14 @@ class OntologyAdapter:
330
356
  root = label_subject
331
357
  break
332
358
  else:
359
+ labels_in_ontology = []
360
+ for label_subject, _, label_in_ontology in g.triples(
361
+ (None, rdflib.RDFS.label, None)
362
+ ):
363
+ labels_in_ontology.append(str(label_in_ontology))
333
364
  raise ValueError(
334
- f"Could not find root node with label {root_label}"
365
+ f"Could not find root node with label '{root_label}'. "
366
+ f"The ontology contains the following labels: {labels_in_ontology}"
335
367
  )
336
368
  return root
337
369
 
@@ -398,11 +430,29 @@ class OntologyAdapter:
398
430
  """
399
431
  return self._rdf_graph
400
432
 
401
- def get_root_label(self):
433
+ def get_root_node(self):
402
434
  """
403
- Get the label of the root node in the ontology.
435
+ Get root node in the ontology.
436
+
437
+ Returns:
438
+ root_node: If _switch_label_and_id is True, the root node label is returned,
439
+ otherwise the root node id is returned.
404
440
  """
405
- return self._root_label
441
+
442
+ root_node = None
443
+ root_label = self._root_label.replace("_", " ")
444
+
445
+ if self._switch_label_and_id:
446
+ root_node = to_lower_sentence_case(root_label)
447
+ elif not self._switch_label_and_id:
448
+ for node, data in self.get_nx_graph().nodes(data=True):
449
+ if "label" in data and data["label"] == to_lower_sentence_case(
450
+ root_label
451
+ ):
452
+ root_node = node
453
+ break
454
+
455
+ return root_node
406
456
 
407
457
  def get_ancestors(self, node_label):
408
458
  """
@@ -465,8 +515,8 @@ class Ontology:
465
515
 
466
516
  if self._tail_ontologies:
467
517
  for adapter in self._tail_ontologies.values():
468
- self._assert_join_node(adapter)
469
- self._join_ontologies(adapter)
518
+ head_join_node = self._get_head_join_node(adapter)
519
+ self._join_ontologies(adapter, head_join_node)
470
520
  else:
471
521
  self._nx_graph = self._head_ontology.get_nx_graph()
472
522
 
@@ -489,7 +539,10 @@ class Ontology:
489
539
  self._head_ontology = OntologyAdapter(
490
540
  ontology_file=self._head_ontology_meta["url"],
491
541
  root_label=self._head_ontology_meta["root_node"],
492
- format=self._head_ontology_meta.get("format", None),
542
+ ontology_file_format=self._head_ontology_meta.get("format", None),
543
+ switch_label_and_id=self._head_ontology_meta.get(
544
+ "switch_label_and_id", True
545
+ ),
493
546
  )
494
547
 
495
548
  if self._tail_ontology_meta:
@@ -498,12 +551,13 @@ class Ontology:
498
551
  self._tail_ontologies[key] = OntologyAdapter(
499
552
  ontology_file=value["url"],
500
553
  root_label=value["tail_join_node"],
501
- head_join_node=value["head_join_node"],
502
- format=value.get("format", None),
554
+ head_join_node_label=value["head_join_node"],
555
+ ontology_file_format=value.get("format", None),
503
556
  merge_nodes=value.get("merge_nodes", True),
557
+ switch_label_and_id=value.get("switch_label_and_id", True),
504
558
  )
505
559
 
506
- def _assert_join_node(self, adapter: OntologyAdapter) -> None:
560
+ def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
507
561
  """
508
562
  Tries to find the head join node of the given ontology adapter in the
509
563
  head ontology. If the join node is not found, the method will raise an
@@ -514,15 +568,41 @@ class Ontology:
514
568
  join node in the head ontology.
515
569
  """
516
570
 
517
- head_join_node = adapter.get_head_join_node()
571
+ head_join_node = None
572
+ user_defined_head_join_node_label = adapter.get_head_join_node()
573
+ head_join_node_label_in_bc_format = to_lower_sentence_case(
574
+ user_defined_head_join_node_label.replace("_", " ")
575
+ )
576
+
577
+ if self._head_ontology._switch_label_and_id:
578
+ head_join_node = head_join_node_label_in_bc_format
579
+ elif not self._head_ontology._switch_label_and_id:
580
+ for node_id, data in self._head_ontology.get_nx_graph().nodes(
581
+ data=True
582
+ ):
583
+ if (
584
+ "label" in data
585
+ and data["label"] == head_join_node_label_in_bc_format
586
+ ):
587
+ head_join_node = node_id
588
+ break
518
589
 
519
590
  if head_join_node not in self._head_ontology.get_nx_graph().nodes:
591
+ head_ontology = self._head_ontology._rdf_to_nx(
592
+ self._head_ontology.get_rdf_graph(),
593
+ self._head_ontology._root_label,
594
+ self._head_ontology._switch_label_and_id,
595
+ rename_nodes=False,
596
+ )
520
597
  raise ValueError(
521
- f"Head join node {head_join_node} not found in "
522
- f"head ontology."
598
+ f"Head join node '{head_join_node}' not found in head ontology. "
599
+ f"The head ontology contains the following nodes: {head_ontology.nodes}."
523
600
  )
601
+ return head_join_node
524
602
 
525
- def _join_ontologies(self, adapter: OntologyAdapter) -> None:
603
+ def _join_ontologies(
604
+ self, adapter: OntologyAdapter, head_join_node
605
+ ) -> None:
526
606
  """
527
607
  Joins the ontologies by adding the tail ontology as a subgraph to the
528
608
  head ontology at the specified join nodes.
@@ -535,8 +615,7 @@ class Ontology:
535
615
  if not self._nx_graph:
536
616
  self._nx_graph = self._head_ontology.get_nx_graph().copy()
537
617
 
538
- head_join_node = to_lower_sentence_case(adapter.get_head_join_node())
539
- tail_join_node = to_lower_sentence_case(adapter.get_root_label())
618
+ tail_join_node = adapter.get_root_node()
540
619
  tail_ontology = adapter.get_nx_graph()
541
620
 
542
621
  # subtree of tail ontology at join node
@@ -695,8 +774,9 @@ class Ontology:
695
774
  Args:
696
775
 
697
776
  to_disk (str): If specified, the ontology structure will be saved
698
- to disk as a GRAPHML file, to be opened in your favourite
699
- graph visualisation tool.
777
+ to disk as a GRAPHML file at the location (directory) specified
778
+ by the `to_disk` string, to be opened in your favourite graph
779
+ visualisation tool.
700
780
 
701
781
  full (bool): If True, the full ontology structure will be shown,
702
782
  including all nodes and edges. If False, only the nodes and
@@ -6,6 +6,7 @@ import os
6
6
  import re
7
7
  import glob
8
8
 
9
+ from rdflib import Graph
9
10
  from more_itertools import peekable
10
11
 
11
12
  from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
@@ -117,6 +118,8 @@ class _BatchWriter(ABC):
117
118
  db_password: str = None,
118
119
  db_host: str = None,
119
120
  db_port: str = None,
121
+ rdf_format: str = None,
122
+ rdf_namespaces: dict = {},
120
123
  ):
121
124
  """
122
125
 
@@ -196,12 +199,20 @@ class _BatchWriter(ABC):
196
199
 
197
200
  db_port:
198
201
  The database port.
202
+
203
+ rdf_format:
204
+ The format of RDF.
205
+
206
+ rdf_namespaces:
207
+ The namespaces for RDF.
199
208
  """
200
209
  self.db_name = db_name
201
210
  self.db_user = db_user
202
211
  self.db_password = db_password
203
212
  self.db_host = db_host or "localhost"
204
213
  self.db_port = db_port
214
+ self.rdf_format = rdf_format
215
+ self.rdf_namespaces = rdf_namespaces
205
216
 
206
217
  self.delim, self.escaped_delim = self._process_delimiter(delimiter)
207
218
  self.adelim, self.escaped_adelim = self._process_delimiter(
@@ -14,6 +14,7 @@ suitable for import into a DBMS.
14
14
  """
15
15
 
16
16
  from biocypher._logger import logger
17
+ from biocypher.write.graph._rdf import _RDFWriter
17
18
  from biocypher.write.graph._neo4j import _Neo4jBatchWriter
18
19
  from biocypher.write.graph._arangodb import _ArangoDBBatchWriter
19
20
  from biocypher.write.relational._sqlite import _SQLiteBatchWriter
@@ -43,6 +44,8 @@ DBMS_TO_CLASS = {
43
44
  "ArangoDB": _ArangoDBBatchWriter,
44
45
  "sqlite": _SQLiteBatchWriter,
45
46
  "sqlite3": _SQLiteBatchWriter,
47
+ "rdf": _RDFWriter,
48
+ "RDF": _RDFWriter,
46
49
  }
47
50
 
48
51
 
@@ -102,4 +105,6 @@ def get_writer(
102
105
  db_user=dbms_config.get("user"), # psql
103
106
  db_password=dbms_config.get("password"), # psql
104
107
  db_port=dbms_config.get("port"), # psql
108
+ rdf_format=dbms_config.get("rdf_format"), # rdf
109
+ rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
105
110
  )
@@ -0,0 +1,516 @@
1
+ #!/usr/bin/env python
2
+
3
+ #
4
+ # Copyright 2021, Heidelberg University Clinic
5
+ #
6
+ # File author(s): Loes van den Biggelaar
7
+ # Sebastian Lobentanzer
8
+ #
9
+ # Distributed under MIT licence, see the file `LICENSE`.
10
+ #
11
+ """
12
+ BioCypher 'offline' module. Handles the writing of node and edge representations
13
+ suitable for import into a DBMS.
14
+ """
15
+ from types import GeneratorType
16
+ from typing import Union
17
+ import os
18
+
19
+ from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
20
+ from rdflib.namespace import (
21
+ _NAMESPACE_PREFIXES_CORE,
22
+ _NAMESPACE_PREFIXES_RDFLIB,
23
+ )
24
+
25
+ from biocypher._create import BioCypherEdge, BioCypherNode
26
+ from biocypher._logger import logger
27
+ from biocypher.write._batch_writer import _BatchWriter
28
+
29
+
30
+ class _RDFWriter(_BatchWriter):
31
+
32
+ """
33
+ Class to write BioCypher's property graph into an RDF format using
34
+ rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
35
+ N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
36
+ is done keeping only the minimum information about node and edges,
37
+ skipping all properties.
38
+ """
39
+
40
+ def _get_import_script_name(self) -> str:
41
+ """
42
+ Returns the name of the RDF admin import script.
43
+ This function applicable for RDF export.
44
+
45
+ Returns:
46
+ str: The name of the import script (ending in .sh)
47
+ """
48
+ return "rdf-import-call.sh"
49
+
50
+ def _get_default_import_call_bin_prefix(self):
51
+ """
52
+ Method to provide the default string for the import call bin prefix.
53
+
54
+ Returns:
55
+ str: The default location for the RDF admin import location
56
+ """
57
+ return "bin/"
58
+
59
+ def _is_rdf_format_supported(self, rdf_format: str) -> bool:
60
+ """
61
+ Function to check if the specified RDF format is supported.
62
+
63
+ Args:
64
+ rdf_format (str): The RDF format to check.
65
+
66
+ Returns:
67
+ bool: Returns True if rdf format supported, False otherwise.
68
+ """
69
+ supported_formats = [
70
+ "xml",
71
+ "n3",
72
+ "turtle",
73
+ "nt",
74
+ "pretty-xml",
75
+ "trix",
76
+ "trig",
77
+ "nquads",
78
+ "json-ld",
79
+ ]
80
+ if rdf_format not in supported_formats:
81
+ logger.error(
82
+ f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
83
+ f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
84
+ )
85
+ return False
86
+ else:
87
+ # RDF graph does not support 'ttl' format, only 'turtle' format. however, the preferred file extension is always '.ttl'
88
+ if self.rdf_format == "turtle":
89
+ self.extension = "ttl"
90
+ elif self.rdf_format == "ttl":
91
+ self.rdf_format = "turtle"
92
+ self.extension = "ttl"
93
+ else:
94
+ self.extension = self.rdf_format
95
+ return True
96
+
97
+ def _write_single_edge_list_to_file(
98
+ self,
99
+ edge_list: list,
100
+ label: str,
101
+ prop_dict: dict,
102
+ ):
103
+ """
104
+ This function takes one list of biocypher edges and writes them
105
+ to an RDF file with the given format.
106
+
107
+ Args:
108
+ edge_list (list): list of BioCypherEdges to be written
109
+
110
+ label (str): the label (type) of the edge
111
+
112
+ prop_dict (dict): properties of node class passed from parsing
113
+ function and their types
114
+
115
+ Returns:
116
+ bool: The return value. True for success, False otherwise.
117
+ """
118
+
119
+ if not all(isinstance(n, BioCypherEdge) for n in edge_list):
120
+ logger.error("Edges must be passed as type BioCypherEdge.")
121
+ return False
122
+
123
+ # translate label to PascalCase
124
+ label_pascal = self.translator.name_sentence_to_pascal(label)
125
+
126
+ # create file name
127
+ file_name = os.path.join(
128
+ self._outdir, f"{label_pascal}.{self.extension}"
129
+ )
130
+
131
+ # write data in graph
132
+ graph = Graph()
133
+ self._init_namespaces(graph)
134
+
135
+ for edge in edge_list:
136
+ rdf_subject = edge.get_source_id()
137
+ rdf_object = edge.get_target_id()
138
+ rdf_predicate = edge.get_id()
139
+ rdf_properties = edge.get_properties()
140
+ if rdf_predicate == None:
141
+ rdf_predicate = rdf_subject + rdf_object
142
+
143
+ edge_label = self.translator.name_sentence_to_pascal(
144
+ edge.get_label()
145
+ )
146
+ edge_uri = self.rdf_namespaces["biocypher"][edge_label]
147
+ graph.add((edge_uri, RDF.type, RDFS.Class))
148
+ graph.add(
149
+ (
150
+ self.rdf_namespaces["biocypher"][rdf_predicate],
151
+ RDF.type,
152
+ edge_uri,
153
+ )
154
+ )
155
+ graph.add(
156
+ (
157
+ self.rdf_namespaces["biocypher"][rdf_predicate],
158
+ self.rdf_namespaces["biocypher"]["subject"],
159
+ self.subject_to_uri(rdf_subject),
160
+ )
161
+ )
162
+ graph.add(
163
+ (
164
+ self.rdf_namespaces["biocypher"][rdf_predicate],
165
+ self.rdf_namespaces["biocypher"]["object"],
166
+ self.subject_to_uri(rdf_object),
167
+ )
168
+ )
169
+
170
+ # add properties to the transformed edge --> node
171
+ for key, value in rdf_properties.items():
172
+ # only write value if it exists.
173
+ if value:
174
+ self.add_property_to_graph(graph, rdf_predicate, value, key)
175
+
176
+ graph.serialize(destination=file_name, format=self.rdf_format)
177
+
178
+ logger.info(
179
+ f"Writing {len(edge_list)} entries to {label_pascal}.{self.rdf_format}",
180
+ )
181
+
182
+ return True
183
+
184
+ def add_property_to_graph(
185
+ self,
186
+ graph: Graph,
187
+ rdf_subject: str,
188
+ rdf_object: str,
189
+ rdf_predicate: str,
190
+ ):
191
+ """
192
+ Function to add the properties to an RDF node. It takes the graph, the subject, object, and predicate of the RDF triple.
193
+ It checks if the property is a list and adds it to the graph accordingly. otherwise it checks if the string represents a list.
194
+ If it does, it transforms it to a list and adds it to the graph. if not, it adds the property to the graph as a literal.
195
+ If the property is neither a list or string, it will also be added as a literal.
196
+
197
+ Args:
198
+ graph (RDFLib.Graph): The RDF graph to add the nodes to.
199
+
200
+ rdf_subject (str): The subject of the RDF triple.
201
+
202
+ rdf_object (str): The object of the RDF triple.
203
+
204
+ rdf_predicate (str): The predicate of the RDF triple.
205
+
206
+ Returns:
207
+ None
208
+ """
209
+ if isinstance(rdf_object, list):
210
+ for obj in rdf_object:
211
+ graph.add(
212
+ (
213
+ self.subject_to_uri(rdf_subject),
214
+ self.property_to_uri(rdf_predicate),
215
+ Literal(obj),
216
+ )
217
+ )
218
+ elif isinstance(rdf_object, str):
219
+ if rdf_object.startswith("[") and rdf_object.endswith("]"):
220
+ self.add_property_to_graph(
221
+ graph,
222
+ rdf_subject,
223
+ self.transform_string_to_list(rdf_object),
224
+ rdf_predicate,
225
+ )
226
+ else:
227
+ graph.add(
228
+ (
229
+ self.subject_to_uri(rdf_subject),
230
+ self.property_to_uri(rdf_predicate),
231
+ Literal(rdf_object),
232
+ )
233
+ )
234
+ else:
235
+ graph.add(
236
+ (
237
+ self.subject_to_uri(rdf_subject),
238
+ self.property_to_uri(rdf_predicate),
239
+ Literal(rdf_object),
240
+ )
241
+ )
242
+
243
+ def transform_string_to_list(self, string_list: str) -> list:
244
+ """
245
+ Function to transform a string representation of a list into a list.
246
+
247
+ Args:
248
+ string_list (str): The string representation of the list.
249
+
250
+ Returns:
251
+ list: The list representation of the input string.
252
+ """
253
+ return (
254
+ string_list.replace("[", "")
255
+ .replace("]", "")
256
+ .replace("'", "")
257
+ .split(", ")
258
+ )
259
+
260
+ def _write_single_node_list_to_file(
261
+ self,
262
+ node_list: list,
263
+ label: str,
264
+ prop_dict: dict,
265
+ labels: str,
266
+ ):
267
+ """
268
+ This function takes a list of BioCypherNodes and writes them
269
+ to an RDF file in the specified format.
270
+
271
+ Args:
272
+ node_list (list): A list of BioCypherNodes to be written.
273
+
274
+ label (str): The label (type) of the nodes.
275
+
276
+ prop_dict (dict): A dictionary of properties and their types for the node class.
277
+
278
+ Returns:
279
+ bool: True if the writing is successful, False otherwise.
280
+ """
281
+ if not all(isinstance(n, BioCypherNode) for n in node_list):
282
+ logger.error("Nodes must be passed as type BioCypherNode.")
283
+ return False
284
+
285
+ # translate label to PascalCase
286
+ label_pascal = self.translator.name_sentence_to_pascal(label)
287
+
288
+ # create file name
289
+ file_name = os.path.join(
290
+ self._outdir, f"{label_pascal}.{self.extension}"
291
+ )
292
+
293
+ # write data in graph
294
+ graph = Graph()
295
+ self._init_namespaces(graph)
296
+
297
+ for n in node_list:
298
+ rdf_subject = n.get_id()
299
+ rdf_object = n.get_label()
300
+ properties = n.get_properties()
301
+ class_name = self.translator.name_sentence_to_pascal(rdf_object)
302
+ graph.add(
303
+ (
304
+ self.rdf_namespaces["biocypher"][class_name],
305
+ RDF.type,
306
+ RDFS.Class,
307
+ )
308
+ )
309
+ graph.add(
310
+ (
311
+ self.subject_to_uri(rdf_subject),
312
+ RDF.type,
313
+ self.rdf_namespaces["biocypher"][class_name],
314
+ )
315
+ )
316
+ for key, value in properties.items():
317
+ # only write value if it exists.
318
+ if value:
319
+ self.add_property_to_graph(graph, rdf_subject, value, key)
320
+
321
+ graph.serialize(destination=file_name, format=self.rdf_format)
322
+
323
+ logger.info(
324
+ f"Writing {len(node_list)} entries to {label_pascal}.{self.rdf_format}",
325
+ )
326
+
327
+ return True
328
+
329
def write_nodes(
    self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
    """
    Entry point for writing nodes in RDF format.

    The configured RDF format is validated first; only then is the actual
    work delegated to _write_node_data().

    Args:
        nodes (list or generator): A list or generator of nodes in BioCypherNode format.
        batch_size (int): The number of nodes to write in each batch.
        force (bool): Flag to force the writing even if the output file already exists.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    # bail out early when the requested serialisation is not supported
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing node data, wrong RDF format")
        return False

    # hand the node data over to the batch writing routine
    if self._write_node_data(nodes, batch_size, force):
        return True

    logger.error("Error while writing node data.")
    return False
354
+
355
def write_edges(
    self,
    edges: Union[list, GeneratorType],
    batch_size: int = int(1e6),
) -> bool:
    """
    Entry point for writing edges in RDF format.

    The configured RDF format is validated first; only then is the actual
    work delegated to _write_edge_data().

    Args:
        edges (BioCypherEdge): a list or generator of edges in
            :py:class:`BioCypherEdge` format
        batch_size (int): The number of edges to write in each batch.

    Returns:
        bool: The return value. True for success, False otherwise.
    """
    # bail out early when the requested serialisation is not supported
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing edge data, wrong RDF format")
        return False

    # hand the edge data over to the batch writing routine
    if self._write_edge_data(edges, batch_size=batch_size):
        return True

    logger.error("Error while writing edge data.")
    return False
384
+
385
+ def _construct_import_call(self) -> bool:
386
+ """
387
+ Function to write the import call.
388
+ This function is not applicable for RDF.
389
+
390
+ Returns:
391
+ bool: The return value. True for success, False otherwise.
392
+ """
393
+ return ""
394
+
395
+ def _write_array_string(self, string_list):
396
+ """
397
+ Abstract method to write the string representation of an array into a .csv file
398
+ as required by the RDF admin-import.
399
+ This function is not applicable for RDF.
400
+
401
+ Args:
402
+ string_list (list): list of ontology strings
403
+
404
+ Returns:
405
+ str: The string representation of an array for the neo4j admin import
406
+ """
407
+
408
+ return True
409
+
410
+ def _write_node_headers(self):
411
+ """
412
+ Abstract method that takes care of importing properties of a graph entity that is represented
413
+ as a node as per the definition in the `schema_config.yaml`
414
+ This function is not applicable for RDF.
415
+
416
+ Returns:
417
+ bool: The return value. True for success, False otherwise.
418
+ """
419
+ return True
420
+
421
+ def _write_edge_headers(self):
422
+ """
423
+ Abstract method to write a database import-file for a graph entity that is represented
424
+ as an edge as per the definition in the `schema_config.yaml`,
425
+ containing only the header for this type of edge.
426
+ This function is not applicable for RDF.
427
+
428
+ Returns:
429
+ bool: The return value. True for success, False otherwise.
430
+ """
431
+ return True
432
+
433
def subject_to_uri(self, subject: str) -> str:
    """
    Turns a subject identifier into a URI using the available namespaces.

    A subject of the form "prefix:id" whose prefix is a known namespace is
    resolved inside that namespace. Everything else (no colon, more than
    one colon, unknown prefix) is placed, unchanged, in the biocypher
    namespace.

    Args:
        subject (str): The subject to be converted to a URI.

    Returns:
        str: The corresponding URI for the subject.
    """
    try:
        # unpacking raises ValueError unless there is exactly one colon
        prefix, local_id = subject.split(":")
        if prefix in self.rdf_namespaces:
            return self.rdf_namespaces[prefix][local_id]
    except ValueError:
        # not a "prefix:id" pair; fall through to the default namespace
        pass

    return self.rdf_namespaces["biocypher"][subject]
453
+
454
def property_to_uri(self, property_name: str) -> str:
    """
    Converts a property name to its corresponding URI.

    The namespaces are searched in a fixed order and the first one that
    contains the property name wins:

    1. the namespaces that are core for rdflib (owl, rdf, rdfs, xsd and xml),
    2. SKOS, DC and DCTERMS,
    3. the other namespaces from rdflib.

    If none of them matches, the spelling "licence" is retried as
    "license"; any other name is placed in the biocypher namespace.

    Args:
        property_name (str): The property name to be converted to a URI.

    Returns:
        str: The corresponding URI for the input property name.
    """
    # one ordered pass instead of three identical loops; order matters
    # because the first namespace containing the name is used
    search_order = (
        *_NAMESPACE_PREFIXES_CORE.values(),
        SKOS,
        DC,
        DCTERMS,
        *_NAMESPACE_PREFIXES_RDFLIB.values(),
    )
    for namespace in search_order:
        if property_name in namespace:
            return namespace[property_name]

    # "licence" was not found anywhere, so retry with the spelling "license"
    if property_name == "licence":
        return self.property_to_uri("license")

    # TODO: add an option to search through manually implemented namespaces

    # TODO: give a warning and try to prevent this option altogether
    return self.rdf_namespaces["biocypher"][property_name]
491
+
492
def _init_namespaces(self, graph: Graph):
    """
    Prepares the namespaces that are used to convert nodes to URIs.

    The biocypher standard namespace is combined with whatever is already
    stored in `rdf_namespaces` (if anything). Every entry is then turned
    into a `Namespace` object, stored back under its prefix and bound to
    the given graph.

    Args:
        graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

    Returns:
        None
    """
    biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}

    # the standard entry is merged last, so it replaces an existing
    # entry that uses the same "biocypher" prefix
    self.rdf_namespaces = {
        **(self.rdf_namespaces or {}),
        **biocypher_standard,
    }

    # iterate over a snapshot because the values are replaced in place
    for prefix, uri in list(self.rdf_namespaces.items()):
        namespace = Namespace(uri)
        self.rdf_namespaces[prefix] = namespace
        graph.bind(prefix, namespace)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "biocypher"
3
- version = "0.5.41"
3
+ version = "0.5.42"
4
4
  description = "A unifying framework for biomedical research knowledge graphs"
5
5
  authors = [
6
6
  "Sebastian Lobentanzer <sebastian.lobentanzer@gmail.com>",
File without changes
File without changes
File without changes