biocypher 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

@@ -0,0 +1,569 @@
1
+ """Module to provide the OWL writer class."""
2
+ import os
3
+
4
+ from types import GeneratorType
5
+ from urllib.parse import quote_plus as url_quote
6
+
7
+ from rdflib import (
8
+ OWL,
9
+ RDF,
10
+ RDFS,
11
+ Literal,
12
+ )
13
+
14
+ from biocypher._create import BioCypherEdge, BioCypherNode
15
+ from biocypher._deduplicate import Deduplicator
16
+ from biocypher._logger import logger
17
+ from biocypher._translate import Translator
18
+ from biocypher.output.write.graph._rdf import _RDFWriter
19
+
20
+
21
class _OWLWriter(_RDFWriter):
    """Write BioCypher's graph into a self-contained OWL file.

    The resulting OWL file contains both the input vocabulary and
    the output instances.

    The behavior relies mainly on the `edge_model` parameter,
    which can take two values:

    - "ObjectProperty", which translates BioCypher's edges into
      OWL's object properties (if they are available under the
      selected root term). Object properties are the natural way
      to model edges in OWL, but they do not support annotation,
      thus being incompatible with having BioCypher's properties
      on edges.
      As most OWL files do not model a common term on top of both
      owl:topObjectProperty and owl:Thing, you may need to ensure
      that the input OWL contains a common ancestor honoring both:

      - owl:Thing rdfs:subClassOf <root_node>
      - owl:topObjectProperty rdfs:subPropertyOf <root_node>

      and that you select it in your BioCypher configuration.

    - "Association" (the default), which translates BioCypher's
      edges into OWL's class instances. Those edge instances are
      inserted in between the instances coming from BioCypher's nodes.
      This makes it possible to keep edge properties, but adds OWL
      instances to model relationships, which does not follow the
      classical OWL model. In this approach, all OWL instances are linked
      with a generic "edge_source" (linking the source instance to
      the association instance) and "edge_target" (linking the association
      instance to the target instance). Both inherit from "edge",
      and are in the biocypher namespace.

    This class takes care of keeping the vocabulary underneath the
    selected root node and exports it along the instances in the
    resulting OWL file. It discards whatever terms are not in the
    tree below the selected root node.

    To output a valid self-contained OWL file, it is required that
    you call *both* `write_nodes` *and* `write_edges`.

    This class heavily relies on the _RDFWriter class interface and code.
    """

    def __init__(
        self,
        translator: Translator,
        deduplicator: Deduplicator,
        delimiter: str,
        array_delimiter: str = ",",
        quote: str = '"',
        output_directory: str | None = None,
        db_name: str = "neo4j",
        import_call_bin_prefix: str | None = None,
        import_call_file_prefix: str | None = None,
        wipe: bool = True,
        strict_mode: bool = False,
        skip_bad_relationships: bool = False,
        skip_duplicate_nodes: bool = False,
        db_user: str | None = None,
        db_password: str | None = None,
        db_host: str | None = None,
        db_port: str | None = None,
        file_format: str | None = None,
        rdf_namespaces: dict | None = None,
        labels_order: str = "Ascending",
        edge_model: str = "Association",
        file_stem: str = "biocypher",
        **kwargs,
    ):
        """Initialize the OWL writer.

        All arguments except `edge_model` and `file_stem` are forwarded
        to the `_RDFWriter` constructor.

        Args:
        ----
            translator:
                Instance of :py:class:`Translator` to enable translation of
                nodes and manipulation of properties.

            deduplicator:
                Instance of :py:class:`Deduplicator` to enable deduplication
                of nodes and edges.

            delimiter:
                The delimiter to use for the CSV files.

            array_delimiter:
                The delimiter to use for array properties.

            quote:
                The quote character to use for the CSV files.

            output_directory:
                Path for exporting CSV files.

            db_name:
                Name of the database that will be used in the generated
                commands.

            import_call_bin_prefix:
                Path prefix for the admin import call binary.

            import_call_file_prefix:
                Path prefix for the data files (headers and parts) in the import
                call.

            wipe:
                Whether to force import (removing existing DB content).
                (Specific to Neo4j.)

            strict_mode:
                Whether to enforce source, version, and license properties.

            skip_bad_relationships:
                Whether to skip relationships that do not have a valid
                start and end node. (Specific to Neo4j.)

            skip_duplicate_nodes:
                Whether to skip duplicate nodes. (Specific to Neo4j.)

            db_user:
                The database user.

            db_password:
                The database password.

            db_host:
                The database host. Defaults to localhost.

            db_port:
                The database port.

            file_format:
                The format of RDF.

            rdf_namespaces:
                The namespaces for RDF. When omitted, a new empty dict is
                created for this instance.

            labels_order:
                Forwarded unchanged to the `_RDFWriter` constructor.

            edge_model:
                Whether to model an edge as OWL's "ObjectProperty" (discards
                edges properties) or "Association" (adds an intermediate node
                that holds the edge properties).

            file_stem:
                The stem (name without the path and extension) of the output
                OWL file. The extension is determined from `file_format`.

            **kwargs:
                Additional keyword arguments forwarded to the `_RDFWriter`
                constructor.

        Raises:
        ------
            ValueError: If `edge_model` is neither "Association" nor
                "ObjectProperty".

        """
        # A `{}` default would be one dict object shared by every instance
        # (and handed to the parent writer), so build a fresh one per call.
        if rdf_namespaces is None:
            rdf_namespaces = {}

        super().__init__(
            translator=translator,
            deduplicator=deduplicator,
            delimiter=delimiter,
            array_delimiter=array_delimiter,
            quote=quote,
            output_directory=output_directory,
            db_name=db_name,
            import_call_bin_prefix=import_call_bin_prefix,
            import_call_file_prefix=import_call_file_prefix,
            wipe=wipe,
            strict_mode=strict_mode,
            skip_bad_relationships=skip_bad_relationships,
            skip_duplicate_nodes=skip_duplicate_nodes,
            db_user=db_user,
            db_password=db_password,
            db_host=db_host,
            db_port=db_port,
            file_format=file_format,
            rdf_namespaces=rdf_namespaces,
            labels_order=labels_order,
            **kwargs,
        )

        # Starts with the loaded ontologies RDF graph,
        # so as to keep the declared vocabulary.
        self.graph = self.translator.ontology.get_rdf_graph()
        self._init_namespaces(self.graph)

        # Write guards because Biocypher has `write_nodes` and `write_edges`,
        # but not `write`, so we need to ensure to call both.
        self._has_nodes = False
        self._has_edges = False

        self.edge_models = ["Association", "ObjectProperty"]
        if edge_model not in self.edge_models:
            msg = f"`edge_model` cannot be '{edge_model}', but should be either: {' or '.join(self.edge_models)}"
            logger.error(msg)
            raise ValueError(msg)
        self.edge_model = edge_model

        self.file_stem = file_stem

    def _find_or_declare_class_uri(self, node_label: str, already_found: dict):
        """Return the URI of the class that instances of `node_label` get.

        Walks the ancestor chain of `node_label` pairwise (ancestor, child).
        For each pair the child URI is resolved, first with a direct triple
        lookup, then with `self.find_uri`, and finally by declaring the
        child in the biocypher namespace.

        Args:
        ----
            node_label (str): The BioCypher label of the node.

            already_found (dict): Cache mapping PascalCase labels to the URIs
                resolved through the slow search. It is updated in place.

        Returns:
        -------
            The URI resolved for the last label of the ancestor chain, or
            None if the chain holds fewer than two labels.

        Raises:
        ------
            RuntimeError: If an ancestor term cannot be found in the graph.

        """
        all_labels = list(reversed(list(self.translator.ontology.get_ancestors(node_label).nodes)))
        logger.debug(f"\tVocabulary ancestors: {all_labels}")

        uri_current = None

        # Create types in ancestors that would not exist in the vocabulary.
        # For those that exists, get the URI (and thus the correct namespace).
        for ancestor, current_class in zip(all_labels, all_labels[1:], strict=False):
            logger.debug(f"\t\t'{current_class}' is_a '{ancestor}'")
            ancestor_label = self.translator.name_sentence_to_pascal(ancestor)
            current_label = self.translator.name_sentence_to_pascal(current_class)

            # Fast search using default (or biocypher, if no default) namespace.
            rdf_currents = list(
                self.graph.triples(
                    (
                        self.to_uri(current_label),
                        RDFS.subClassOf,
                        self.to_uri(ancestor_label),
                    ),
                ),
            )

            if rdf_currents:
                # Found by fast search: the subject of the triple is the class.
                uri_current = rdf_currents[0][0]
                continue

            # Slow search with SPARQL queries.

            # Use cache if term has been found already.
            if ancestor_label in already_found:
                uri_ancestor = already_found[ancestor_label]
            else:
                # Use SPARQL queries to get a term with an existing namespace.
                # Because the missing term may be just in another namespace.
                # But we don't want to SPARQL before, because it is so slow.
                # FIXME this is VERY slow, maybe we can recover the namespaces
                #       from some BioCypher data structure?

                # Note: using \\b in the regexp does not seems to work.
                uri_ancestor = self.find_uri(f"#{ancestor_label}$")
                if not uri_ancestor:
                    msg = f"I found no term with subject URI matching `#{ancestor_label}$`, but it should exist"
                    logger.error(msg)
                    raise RuntimeError(msg)

                already_found[ancestor_label] = uri_ancestor

            if current_label in already_found:
                # Re-use the cached URI. Without this assignment the value
                # left over from a previous pair (or a previous node) would
                # be used to type the instance.
                uri_current = already_found[current_label]
            else:
                uri_current = self.find_uri(f"#{current_label}$")
                if not uri_current:
                    uri_current = self.as_uri(current_label, "biocypher")
                    # Create the term in biocypher namespace.
                    # NOTE(review): the fast search above looks for
                    # rdfs:subClassOf, whereas this declares rdf:type;
                    # confirm that this is intended.
                    self.graph.add(
                        (
                            uri_current,
                            RDF.type,
                            uri_ancestor,
                        ),
                    )
                    logger.debug(f"\t\t\t[{uri_current}]--(type)->[{uri_ancestor}]")

                already_found[current_label] = uri_current

        return uri_current

    def _write_single_node_list_to_file(
        self,
        node_list: list,
        label: str,
        prop_dict: dict,
        labels: str,
    ) -> bool:
        """Save a list of BioCypherNodes in the graph.

        This function takes a list of BioCypherNodes and saves them in
        `self.graph`. It re-uses RDFWriter's machinery, hence the misleading
        name.

        Nodes are modelled as class instances, being also
        owl:NamedIndividual.

        Args:
        ----
            node_list (list): A list of BioCypherNodes to be written.

            label (str): The label (type) of the nodes.

            prop_dict (dict): A dictionary of properties and their types for the
                node class.

            labels (str): string of one or several concatenated labels

        Returns:
        -------
            bool: True for success, False otherwise.

        Raises:
        ------
            RuntimeError: If no class can be resolved for a node, or if one
                of its ancestor terms is missing from the graph.

        """
        # NOTE: despite its name, this function does not write to file,
        #       but to self.graph.
        # NOTE: labels and prop_dict are not used.

        if not all(isinstance(n, BioCypherNode) for n in node_list):
            logger.error("Nodes must be passed as type BioCypherNode.")
            return False

        # Cache for terms with specific namespaces.
        already_found = {}

        for n in node_list:
            rdf_subject = url_quote(n.get_id())
            properties = n.get_properties()
            logger.debug(f"Node Class: [{rdf_subject}]")

            uri_current = self._find_or_declare_class_uri(n.get_label(), already_found)
            if uri_current is None:
                # The ancestor chain held fewer than two labels, so there is
                # no class to attach the instance to.
                msg = f"I found no class to type node '{n.get_id()}' with label '{n.get_label()}'"
                logger.error(msg)
                raise RuntimeError(msg)

            subject_uri = self.to_uri(rdf_subject)

            # Add the instance.
            self.graph.add((subject_uri, RDF.type, uri_current))
            logger.debug(f"\t[{rdf_subject}]--(type)->[{uri_current}]")

            # The instance is also a NamedIndividual, in OWL.
            self.graph.add((subject_uri, RDF.type, OWL.NamedIndividual))
            logger.debug(f"\t[{rdf_subject}]--(type)->[NamedIndividual]")

            # Add a readable label.
            self.graph.add((subject_uri, RDFS.label, Literal(n.get_id())))
            logger.debug(f"\t[{rdf_subject}]--(label)->[{n.get_id()}]")

            # Add properties.
            for key, value in properties.items():
                # only write value if it exists.
                if value:
                    self.add_property_to_graph(self.graph, rdf_subject, value, key)

        self._has_nodes = True
        return True

    def _write_single_edge_list_to_file(
        self,
        edge_list: list,
        label: str,
        prop_dict: dict,
    ) -> bool:
        """Save a list of BioCypherEdges in the graph.

        This function takes a list of BioCypherEdges and saves them in
        `self.graph`. It re-uses RDFWriter's machinery, hence the misleading
        name.

        Args:
        ----
            edge_list (list): list of BioCypherEdges to be written

            label (str): the label (type) of the edge

            prop_dict (dict): properties of the edge class passed from parsing
                function and their types

        Returns:
        -------
            bool: True for success, False otherwise.

        Raises:
        ------
            ValueError: If `self.edge_model` is not one of
                `self.edge_models` while an edge is being written.

        """
        # NOTE: despite its name, this function does not write to file,
        #       but to self.graph.
        # NOTE: label and prop_dict are not used.

        if not all(isinstance(n, BioCypherEdge) for n in edge_list):
            logger.error("Edges must be passed as type BioCypherEdge.")
            return False

        for edge in edge_list:
            rdf_subject = url_quote(edge.get_source_id())
            rdf_object = url_quote(edge.get_target_id())
            rdf_properties = edge.get_properties()

            edge_label = url_quote(edge.get_label())
            edge_uri = self.to_uri(edge_label)

            if self.edge_model == "ObjectProperty":
                # Add to the subject the property toward the object.
                self.graph.add(
                    (
                        self.to_uri(rdf_subject),
                        edge_uri,
                        self.to_uri(rdf_object),
                    ),
                )
                logger.debug(f"Edge ObjectProperty: [{rdf_subject}]--({edge_label})->[{rdf_object}]")

            elif self.edge_model == "Association":
                # Modelling edges as Association allows for attaching
                # data properties to an intermediate node.
                logger.debug(f"EDGE Association: [{rdf_subject}]--({edge_label})->[{rdf_object}]")

                if edge.get_id():
                    rdf_id = url_quote(edge.get_id())
                else:
                    # We need an instance to attach properties.
                    rdf_id = url_quote(f"{rdf_subject}--{edge.get_label()}--{rdf_object}")

                # Add object class modelling the edge.
                # NOTE (from https://www.w3.org/TR/owl-ref/):
                # owl:Class is defined as a subclass of rdfs:Class. The rationale for
                # having a separate OWL class construct lies in the restrictions on
                # OWL DL (and thus also on OWL Lite), which imply that not all RDFS
                # classes are legal OWL DL classes. In OWL Full these restrictions
                # do not exist and therefore owl:Class and rdfs:Class are equivalent
                # in OWL Full.
                self.graph.add((edge_uri, RDF.type, OWL.Class))
                logger.debug(f"\tEdge object: [{edge_label}]--(type)->[Class]")

                # Instantiate the edge object.
                self.graph.add((self.to_uri(rdf_id), RDF.type, edge_uri))
                logger.debug(f"\tEdge object instance: [{rdf_id}]--(type)->[{edge_label}]")

                # ObjectProperties modelling the subject and object
                # parts of the links around the object.
                # edge_source and edge_target inherits from edge,
                # and are in the biocypher namespace.
                base_property = self.as_uri("edge", "biocypher")
                source_property = self.as_uri("edge_source", "biocypher")
                target_property = self.as_uri("edge_target", "biocypher")

                self.graph.add((base_property, RDF.type, OWL.ObjectProperty))
                logger.debug("\tBase ObjectProperty type: [edge]--(type)->[ObjectProperty]")

                self.graph.add((source_property, RDFS.subPropertyOf, base_property))
                logger.debug("\tLeft ObjectProperty type: [edge_source]--(type)->[edge]")

                self.graph.add((target_property, RDFS.subPropertyOf, base_property))
                logger.debug("\tRight ObjectProperty type: [edge_target]--(type)->[edge]")

                # NOTE(review): the association instance is typed through
                # `to_uri(rdf_id)` above but linked through
                # `as_uri(rdf_id, "biocypher")` here; confirm that both
                # resolve to the same URI.
                association_uri = self.as_uri(rdf_id, "biocypher")

                self.graph.add((self.to_uri(rdf_subject), source_property, association_uri))
                logger.debug(f"\tLeft ObjectProperty: [{rdf_subject}]--(edge_source)->[{rdf_id}]")

                self.graph.add((association_uri, target_property, self.to_uri(rdf_object)))
                logger.debug(f"\tRight ObjectProperty: [{rdf_id}]--(edge_target)->[{rdf_object}]")

                # Add properties to the edge modelled as an instance.
                for key, value in rdf_properties.items():
                    # only write value if it exists.
                    if value:
                        self.add_property_to_graph(self.graph, rdf_id, value, key)

            else:
                # Only reachable if `edge_model` was changed after __init__.
                # Raise explicitly: an `assert` would vanish under `python -O`.
                msg = (
                    f"`edge_model` cannot be '{self.edge_model}', "
                    f"but should be either: {' or '.join(self.edge_models)}"
                )
                logger.error(msg)
                raise ValueError(msg)

        self._has_edges = True
        return True

    def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False) -> bool:
        """Insert nodes in `self.graph`.

        It calls _write_node_data, which calls _write_single_node_list_to_file.

        Args:
        ----
            nodes (list or generator): A list or generator of nodes in
                BioCypherNode format.
            batch_size (int): The number of nodes to write in each batch.
            force (bool): Flag to force the writing even if the output file
                already exists.

        Returns:
        -------
            bool: True if the writing is successful, False otherwise.

        """
        # Calls _write_single_node_list_to_file, which sets self._has_nodes.
        if not super().write_nodes(nodes, batch_size, force):
            return False

        # Attempt at writing the file.
        self._write_file()
        return True

    def write_edges(
        self,
        edges: list | GeneratorType,
        batch_size: int = int(1e6),
    ) -> bool:
        """Insert edges in `self.graph`.

        It calls _write_edge_data, which calls _write_single_edge_list_to_file.

        Args:
        ----
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The number of edges to write in each batch.

        Returns:
        -------
            bool: The return value. True for success, False otherwise.

        """
        # Calls _write_single_edge_list_to_file, which sets self._has_edges.
        if not super().write_edges(edges, batch_size):
            return False

        # Attempt at writing the file.
        self._write_file()
        return True

    def _write_file(self):
        """Write an OWL file if nodes and edges are ready in self.graph.

        Nothing is written until both `_has_nodes` and `_has_edges` are set.
        """
        if self._has_nodes and self._has_edges:
            file_name = os.path.join(self.outdir, f"{self.file_stem}.{self.extension}")
            logger.info(f"Writing {len(self.graph)} terms to {file_name}")
            self.graph.serialize(destination=file_name, format=self.file_format)