cognite-neat 0.119.2__py3-none-any.whl → 0.119.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-neat might be problematic. Click here for more details.

Files changed (30) hide show
  1. cognite/neat/_constants.py +34 -70
  2. cognite/neat/_graph/extractors/__init__.py +0 -6
  3. cognite/neat/_graph/loaders/_rdf2dms.py +5 -5
  4. cognite/neat/_graph/queries/__init__.py +1 -1
  5. cognite/neat/_graph/queries/_base.py +2 -456
  6. cognite/neat/_graph/queries/_queries.py +16 -0
  7. cognite/neat/_graph/queries/_select.py +440 -0
  8. cognite/neat/_graph/queries/_update.py +37 -0
  9. cognite/neat/_rules/exporters/_rules2excel.py +240 -107
  10. cognite/neat/_rules/models/_base_rules.py +16 -1
  11. cognite/neat/_rules/models/dms/_validation.py +10 -1
  12. cognite/neat/_rules/transformers/_converters.py +16 -6
  13. cognite/neat/_session/_drop.py +2 -2
  14. cognite/neat/_session/_explore.py +4 -4
  15. cognite/neat/_session/_prepare.py +5 -5
  16. cognite/neat/_session/_read.py +6 -0
  17. cognite/neat/_session/_set.py +3 -3
  18. cognite/neat/_session/_show.py +1 -1
  19. cognite/neat/_session/_template.py +21 -2
  20. cognite/neat/_state/README.md +23 -0
  21. cognite/neat/_store/_graph_store.py +5 -5
  22. cognite/neat/_version.py +1 -1
  23. {cognite_neat-0.119.2.dist-info → cognite_neat-0.119.3.dist-info}/METADATA +37 -2
  24. {cognite_neat-0.119.2.dist-info → cognite_neat-0.119.3.dist-info}/RECORD +27 -26
  25. cognite/neat/_graph/extractors/_dexpi.py +0 -234
  26. cognite/neat/_graph/extractors/_iodd.py +0 -403
  27. cognite/neat/_graph/transformers/_iodd.py +0 -30
  28. {cognite_neat-0.119.2.dist-info → cognite_neat-0.119.3.dist-info}/LICENSE +0 -0
  29. {cognite_neat-0.119.2.dist-info → cognite_neat-0.119.3.dist-info}/WHEEL +0 -0
  30. {cognite_neat-0.119.2.dist-info → cognite_neat-0.119.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,440 @@
1
+ import urllib.parse
2
+ from collections import defaultdict
3
+ from collections.abc import Iterable
4
+ from typing import Any, Literal, cast, overload
5
+
6
+ from rdflib import RDF, XSD, Namespace, URIRef
7
+ from rdflib import Literal as RdfLiteral
8
+ from rdflib.query import ResultRow
9
+
10
+ from cognite.neat._constants import NEAT
11
+ from cognite.neat._shared import InstanceType
12
+ from cognite.neat._utils.rdf_ import remove_namespace_from_uri
13
+
14
+ from ._base import BaseQuery
15
+
16
+
17
class SelectQueries(BaseQuery):
    """This class holds a set of SPARQL queries which are reading triples from the knowledge graph.
    The read queries are executed against query endpoint, and typically start with SELECT statement.
    """

    def summarize_instances(self, named_graph: URIRef | None = None) -> list[tuple[str, int]]:
        """Summarize instances in the graph store by class and count

        Args:
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            List of (class name with namespace removed, instance count) tuples,
            ordered by descending instance count
        """
        query_statement = """ SELECT ?class (COUNT(?instance) AS ?instanceCount)
                             WHERE {
                             ?instance a ?class .
                             }
                             GROUP BY ?class
                             ORDER BY DESC(?instanceCount) """

        return [  # type: ignore[misc]
            (
                remove_namespace_from_uri(cast(URIRef, class_)),
                cast(RdfLiteral, count).value,
            )
            for class_, count in self.graph(named_graph=named_graph).query(query_statement)
        ]

    def types(self, named_graph: URIRef | None = None) -> dict[URIRef, str]:
        """Types and their short form in the graph

        Args:
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Dictionary mapping each type URI to its form with the namespace removed
        """
        query = """SELECT DISTINCT ?type
               WHERE {?s a ?type .}"""

        return {  # type: ignore[misc, index, arg-type]
            cast(URIRef, type_): remove_namespace_from_uri(cast(URIRef, type_))
            for (type_,) in list(self.graph(named_graph).query(query))
        }

    def type_uri(self, type_: str, named_graph: URIRef | None = None) -> list[URIRef]:
        """Get the URIRef of a type

        Args:
            type_: Short form (namespace removed) of the type to find URIRef for
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            All type URIs whose short form equals ``type_`` (possibly empty)
        """
        return [uri for uri, short_form in self.types(named_graph).items() if short_form == type_]

    def properties(self, named_graph: URIRef | None = None) -> dict[URIRef, str]:
        """Properties and their short form in the graph

        Args:
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Dictionary mapping each property URI (rdf:type excluded) to its form
            with the namespace removed
        """
        query = """SELECT DISTINCT ?property
               WHERE {?s ?property ?o . FILTER(?property != rdf:type)}"""
        return {  # type: ignore[misc, index, arg-type]
            cast(URIRef, property_): remove_namespace_from_uri(cast(URIRef, property_))
            for (property_,) in list(self.graph(named_graph).query(query))
        }

    def properties_by_type(self, named_graph: URIRef | None = None) -> dict[URIRef, dict[URIRef, str]]:
        """Properties and their short form in the graph by type

        Args:
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Dictionary keyed by type URI; each value maps property URI to the
            property's form with the namespace removed
        """
        query = """SELECT DISTINCT ?type ?property
               WHERE {?s a ?type . ?s ?property ?o . FILTER(?property != rdf:type)}"""
        properties_by_type: dict[URIRef, dict[URIRef, str]] = defaultdict(dict)
        for type_, property_ in cast(ResultRow, list(self.graph(named_graph).query(query))):
            properties_by_type[type_][property_] = remove_namespace_from_uri(property_)  # type: ignore[index]
        return properties_by_type

    def property_uri(self, property_: str, named_graph: URIRef | None = None) -> list[URIRef]:
        """Get the URIRef of a property

        Args:
            property_: Property to find URIRef for
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            All property URIs whose short form equals ``property_`` (possibly empty)
        """
        return [uri for uri, short_form in self.properties(named_graph).items() if short_form == property_]

    @overload
    def list_instances_ids(
        self, class_uri: None = None, limit: int = -1, named_graph: URIRef | None = None
    ) -> Iterable[tuple[URIRef, URIRef]]: ...

    @overload
    def list_instances_ids(
        self, class_uri: URIRef, limit: int = -1, named_graph: URIRef | None = None
    ) -> Iterable[URIRef]: ...

    def list_instances_ids(
        self, class_uri: URIRef | None = None, limit: int = -1, named_graph: URIRef | None = None
    ) -> Iterable[URIRef] | Iterable[tuple[URIRef, URIRef]]:
        """List all instance IDs

        Args:
            class_uri: Class for which instances are to be found, default None (all instances)
            limit: Max number of instances to return, by default -1 meaning all instances
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Iterable of instance URIs when ``class_uri`` is given, otherwise an
            iterable of (instance URI, type URI) tuples
        """
        query = "SELECT DISTINCT ?subject"
        if class_uri:
            query += f" WHERE {{ ?subject a <{class_uri}> .}}"
        else:
            # Plain (non-f) string: braces must be single here, otherwise they
            # end up doubled in the SPARQL text.
            query += " ?type WHERE { ?subject a ?type .}"
        if limit != -1:
            query += f" LIMIT {limit}"
        # MyPy is not very happy with RDFLib, so just ignore the type hinting here
        return (tuple(res) if class_uri is None else res[0] for res in self.graph(named_graph).query(query))  # type: ignore[index, return-value, arg-type]

    def type_with_property(self, type_: URIRef, property_uri: URIRef, named_graph: URIRef | None = None) -> bool:
        """Check if a property exists in the graph store

        Args:
            type_: Type URI to check
            property_uri: Property URI to check
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            True if property exists, False otherwise
        """
        query = f"SELECT ?o WHERE {{ ?s a <{type_}> ; <{property_uri}> ?o .}} Limit 1"
        return bool(list(self.graph(named_graph).query(query)))

    def has_namespace(self, namespace: Namespace, named_graph: URIRef | None = None) -> bool:
        """Check if a namespace exists in the graph store

        Args:
            namespace: Namespace to check
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            True if namespace exists, False otherwise
        """
        # Only predicates are inspected: the namespace "exists" when at least
        # one predicate URI starts with it.
        query = f"ASK WHERE {{ ?s ?p ?o . FILTER(STRSTARTS(STR(?p), STR(<{namespace}>))) }}"
        return bool(self.graph(named_graph).query(query))

    def has_data(self) -> bool:
        """Check if the graph store has data"""
        # Executed against the dataset itself rather than a single graph.
        return cast(bool, next(iter(self.dataset.query("ASK WHERE { ?s ?p ?o }"))))

    def has_type(self, type_: URIRef, named_graph: URIRef | None = None) -> bool:
        """Check if a type exists in the graph store

        Args:
            type_: Type to check
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            True if type exists, False otherwise
        """
        query = f"ASK WHERE {{ ?s a <{type_}> }}"
        return bool(self.graph(named_graph).query(query))

    def describe(
        self,
        instance_id: URIRef,
        instance_type: URIRef | None = None,
        property_renaming_config: dict | None = None,
        named_graph: URIRef | None = None,
        remove_uri_namespace: bool = True,
    ) -> tuple[URIRef, dict[str | InstanceType, list[Any]]] | None:
        """DESCRIBE instance for a given class from the graph store

        Args:
            instance_id: Instance id for which we want to generate query
            instance_type: Type of the instance, default None (will be inferred from triples)
            property_renaming_config: Dictionary to rename properties, default None (no renaming)
            named_graph: Named graph to query over, default None (default graph)
            remove_uri_namespace: Whether to remove the namespace from the URI, by default True

        Returns:
            Tuple of the instance id and a dictionary of its property values
            (keyed by property name, plus ``RDF.type``), or None if no
            property values were collected
        """
        property_values: dict[str, list[str] | list[URIRef]] = defaultdict(list)
        triples = cast(list[ResultRow], self.graph(named_graph).query(f"DESCRIBE <{instance_id}>"))
        for _, predicate, object_ in triples:
            # Skip objects that only carry a textual "missing value" marker.
            if object_.lower() in ("", "none", "nan", "null"):
                continue

            value: Any
            if isinstance(object_, URIRef) and remove_uri_namespace:
                # These properties contain the space in the Namespace.
                value = remove_namespace_from_uri(object_, validation="prefix")
            elif isinstance(object_, URIRef):
                value = object_
            elif isinstance(object_, RdfLiteral):
                if object_.datatype == XSD._NS["json"]:
                    # For JSON literals, the .toPython() returns a Literal object.
                    value = str(object_)
                else:
                    value = object_.toPython()
            else:
                # It is a blank node
                value = str(object_)

            if predicate == RDF.type:
                # guarding against multiple rdf:type values as this is not allowed in CDF:
                # only the first rdf:type encountered is kept (or the explicit instance_type).
                if RDF.type not in property_values:
                    property_values[RDF.type].append(
                        remove_namespace_from_uri(instance_type, validation="prefix") if instance_type else value  # type: ignore[arg-type]
                    )
                continue

            property_ = remove_namespace_from_uri(predicate, validation="prefix")
            if property_renaming_config:
                # The renaming config is keyed by the full predicate URI.
                property_ = property_renaming_config.get(predicate, property_)
            property_values[property_].append(value)  # type: ignore[arg-type]

        if property_values:
            return (
                instance_id,
                property_values,
            )
        return None

    def list_triples(self, limit: int = 25, named_graph: URIRef | None = None) -> list[ResultRow]:
        """List triples in the graph store

        Args:
            limit: Max number of triples to return, by default 25
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            List of triples
        """
        query = f"SELECT ?subject ?predicate ?object WHERE {{ ?subject ?predicate ?object }} LIMIT {limit}"
        return cast(list[ResultRow], list(self.graph(named_graph).query(query)))

    @overload
    def list_types(
        self,
        remove_namespace: Literal[False] = False,
        limit: int | None = 25,
        named_graph: URIRef | None = None,
    ) -> list[ResultRow]: ...

    @overload
    def list_types(
        self,
        remove_namespace: Literal[True],
        limit: int | None = 25,
        named_graph: URIRef | None = None,
    ) -> list[str]: ...

    def list_types(
        self,
        remove_namespace: bool = False,
        limit: int | None = 25,
        named_graph: URIRef | None = None,
    ) -> list[ResultRow] | list[str]:
        """List types in the graph store

        Args:
            remove_namespace: Whether to remove the namespace from the type, by default False
            limit: Max number of types to return, by default 25; None means no limit
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            List of types
        """
        query = "SELECT DISTINCT ?type WHERE { ?subject a ?type }"
        if limit is not None:
            query += f" LIMIT {limit}"
        result = cast(list[ResultRow], list(self.graph(named_graph).query(query)))
        if remove_namespace:
            return [remove_namespace_from_uri(res[0]) for res in result]
        return result

    def multi_value_type_property(
        self,
        named_graph: URIRef | None = None,
    ) -> Iterable[tuple[URIRef, URIRef, list[URIRef]]]:
        """Find properties which have more than one distinct value type per source type

        The value type of an object is its datatype if it is a literal, its
        rdf:type if it has one, and NEAT.UnknownType otherwise.

        Args:
            named_graph: Named graph to query over, default None (default graph)

        Yields:
            Tuples of (source type URI, property URI, list of value type URIs)
        """
        # Braces are doubled because the template goes through str.format below.
        query = """SELECT ?sourceType ?property
                          (GROUP_CONCAT(DISTINCT STR(?valueType); SEPARATOR=",") AS ?valueTypes)

                   WHERE {{
                       ?s ?property ?o .
                       ?s a ?sourceType .
                       OPTIONAL {{ ?o a ?type }}

                       # Key part to determine value type: either object, data or unknown
                       BIND(   IF(isLiteral(?o),DATATYPE(?o),
                               IF(BOUND(?type), ?type,
                                   <{unknownType}>)) AS ?valueType)
                   }}

                   GROUP BY ?sourceType ?property
                   HAVING (COUNT(DISTINCT ?valueType) > 1)"""

        for (
            source_type,
            property_,
            value_types,
        ) in cast(
            ResultRow,
            self.graph(named_graph).query(query.format(unknownType=str(NEAT.UnknownType))),
        ):
            yield cast(URIRef, source_type), cast(URIRef, property_), [URIRef(uri) for uri in value_types.split(",")]

    def multi_type_instances(self, named_graph: URIRef | None = None) -> dict[str, list[str]]:
        """Find instances with multiple types

        Args:
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Dictionary mapping each instance (namespace removed) that has more
            than one rdf:type to its types (namespace removed)
        """

        query = """
        SELECT ?instance (GROUP_CONCAT(str(?type); SEPARATOR=",") AS ?types)
        WHERE {
            ?instance a ?type .
        }
        GROUP BY ?instance
        HAVING (COUNT(?type) > 1)
        """

        result = {}
        for instance, types in self.graph(named_graph).query(query):  # type: ignore
            result[remove_namespace_from_uri(instance)] = remove_namespace_from_uri(types.split(","))

        return result

    def count_of_type(self, class_uri: URIRef, named_graph: URIRef | None = None) -> int:
        """Count the instances of a given type

        Args:
            class_uri: Type for which instances are counted
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Number of instances of the type
        """
        query = f"SELECT (COUNT(?instance) AS ?instanceCount) WHERE {{ ?instance a <{class_uri}> }}"
        return int(next(iter(self.graph(named_graph).query(query)))[0])  # type: ignore[arg-type, index]

    def types_with_instance_and_property_count(
        self, remove_namespace: bool = True, named_graph: URIRef | None = None
    ) -> list[dict[str, Any]]:
        """List types with their distinct instance and distinct property counts

        Args:
            remove_namespace: Whether to remove the namespace from (and URL-unquote) the type, by default True
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            List of dictionaries with keys "type", "instanceCount" and
            "propertyCount", ordered by descending instance count
        """
        query = """
        SELECT ?type (COUNT(DISTINCT ?instance) AS ?instanceCount) (COUNT(DISTINCT ?property) AS ?propertyCount)
        WHERE {
            ?instance a ?type .
            ?instance ?property ?value .
            FILTER(?property != rdf:type)
        }
        GROUP BY ?type
        ORDER BY DESC(?instanceCount)"""
        return [
            {
                "type": urllib.parse.unquote(remove_namespace_from_uri(type_)) if remove_namespace else type_,
                "instanceCount": cast(RdfLiteral, instance_count).toPython(),
                "propertyCount": cast(RdfLiteral, property_count).toPython(),
            }
            for type_, instance_count, property_count in list(
                cast(list[ResultRow], self.graph(named_graph).query(query))
            )
        ]

    def properties_with_count(
        self, remove_namespace: bool = True, named_graph: URIRef | None = None
    ) -> list[dict[str, Any]]:
        """List properties per type with the number of instances that have them

        Args:
            remove_namespace: Whether to remove the namespace from (and URL-unquote) type and property,
                by default True
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            List of dictionaries with keys "type", "property", "instanceCount"
            (distinct instances of the type having the property) and "total"
            (instance count of the type), ordered by type and then property
        """
        # Keyed by the full type URI so it can be looked up with the raw query result below.
        instance_count_by_type = {
            entry["type"]: entry["instanceCount"]
            for entry in self.types_with_instance_and_property_count(remove_namespace=False, named_graph=named_graph)
        }
        query = """SELECT ?type ?property (COUNT(DISTINCT ?instance) AS ?instanceCount)
                   WHERE {
                       ?instance a ?type .
                       ?instance ?property ?value .
                       FILTER(?property != rdf:type)
                   }
                   GROUP BY ?type ?property
                   ORDER BY ASC(?type) ASC(?property)"""
        return [
            {
                "type": urllib.parse.unquote(remove_namespace_from_uri(type_)) if remove_namespace else type_,
                "property": urllib.parse.unquote(remove_namespace_from_uri(property_))
                if remove_namespace
                else property_,
                "instanceCount": cast(RdfLiteral, instance_count).toPython(),
                "total": instance_count_by_type[type_],
            }
            for type_, property_, instance_count in list(cast(list[ResultRow], self.graph(named_graph).query(query)))
        ]

    @overload
    def instances_with_properties(
        self, type: URIRef, remove_namespace: Literal[False], named_graph: URIRef | None = None
    ) -> dict[URIRef, set[URIRef]]: ...

    @overload
    def instances_with_properties(
        self, type: URIRef, remove_namespace: Literal[True] = True, named_graph: URIRef | None = None
    ) -> dict[str, set[str]]: ...

    def instances_with_properties(
        self, type: URIRef, remove_namespace: bool = True, named_graph: URIRef | None = None
    ) -> dict[str, set[str]] | dict[URIRef, set[URIRef]]:
        """Find the properties set on each instance of a given type

        Args:
            type: Type whose instances are inspected
            remove_namespace: Whether to remove the namespace from (and URL-unquote) instance and property,
                by default True
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Dictionary mapping each instance to the set of its properties (rdf:type excluded)
        """
        # Braces are doubled because the template goes through str.format below.
        query = """SELECT DISTINCT ?instance ?property
                   WHERE {{
                       ?instance a <{type}> .
                       ?instance ?property ?value .
                       FILTER(?property != rdf:type)
                   }}"""
        result = defaultdict(set)
        for instance, property_ in cast(Iterable[ResultRow], self.graph(named_graph).query(query.format(type=type))):
            instance_str = urllib.parse.unquote(remove_namespace_from_uri(instance)) if remove_namespace else instance
            property_str = urllib.parse.unquote(remove_namespace_from_uri(property_)) if remove_namespace else property_
            result[instance_str].add(property_str)
        return result

    def list_instances_ids_by_space(
        self, space_property: URIRef, named_graph: URIRef | None = None
    ) -> Iterable[tuple[URIRef, str]]:
        """Returns instance ids by space

        Args:
            space_property: Property whose value holds the space of an instance
            named_graph: Named graph to query over, default None (default graph)

        Yields:
            Tuples of (instance URI, space as string)
        """
        query = f"""SELECT DISTINCT ?instance ?space
                   WHERE {{?instance <{space_property}> ?space}}"""

        for result in cast(Iterable[ResultRow], self.graph(named_graph).query(query)):
            instance_id, space = cast(tuple[URIRef, URIRef | RdfLiteral], result)
            if isinstance(space, URIRef):
                yield instance_id, remove_namespace_from_uri(space)
            elif isinstance(space, RdfLiteral):
                yield instance_id, str(space.toPython())
            else:
                yield instance_id, str(space)
@@ -0,0 +1,37 @@
1
+ from rdflib import Dataset, URIRef
2
+
3
+ from cognite.neat._utils.rdf_ import remove_instance_ids_in_batch
4
+
5
+ from ._base import BaseQuery
6
+ from ._select import SelectQueries
7
+
8
+
9
class UpdateQueries(BaseQuery):
    """This class holds a set of SPARQL queries which are updating triples in the knowledge graph.
    The update queries are executed against update endpoint, and typically start with UPDATE statement
    """

    def __init__(self, read: SelectQueries, dataset: Dataset, default_named_graph: URIRef | None = None) -> None:
        """Create the update queries

        Args:
            read: Select queries used to look up the instances affected by an update
            dataset: Dataset passed on to the base query class
            default_named_graph: Default named graph passed on to the base query class
        """
        super().__init__(dataset, default_named_graph)
        self._read = read

    def drop_types(
        self,
        type_: list[URIRef],
        named_graph: URIRef | None = None,
    ) -> dict[URIRef, int]:
        """Drop types from the graph store

        Args:
            type_: List of types to drop
            named_graph: Named graph to query over, default None (default graph)

        Returns:
            Dictionary of dropped types, mapping each type to the number of
            instances that were listed for removal
        """
        dropped_types: dict[URIRef, int] = {}
        for t in type_:
            # Look the instances up in the same named graph they are removed
            # from; otherwise ids (and counts) would come from the default graph.
            instance_ids = list(self._read.list_instances_ids(t, named_graph=named_graph))
            dropped_types[t] = len(instance_ids)
            remove_instance_ids_in_batch(self.graph(named_graph), instance_ids)
        return dropped_types