cognite-neat 0.106.0__py3-none-any.whl → 0.107.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-neat might be problematic. Click here for more details.

Files changed (47) hide show
  1. cognite/neat/_graph/extractors/__init__.py +5 -1
  2. cognite/neat/_graph/extractors/_base.py +32 -0
  3. cognite/neat/_graph/extractors/_classic_cdf/_base.py +16 -3
  4. cognite/neat/_graph/extractors/_classic_cdf/_classic.py +74 -7
  5. cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +2 -0
  6. cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +8 -1
  7. cognite/neat/_graph/extractors/_dms.py +48 -14
  8. cognite/neat/_graph/extractors/_dms_graph.py +149 -0
  9. cognite/neat/_graph/extractors/_rdf_file.py +32 -5
  10. cognite/neat/_graph/loaders/_rdf2dms.py +112 -18
  11. cognite/neat/_graph/queries/_construct.py +1 -1
  12. cognite/neat/_graph/transformers/__init__.py +5 -0
  13. cognite/neat/_graph/transformers/_base.py +9 -1
  14. cognite/neat/_graph/transformers/_classic_cdf.py +90 -3
  15. cognite/neat/_graph/transformers/_rdfpath.py +3 -3
  16. cognite/neat/_graph/transformers/_value_type.py +54 -44
  17. cognite/neat/_rules/analysis/_base.py +1 -1
  18. cognite/neat/_rules/analysis/_information.py +14 -13
  19. cognite/neat/_rules/catalog/__init__.py +1 -0
  20. cognite/neat/_rules/catalog/classic_model.xlsx +0 -0
  21. cognite/neat/_rules/catalog/info-rules-imf.xlsx +0 -0
  22. cognite/neat/_rules/importers/_dms2rules.py +7 -5
  23. cognite/neat/_rules/importers/_rdf/_inference2rules.py +1 -1
  24. cognite/neat/_rules/models/_base_rules.py +0 -12
  25. cognite/neat/_rules/models/_types.py +5 -0
  26. cognite/neat/_rules/models/dms/_rules.py +50 -2
  27. cognite/neat/_rules/models/information/_rules.py +48 -5
  28. cognite/neat/_rules/models/information/_rules_input.py +1 -1
  29. cognite/neat/_rules/models/mapping/_classic2core.py +4 -5
  30. cognite/neat/_rules/transformers/__init__.py +4 -0
  31. cognite/neat/_rules/transformers/_converters.py +209 -62
  32. cognite/neat/_session/_base.py +2 -6
  33. cognite/neat/_session/_mapping.py +17 -6
  34. cognite/neat/_session/_prepare.py +0 -47
  35. cognite/neat/_session/_read.py +63 -5
  36. cognite/neat/_session/_state.py +7 -0
  37. cognite/neat/_session/_to.py +40 -2
  38. cognite/neat/_session/exceptions.py +7 -3
  39. cognite/neat/_store/_graph_store.py +52 -11
  40. cognite/neat/_store/_rules_store.py +22 -0
  41. cognite/neat/_utils/auth.py +2 -0
  42. cognite/neat/_version.py +1 -1
  43. {cognite_neat-0.106.0.dist-info → cognite_neat-0.107.0.dist-info}/METADATA +2 -2
  44. {cognite_neat-0.106.0.dist-info → cognite_neat-0.107.0.dist-info}/RECORD +47 -45
  45. {cognite_neat-0.106.0.dist-info → cognite_neat-0.107.0.dist-info}/WHEEL +1 -1
  46. {cognite_neat-0.106.0.dist-info → cognite_neat-0.107.0.dist-info}/LICENSE +0 -0
  47. {cognite_neat-0.106.0.dist-info → cognite_neat-0.107.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,7 @@
1
+ import zipfile
1
2
  from collections.abc import Iterable
2
3
  from pathlib import Path
3
- from typing import get_args
4
+ from typing import cast, get_args
4
5
 
5
6
  from rdflib import URIRef
6
7
  from rdflib.util import guess_format
@@ -10,6 +11,7 @@ from cognite.neat._graph._shared import RDFTypes
10
11
  from cognite.neat._graph.extractors._base import BaseExtractor
11
12
  from cognite.neat._issues._base import IssueList
12
13
  from cognite.neat._issues.errors import FileNotFoundNeatError, FileTypeUnexpectedError
14
+ from cognite.neat._issues.errors._general import NeatValueError
13
15
  from cognite.neat._shared import Triple
14
16
 
15
17
 
@@ -24,25 +26,50 @@ class RdfFileExtractor(BaseExtractor):
24
26
 
25
27
  def __init__(
26
28
  self,
27
- filepath: Path,
29
+ filepath: Path | zipfile.ZipExtFile,
28
30
  base_uri: URIRef = DEFAULT_BASE_URI,
29
31
  issue_list: IssueList | None = None,
30
32
  ):
31
33
  self.issue_list = issue_list or IssueList(title=f"{filepath.name}")
32
34
  self.base_uri = base_uri
33
35
  self.filepath = filepath
34
- self.format = guess_format(str(self.filepath))
35
36
 
36
- if not self.filepath.exists():
37
+ self.format = guess_format(str(self.filepath) if isinstance(self.filepath, Path) else self.filepath.name)
38
+
39
+ print(self.format)
40
+ if isinstance(self.filepath, Path) and not self.filepath.exists():
37
41
  self.issue_list.append(FileNotFoundNeatError(self.filepath))
38
42
 
39
43
  if not self.format:
40
44
  self.issue_list.append(
41
45
  FileTypeUnexpectedError(
42
- self.filepath,
46
+ (self.filepath if isinstance(self.filepath, Path) else Path(self.filepath.name)),
43
47
  frozenset(get_args(RDFTypes)),
44
48
  )
45
49
  )
46
50
 
47
51
  def extract(self) -> Iterable[Triple]:
48
52
  raise NotImplementedError()
53
+
54
@classmethod
def from_zip(
    cls,
    filepath: Path,
    filename: str = "neat-session/instances/instances.ttl",
    base_uri: URIRef = DEFAULT_BASE_URI,
    issue_list: IssueList | None = None,
):
    """Create an extractor that reads a single member of a zip archive.

    Args:
        filepath: Path to the archive; it must exist and have the ``.zip`` suffix.
        filename: Name of the archive member to open.
        base_uri: Forwarded to the constructor.
        issue_list: Forwarded to the constructor.

    Returns:
        An instance of ``cls`` constructed with the opened archive member.

    Raises:
        FileNotFoundNeatError: If ``filepath`` does not exist.
        NeatValueError: If ``filepath`` does not end in ``.zip`` or the archive
            has no member named ``filename``.
    """
    if not filepath.exists():
        raise FileNotFoundNeatError(filepath)
    if filepath.suffix not in {".zip"}:
        # Must be an f-string, otherwise the placeholder is shown verbatim to the user.
        raise NeatValueError(f"Expected a zip file, got {filepath.suffix}")

    with zipfile.ZipFile(filepath, "r") as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.filename == filename:
                # The member is opened here and handed to the extractor, which is
                # expected to close it once the triples have been extracted.
                file = zip_ref.open(file_info)
                return cls(cast(zipfile.ZipExtFile, file), base_uri, issue_list)

    raise NeatValueError(f"Cannot extract {filename} from zip file {filepath}")
@@ -32,7 +32,7 @@ from cognite.neat._issues.errors import (
32
32
  from cognite.neat._issues.warnings import PropertyDirectRelationLimitWarning, PropertyTypeNotSupportedWarning
33
33
  from cognite.neat._rules.analysis._dms import DMSAnalysis
34
34
  from cognite.neat._rules.models import DMSRules
35
- from cognite.neat._rules.models.data_types import _DATA_TYPE_BY_DMS_TYPE, Json
35
+ from cognite.neat._rules.models.data_types import _DATA_TYPE_BY_DMS_TYPE, Json, String
36
36
  from cognite.neat._rules.models.entities._single_value import ViewEntity
37
37
  from cognite.neat._shared import InstanceType
38
38
  from cognite.neat._store import NeatGraphStore
@@ -159,7 +159,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
159
159
 
160
160
  tracker = self._tracker(type(self).__name__, view_ids, "views")
161
161
  for view_id, (view, instance_count) in view_and_count_by_id.items():
162
- pydantic_cls, edge_by_type, issues = self._create_validation_classes(view) # type: ignore[var-annotated]
162
+ pydantic_cls, edge_by_type, edge_by_prop_id, issues = self._create_validation_classes(view) # type: ignore[var-annotated]
163
163
  yield from issues
164
164
  tracker.issue(issues)
165
165
 
@@ -200,20 +200,61 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
200
200
  )
201
201
 
202
202
  for identifier, properties in instance_iterable:
203
+ start_node, end_node = self._pop_start_end_node(properties)
204
+ is_edge = start_node and end_node
205
+ if (is_edge and view.used_for == "node") or (not is_edge and view.used_for == "edge"):
206
+ instance_type = "edge" if is_edge else "node"
207
+ creation_error = ResourceCreationError(
208
+ identifier,
209
+ instance_type,
210
+ error=f"{instance_type.capitalize()} found in {view.used_for} view",
211
+ )
212
+ tracker.issue(creation_error)
213
+ if stop_on_exception:
214
+ raise creation_error
215
+ yield creation_error
216
+ continue
217
+
203
218
  if skip_properties:
204
219
  properties = {k: v for k, v in properties.items() if k not in skip_properties}
205
- try:
206
- yield self._create_node(identifier, properties, pydantic_cls, view_id)
207
- except ValueError as e:
208
- error_node = ResourceCreationError(identifier, "node", error=str(e))
209
- tracker.issue(error_node)
210
- if stop_on_exception:
211
- raise error_node from e
212
- yield error_node
213
- yield from self._create_edges(identifier, properties, edge_by_type, tracker)
220
+
221
+ if start_node and end_node:
222
+ # Is an edge
223
+ try:
224
+ yield self._create_edge_with_properties(
225
+ identifier, properties, start_node, end_node, pydantic_cls, view_id
226
+ )
227
+ except ValueError as e:
228
+ error_edge = ResourceCreationError(identifier, "edge", error=str(e))
229
+ tracker.issue(error_edge)
230
+ if stop_on_exception:
231
+ raise error_edge from e
232
+ yield error_edge
233
+ else:
234
+ try:
235
+ yield self._create_node(identifier, properties, pydantic_cls, view_id)
236
+ except ValueError as e:
237
+ error_node = ResourceCreationError(identifier, "node", error=str(e))
238
+ tracker.issue(error_node)
239
+ if stop_on_exception:
240
+ raise error_node from e
241
+ yield error_node
242
+ yield from self._create_edges_without_properties(
243
+ identifier, properties, edge_by_type, edge_by_prop_id, tracker
244
+ )
214
245
  tracker.finish(track_id)
215
246
  yield _END_OF_CLASS
216
247
 
248
+ @staticmethod
249
+ def _pop_start_end_node(properties: dict[str | InstanceType, list[str]]) -> tuple[str | None, str | None]:
250
+ start_node = properties.pop("startNode", [None])[0]
251
+ if not start_node:
252
+ start_node = properties.pop("start_node", [None])[0]
253
+ end_node = properties.pop("endNode", [None])[0]
254
+ if not end_node:
255
+ end_node = properties.pop("end_node", [None])[0]
256
+ return start_node, end_node
257
+
217
258
  def write_to_file(self, filepath: Path) -> None:
218
259
  if filepath.suffix not in [".json", ".yaml", ".yml"]:
219
260
  raise ValueError(f"File format {filepath.suffix} is not supported")
@@ -298,17 +339,30 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
298
339
 
299
340
  def _create_validation_classes(
300
341
  self, view: dm.View
301
- ) -> tuple[type[BaseModel], dict[str, tuple[str, dm.EdgeConnection]], NeatIssueList]:
342
+ ) -> tuple[
343
+ type[BaseModel],
344
+ dict[str, tuple[str, dm.EdgeConnection]],
345
+ dict[str, tuple[str, dm.EdgeConnection]],
346
+ NeatIssueList,
347
+ ]:
302
348
  issues = IssueList()
303
349
  field_definitions: dict[str, tuple[type, Any]] = {}
304
- edge_by_property: dict[str, tuple[str, dm.EdgeConnection]] = {}
350
+ edge_by_type: dict[str, tuple[str, dm.EdgeConnection]] = {}
351
+ edge_by_prop_id: dict[str, tuple[str, dm.EdgeConnection]] = {}
305
352
  validators: dict[str, classmethod] = {}
306
353
  direct_relation_by_property: dict[str, dm.DirectRelation] = {}
307
354
  unit_properties: list[str] = []
308
355
  json_fields: list[str] = []
356
+ text_fields: list[str] = []
309
357
  for prop_id, prop in view.properties.items():
310
358
  if isinstance(prop, dm.EdgeConnection):
311
- edge_by_property[prop_id] = prop_id, prop
359
+ if prop.edge_source:
360
+ # Edges with properties are created separately
361
+ continue
362
+
363
+ edge_by_type[prop.type.external_id] = prop_id, prop
364
+ edge_by_prop_id[prop_id] = prop_id, prop
365
+
312
366
  if isinstance(prop, dm.MappedProperty):
313
367
  if is_readonly_property(prop.container, prop.container_property_identifier):
314
368
  continue
@@ -334,6 +388,8 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
334
388
 
335
389
  if data_type == Json:
336
390
  json_fields.append(prop_id)
391
+ elif data_type == String:
392
+ text_fields.append(prop_id)
337
393
  python_type = data_type.python
338
394
  if isinstance(prop.type, ListablePropertyType) and prop.type.is_list:
339
395
  python_type = list[python_type]
@@ -369,11 +425,20 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
369
425
  else:
370
426
  raise ValueError(f"Expect valid JSON string or dict for {info.field_name}: {value}")
371
427
 
428
+ def parse_text(cls, value: Any, info: ValidationInfo) -> Any:
429
+ if isinstance(value, list):
430
+ return [remove_namespace_from_uri(v) for v in value]
431
+ else:
432
+ return remove_namespace_from_uri(value)
433
+
372
434
  if json_fields:
373
435
  validators["parse_json_string"] = field_validator(*json_fields, mode="before")(parse_json_string) # type: ignore[assignment, arg-type]
374
436
 
375
437
  validators["parse_list"] = field_validator("*", mode="before")(parse_list) # type: ignore[assignment, arg-type]
376
438
 
439
+ if text_fields:
440
+ validators["parse_text"] = field_validator(*text_fields, mode="before")(parse_text) # type: ignore[assignment, arg-type]
441
+
377
442
  if direct_relation_by_property:
378
443
 
379
444
  def parse_direct_relation(cls, value: list, info: ValidationInfo) -> dict | list[dict]:
@@ -414,7 +479,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
414
479
  )
415
480
 
416
481
  pydantic_cls = create_model(view.external_id, __validators__=validators, **field_definitions) # type: ignore[arg-type, call-overload]
417
- return pydantic_cls, edge_by_property, issues
482
+ return pydantic_cls, edge_by_type, edge_by_prop_id, issues
418
483
 
419
484
  def _create_node(
420
485
  self,
@@ -435,17 +500,46 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
435
500
  ],
436
501
  )
437
502
 
438
- def _create_edges(
503
def _create_edge_with_properties(
    self,
    identifier: str,
    properties: dict[str | InstanceType, list[str]],
    start_node: str,
    end_node: str,
    pydantic_cls: type[BaseModel],
    view_id: dm.ViewId,
) -> dm.EdgeApply:
    """Build an ``EdgeApply`` whose properties are validated by ``pydantic_cls``.

    The ``RDF.type`` entry is removed from ``properties`` before validation. The
    edge type reference is built from ``view_id``; the start and end nodes are
    referenced in ``self.instance_space``.

    Raises:
        ValueError: If ``properties`` had no ``RDF.type`` entry (checked after
            validation, which may itself raise).
    """
    rdf_type = properties.pop(RDF.type, [None])[0]
    validated = pydantic_cls.model_validate(properties)
    if rdf_type is None:
        raise ValueError(f"Missing type for edge {identifier}")

    instance_space = self.instance_space
    edge_type = dm.DirectRelationReference(view_id.space, view_id.external_id)
    start_ref = dm.DirectRelationReference(instance_space, start_node)
    end_ref = dm.DirectRelationReference(instance_space, end_node)
    # Only the fields that were explicitly set on the model are written.
    set_properties = dict(validated.model_dump(exclude_unset=True).items())

    return dm.EdgeApply(
        space=instance_space,
        external_id=identifier,
        type=edge_type,
        start_node=start_ref,
        end_node=end_ref,
        sources=[dm.NodeOrEdgeData(source=view_id, properties=set_properties)],
    )
527
+
528
+ def _create_edges_without_properties(
439
529
  self,
440
530
  identifier: str,
441
531
  properties: dict[str, list[str]],
442
532
  edge_by_type: dict[str, tuple[str, dm.EdgeConnection]],
533
+ edge_by_prop_id: dict[str, tuple[str, dm.EdgeConnection]],
443
534
  tracker: Tracker,
444
535
  ) -> Iterable[dm.EdgeApply | NeatIssue]:
445
536
  for predicate, values in properties.items():
446
- if predicate not in edge_by_type:
537
+ if predicate in edge_by_type:
538
+ prop_id, edge = edge_by_type[predicate]
539
+ elif predicate in edge_by_prop_id:
540
+ prop_id, edge = edge_by_prop_id[predicate]
541
+ else:
447
542
  continue
448
- prop_id, edge = edge_by_type[predicate]
449
543
  if isinstance(edge, SingleEdgeConnection) and len(values) > 1:
450
544
  error = ResourceDuplicatedError(
451
545
  resource_type="edge",
@@ -106,7 +106,7 @@ def to_construct_triples(
106
106
  non_inherited_starting_rdf_types = []
107
107
 
108
108
  for transformation in transformations:
109
- traversal = cast(RDFPath, transformation.transformation).traversal
109
+ traversal = cast(RDFPath, transformation.instance_source).traversal
110
110
 
111
111
  # keeping track of starting rdf types of non-inherited transformations/properties
112
112
  if isinstance(traversal, Traversal) and not transformation.inherited:
@@ -1,3 +1,4 @@
1
+ from ._base import BaseTransformerStandardised
1
2
  from ._classic_cdf import (
2
3
  AddAssetDepth,
3
4
  AssetEventConnector,
@@ -5,6 +6,7 @@ from ._classic_cdf import (
5
6
  AssetRelationshipConnector,
6
7
  AssetSequenceConnector,
7
8
  AssetTimeSeriesConnector,
9
+ LookupRelationshipSourceTarget,
8
10
  RelationshipAsEdgeTransformer,
9
11
  )
10
12
  from ._prune_graph import (
@@ -29,6 +31,7 @@ __all__ = [
29
31
  "ConnectionToLiteral",
30
32
  "ConvertLiteral",
31
33
  "LiteralToEntity",
34
+ "LookupRelationshipSourceTarget",
32
35
  "MakeConnectionOnExactMatch",
33
36
  "PruneDanglingNodes",
34
37
  "PruneDeadEndEdges",
@@ -57,4 +60,6 @@ Transformers = (
57
60
  | ConvertLiteral
58
61
  | LiteralToEntity
59
62
  | ConnectionToLiteral
63
+ | BaseTransformerStandardised
64
+ | LookupRelationshipSourceTarget
60
65
  )
@@ -1,6 +1,7 @@
1
1
  import dataclasses
2
2
  import warnings
3
3
  from abc import ABC, abstractmethod
4
+ from collections.abc import Iterator
4
5
  from typing import ClassVar, TypeAlias, cast
5
6
 
6
7
  from rdflib import Graph
@@ -65,9 +66,16 @@ class BaseTransformerStandardised(ABC):
65
66
  The query to use for extracting target triples from the graph and performing the transformation.
66
67
  Returns:
67
68
  A query string.
69
+
70
+ !!! note "Complex Queries"
71
+ In majority of cases the query should be a simple SELECT query. However, in case
72
+ when there is a need to have one or more sub iterators, one can overwrite the ._iterator() method
68
73
  """
69
74
  raise NotImplementedError()
70
75
 
76
+ def _iterator(self, graph: Graph) -> Iterator:
77
+ yield from graph.query(self._iterate_query())
78
+
71
79
  def _skip_count_query(self) -> str:
72
80
  """
73
81
  The query to use for extracting target triples from the graph and performing the transformation.
@@ -97,7 +105,7 @@ class BaseTransformerStandardised(ABC):
97
105
  if iteration_count == 0:
98
106
  return outcome
99
107
 
100
- result_iterable = graph.query(self._iterate_query())
108
+ result_iterable = self._iterator(graph)
101
109
  result_iterable = iterate_progress_bar_if_above_config_threshold(
102
110
  result_iterable, iteration_count, self.description
103
111
  )
@@ -1,6 +1,7 @@
1
+ import urllib.parse
1
2
  import warnings
2
3
  from abc import ABC
3
- from collections.abc import Callable, Iterable
4
+ from collections.abc import Callable, Iterable, Iterator
4
5
  from functools import lru_cache
5
6
  from typing import cast
6
7
 
@@ -9,6 +10,7 @@ from rdflib.query import ResultRow
9
10
 
10
11
  from cognite.neat._constants import CLASSIC_CDF_NAMESPACE, DEFAULT_NAMESPACE
11
12
  from cognite.neat._graph import extractors
13
+ from cognite.neat._issues.errors import NeatValueError
12
14
  from cognite.neat._issues.warnings import ResourceNotFoundWarning
13
15
  from cognite.neat._utils.collection_ import iterate_progress_bar
14
16
  from cognite.neat._utils.rdf_ import (
@@ -229,7 +231,6 @@ class AssetEventConnector(BaseAssetConnector):
229
231
  )
230
232
 
231
233
 
232
- # TODO: standardise
233
234
  class AssetRelationshipConnector(BaseTransformerStandardised):
234
235
  description: str = "Connects assets via relationships"
235
236
  _use_only_once: bool = True
@@ -465,7 +466,7 @@ WHERE {{
465
466
  ResourceNotFoundWarning(target_source_id, "class", str(relationship_id), "class"), stacklevel=2
466
467
  )
467
468
  return []
468
- edge_id = str(object_by_predicates["externalId"])
469
+ edge_id = urllib.parse.quote(str(object_by_predicates["externalId"]))
469
470
  # If there is properties on the relationship, we create a new intermediate node
470
471
  edge_type = self._namespace[f"{source_type}To{target_type}Edge"]
471
472
  return self._create_edge(
@@ -516,3 +517,89 @@ WHERE {{
516
517
 
517
518
  def _predicate(self, target_type: str) -> URIRef:
518
519
  return self._namespace[f"relationship{target_type.capitalize()}"]
520
+
521
+
522
class LookupRelationshipSourceTarget(BaseTransformerStandardised):
    """Replace relationship source/target external-id literals with entity references.

    When relationships are extracted, the source and target are extracted as literals.
    This transformer looks up the entity that has the matching externalId and type,
    and replaces each literal with the URIRef of that entity.
    """

    description = "Lookup relationships source and target externalId"
    _use_only_once: bool = True
    _need_changes = frozenset({extractors.RelationshipsExtractor.__name__})

    # ``{external_id}`` receives an already quoted and escaped SPARQL literal,
    # so the template itself must not add quotes around it.
    _lookup_entity_query = """SELECT ?entity
    WHERE {{
        ?entity a <{entity_type}> .
        ?entity <{namespace}externalId> {external_id} .
    }}"""

    def __init__(self, namespace: Namespace = CLASSIC_CDF_NAMESPACE, type_prefix: str | None = None) -> None:
        """Store the namespace and optional type prefix used when building queries.

        Args:
            namespace: Namespace of the relationship type and its predicates.
            type_prefix: When set, prepended to the local name of the source/target
                type before looking up the entity.
        """
        self._namespace = namespace
        self._type_prefix = type_prefix
        # Created in ``_iterator`` because the lookup is bound to a specific graph.
        self._lookup_entity: Callable[[URIRef, str], URIRef] | None = None

    def _count_query(self) -> str:
        """Return the query counting all ``ClassicRelationship`` instances."""
        return f"""SELECT (COUNT(?instance) AS ?instanceCount)
        WHERE {{
            ?instance a <{self._namespace}ClassicRelationship> .
        }}"""

    def _iterate_query(self) -> str:
        """Return the query selecting each relationship with its source/target id and type."""
        return f"""SELECT ?instance ?source ?sourceType ?target ?targetType
        WHERE {{
            ?instance a <{self._namespace}ClassicRelationship> .
            ?instance <{self._namespace}sourceExternalId> ?source .
            ?instance <{self._namespace}targetExternalId> ?target .
            ?instance <{self._namespace}sourceType> ?sourceType .
            ?instance <{self._namespace}targetType> ?targetType
        }}"""

    def _iterator(self, graph: Graph) -> Iterator:
        """Bind a fresh cached lookup to ``graph``, then yield the rows of ``_iterate_query()``."""
        self._lookup_entity = self.create_lookup_entity_with_external_id(graph, self._namespace, self._type_prefix)
        yield from graph.query(self._iterate_query())

    def operation(self, query_result_row: ResultRow) -> RowTransformationOutput:
        """Swap the source/target literals of one relationship for the matching entities.

        If either entity cannot be resolved, a ``ResourceNotFoundWarning`` is issued
        and an empty output is returned, leaving the relationship untouched.

        Raises:
            NeatValueError: If the lookup has not been created yet, i.e. ``_iterator``
                has not run.
        """
        output = RowTransformationOutput()
        instance, source, source_type, target, target_type = cast(
            tuple[URIRef, Literal, URIRef, Literal, URIRef], query_result_row
        )
        if self._lookup_entity is None:
            raise NeatValueError(f"{type(self)}: .operation() called before .transform()")
        try:
            source_id = self._lookup_entity(source_type, source.toPython())
        except ValueError:
            warnings.warn(ResourceNotFoundWarning(source, "class", str(instance), "class"), stacklevel=2)
            return output

        try:
            target_id = self._lookup_entity(target_type, target.toPython())
        except ValueError:
            warnings.warn(ResourceNotFoundWarning(target, "class", str(instance), "class"), stacklevel=2)
            return output

        output.remove_triples.append((instance, self._namespace.sourceExternalId, source))
        output.remove_triples.append((instance, self._namespace.targetExternalId, target))
        output.add_triples.append((instance, self._namespace.sourceExternalId, source_id))
        output.add_triples.append((instance, self._namespace.targetExternalId, target_id))
        output.instances_modified_count += 1
        return output

    @staticmethod
    def create_lookup_entity_with_external_id(
        graph: Graph, namespace: Namespace, type_prefix: str | None
    ) -> Callable[[URIRef, str], URIRef]:
        """Return a cached function resolving ``(entity_type, external_id)`` to an entity in ``graph``.

        The returned function raises ``ValueError`` unless exactly one entity matches.
        """

        @lru_cache(maxsize=10_000)
        def lookup_entity_with_external_id(entity_type: URIRef, external_id: str) -> URIRef:
            if type_prefix:
                entity_type = namespace[type_prefix + remove_namespace_from_uri(entity_type)]

            # Serialise the id as a plain string literal so that quotes, backslashes
            # and newlines in an external id cannot break or alter the query.
            query = LookupRelationshipSourceTarget._lookup_entity_query.format(
                namespace=namespace, entity_type=entity_type, external_id=Literal(str(external_id)).n3()
            )
            result = list(graph.query(query))
            if len(result) == 1:
                return cast(URIRef, result[0][0])  # type: ignore[index]
            raise ValueError(f"Could not find entity with external_id {external_id} and type {entity_type}")

        return lookup_entity_with_external_id
@@ -35,8 +35,8 @@ class AddSelfReferenceProperty(BaseTransformer):
35
35
 
36
36
  def transform(self, graph: Graph) -> None:
37
37
  for property_ in self.properties:
38
- prefix = property_.transformation.traversal.class_.prefix
39
- suffix = property_.transformation.traversal.class_.suffix
38
+ prefix = property_.instance_source.traversal.class_.prefix
39
+ suffix = property_.instance_source.traversal.class_.suffix
40
40
 
41
41
  namespace = self.rules.prefixes[prefix] if prefix in self.rules.prefixes else self.rules.metadata.namespace
42
42
 
@@ -54,7 +54,7 @@ class AddSelfReferenceProperty(BaseTransformer):
54
54
  property_=f"{self.rules.metadata.prefix}:{property_.property_}",
55
55
  )
56
56
 
57
- property_.transformation = RDFPath(traversal=traversal)
57
+ property_.instance_source = RDFPath(traversal=traversal)
58
58
 
59
59
 
60
60
  class MakeConnectionOnExactMatch(BaseTransformerStandardised):
@@ -1,23 +1,21 @@
1
1
  import warnings
2
- from collections.abc import Callable
2
+ from collections.abc import Callable, Iterator
3
3
  from typing import Any, cast
4
4
  from urllib.parse import quote
5
5
 
6
6
  import rdflib
7
- from rdflib import RDF, XSD, Graph, Namespace, URIRef
7
+ from rdflib import RDF, Namespace, URIRef
8
8
  from rdflib.query import ResultRow
9
9
 
10
10
  from cognite.neat._constants import UNKNOWN_TYPE
11
- from cognite.neat._graph.queries import Queries
12
11
  from cognite.neat._issues.warnings import PropertyDataTypeConversionWarning
13
12
  from cognite.neat._utils.auxiliary import string_to_ideal_type
14
- from cognite.neat._utils.rdf_ import get_namespace, remove_namespace_from_uri
13
+ from cognite.neat._utils.rdf_ import Triple, get_namespace, remove_namespace_from_uri
15
14
 
16
- from ._base import BaseTransformer, BaseTransformerStandardised, RowTransformationOutput
15
+ from ._base import BaseTransformerStandardised, RowTransformationOutput
17
16
 
18
17
 
19
- # TODO: Standardise
20
- class SplitMultiValueProperty(BaseTransformer):
18
+ class SplitMultiValueProperty(BaseTransformerStandardised):
21
19
  description: str = (
22
20
  "SplitMultiValueProperty is a transformer that splits a "
23
21
  "multi-value property into multiple single-value properties."
@@ -25,55 +23,67 @@ class SplitMultiValueProperty(BaseTransformer):
25
23
  _use_only_once: bool = True
26
24
  _need_changes = frozenset({})
27
25
 
28
- _object_property_template: str = """SELECT ?s ?o WHERE{{
26
+ def __init__(self, unknown_type: URIRef | None = None) -> None:
27
+ self.unknown_type = unknown_type or UNKNOWN_TYPE
29
28
 
30
- ?s a <{subject_uri}> .
31
- ?s <{property_uri}> ?o .
32
- ?o a <{object_uri}> .
29
+ def _iterate_query(self) -> str:
30
+ query = """SELECT ?subjectType ?property
31
+ (GROUP_CONCAT(DISTINCT STR(?valueType); SEPARATOR=",") AS ?valueTypes)
32
+
33
+ WHERE {{
34
+ ?s ?property ?o .
35
+ ?s a ?subjectType .
36
+ OPTIONAL {{ ?o a ?type }}
37
+
38
+ # Key part to determine value type: either object, data or unknown
39
+ BIND( IF(isLiteral(?o),DATATYPE(?o),
40
+ IF(BOUND(?type), ?type,
41
+ <{unknownType}>)) AS ?valueType)
42
+ }}
43
+
44
+ GROUP BY ?subjectType ?property
45
+ HAVING (COUNT(DISTINCT ?valueType) > 1)"""
46
+
47
+ return query.format(unknownType=self.unknown_type)
48
+
49
+ def _count_query(self) -> str:
50
+ query = """SELECT (COUNT(*) AS ?tripleCount)
51
+ WHERE {?s ?p ?o .}"""
52
+ return query
53
+
54
+ def _sub_iterate_query(self, type_: URIRef, property_: URIRef) -> str:
55
+ query = """ SELECT ?s ?p ?o ?valueType WHERE {{
56
+ ?s a <{subject_uri}> .
57
+ ?s <{property_uri}> ?o .
33
58
 
34
- }}"""
59
+ OPTIONAL {{ ?o a ?type }}
35
60
 
36
- _datatype_property_template: str = """SELECT ?s ?o WHERE {{
61
+ BIND(<{property_uri}> AS ?p)
37
62
 
38
- ?s a <{subject_uri}> .
39
- ?s <{property_uri}> ?o .
40
- FILTER (datatype(?o) = <{object_uri}>)
63
+ BIND(IF(isLiteral(?o), DATATYPE(?o),
64
+ IF(BOUND(?type),?type,
65
+ <{unknownType}>)) AS ?valueType)
41
66
 
42
- }}"""
67
+ }} """
43
68
 
44
- _unknown_property_template: str = """SELECT ?s ?o WHERE {{
69
+ return query.format(unknownType=self.unknown_type, subject_uri=type_, property_uri=property_)
45
70
 
46
- ?s a <{subject_uri}> .
47
- ?s <{property_uri}> ?o .
48
- FILTER NOT EXISTS {{ ?o a ?objectType }}
49
- }}"""
71
+ def _iterator(self, graph) -> Iterator:
72
+ for type_, property_, _ in graph.query(self._iterate_query()):
73
+ yield from graph.query(self._sub_iterate_query(type_, property_))
50
74
 
51
- def transform(self, graph: Graph) -> None:
52
- # handle multi value type object properties
53
- for subject_uri, property_uri, value_types in Queries(graph).multi_value_type_property():
54
- for value_type_uri in value_types:
55
- _args = {
56
- "subject_uri": subject_uri,
57
- "property_uri": property_uri,
58
- "object_uri": value_type_uri,
59
- }
75
def operation(self, query_result_row: ResultRow) -> RowTransformationOutput:
    """Move one value from its property to a property suffixed with the value type.

    The row is unpacked as ``(subject, property, value, value_type)``. The new
    property is the old property URI followed by ``_`` and the value type with
    its namespace removed.
    """
    subject, old_property, value, value_type = query_result_row
    type_suffix = remove_namespace_from_uri(value_type)
    new_property = URIRef(f"{old_property}_{type_suffix}")

    row_output = RowTransformationOutput()
    row_output.add_triples.append(cast(Triple, (subject, new_property, value)))
    row_output.remove_triples.append(cast(Triple, (subject, old_property, value)))
    row_output.instances_modified_count += 1
    return row_output
77
87
 
78
88
 
79
89
  class ConvertLiteral(BaseTransformerStandardised):
@@ -251,7 +251,7 @@ class BaseAnalysis(ABC, Generic[T_Rules, T_Class, T_Property, T_ClassEntity, T_P
251
251
  if (
252
252
  only_rdfpath
253
253
  and isinstance(property_, InformationProperty)
254
- and isinstance(property_.transformation, RDFPath)
254
+ and isinstance(property_.instance_source, RDFPath)
255
255
  ) or not only_rdfpath:
256
256
  processed_properties[prop_entity] = property_
257
257
  class_property_pairs[class_] = processed_properties