cognite-neat 0.106.0__py3-none-any.whl → 0.108.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-neat might be problematic. Click here for more details.

Files changed (67) hide show
  1. cognite/neat/_constants.py +35 -1
  2. cognite/neat/_graph/_shared.py +4 -0
  3. cognite/neat/_graph/extractors/__init__.py +5 -1
  4. cognite/neat/_graph/extractors/_base.py +32 -0
  5. cognite/neat/_graph/extractors/_classic_cdf/_base.py +128 -14
  6. cognite/neat/_graph/extractors/_classic_cdf/_classic.py +156 -12
  7. cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +50 -12
  8. cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +26 -1
  9. cognite/neat/_graph/extractors/_dms.py +196 -47
  10. cognite/neat/_graph/extractors/_dms_graph.py +199 -0
  11. cognite/neat/_graph/extractors/_mock_graph_generator.py +1 -1
  12. cognite/neat/_graph/extractors/_rdf_file.py +33 -5
  13. cognite/neat/_graph/loaders/__init__.py +1 -3
  14. cognite/neat/_graph/loaders/_rdf2dms.py +123 -19
  15. cognite/neat/_graph/queries/_base.py +140 -84
  16. cognite/neat/_graph/queries/_construct.py +2 -2
  17. cognite/neat/_graph/transformers/__init__.py +8 -1
  18. cognite/neat/_graph/transformers/_base.py +9 -1
  19. cognite/neat/_graph/transformers/_classic_cdf.py +90 -3
  20. cognite/neat/_graph/transformers/_rdfpath.py +3 -3
  21. cognite/neat/_graph/transformers/_value_type.py +106 -45
  22. cognite/neat/_issues/errors/_resources.py +1 -1
  23. cognite/neat/_issues/warnings/__init__.py +0 -2
  24. cognite/neat/_issues/warnings/_models.py +1 -1
  25. cognite/neat/_issues/warnings/_properties.py +0 -8
  26. cognite/neat/_rules/analysis/_base.py +1 -1
  27. cognite/neat/_rules/analysis/_information.py +14 -13
  28. cognite/neat/_rules/catalog/__init__.py +1 -0
  29. cognite/neat/_rules/catalog/classic_model.xlsx +0 -0
  30. cognite/neat/_rules/catalog/info-rules-imf.xlsx +0 -0
  31. cognite/neat/_rules/exporters/_rules2instance_template.py +3 -3
  32. cognite/neat/_rules/importers/__init__.py +3 -1
  33. cognite/neat/_rules/importers/_dms2rules.py +7 -5
  34. cognite/neat/_rules/importers/_dtdl2rules/spec.py +1 -2
  35. cognite/neat/_rules/importers/_rdf/__init__.py +2 -2
  36. cognite/neat/_rules/importers/_rdf/_base.py +2 -2
  37. cognite/neat/_rules/importers/_rdf/_inference2rules.py +242 -19
  38. cognite/neat/_rules/models/_base_rules.py +13 -15
  39. cognite/neat/_rules/models/_types.py +5 -0
  40. cognite/neat/_rules/models/dms/_rules.py +51 -10
  41. cognite/neat/_rules/models/dms/_rules_input.py +4 -0
  42. cognite/neat/_rules/models/information/_rules.py +48 -5
  43. cognite/neat/_rules/models/information/_rules_input.py +6 -1
  44. cognite/neat/_rules/models/mapping/_classic2core.py +4 -5
  45. cognite/neat/_rules/transformers/__init__.py +10 -0
  46. cognite/neat/_rules/transformers/_converters.py +300 -62
  47. cognite/neat/_session/_base.py +57 -10
  48. cognite/neat/_session/_drop.py +5 -1
  49. cognite/neat/_session/_inspect.py +3 -2
  50. cognite/neat/_session/_mapping.py +17 -6
  51. cognite/neat/_session/_prepare.py +0 -47
  52. cognite/neat/_session/_read.py +115 -10
  53. cognite/neat/_session/_set.py +27 -0
  54. cognite/neat/_session/_show.py +4 -4
  55. cognite/neat/_session/_state.py +12 -1
  56. cognite/neat/_session/_to.py +43 -2
  57. cognite/neat/_session/_wizard.py +1 -1
  58. cognite/neat/_session/exceptions.py +8 -3
  59. cognite/neat/_store/_graph_store.py +331 -136
  60. cognite/neat/_store/_rules_store.py +130 -1
  61. cognite/neat/_utils/auth.py +3 -1
  62. cognite/neat/_version.py +1 -1
  63. {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/METADATA +2 -2
  64. {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/RECORD +67 -65
  65. {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/WHEEL +1 -1
  66. {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/LICENSE +0 -0
  67. {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/entry_points.txt +0 -0
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
5
5
  from cognite.client import data_modeling as dm
6
6
  from cognite.client.data_classes.data_modeling.ids import DataModelId
7
7
  from rdflib import DC, DCTERMS, FOAF, OWL, RDF, RDFS, SH, SKOS, XSD, Namespace, URIRef
8
+ from rdflib.namespace import DefinedNamespace
8
9
 
9
10
  from cognite import neat
10
11
 
@@ -73,10 +74,22 @@ DEFAULT_NAMESPACE = Namespace("http://purl.org/cognite/neat/")
73
74
  CDF_NAMESPACE = Namespace("https://cognitedata.com/")
74
75
  DEFAULT_BASE_URI = URIRef(DEFAULT_NAMESPACE)
75
76
  CLASSIC_CDF_NAMESPACE = Namespace("http://purl.org/cognite/cdf-classic#")
76
- UNKNOWN_TYPE = DEFAULT_NAMESPACE.UnknownType
77
77
  XML_SCHEMA_NAMESPACE = Namespace("http://www.w3.org/2001/XMLSchema#")
78
78
 
79
79
 
80
class NEAT(DefinedNamespace):
    """Vocabulary of terms that the NEAT library reserves for its internal data model.

    All terms live under the ``http://thisisneat.io/internal/`` namespace.
    """

    # NOTE(review): in rdflib's DefinedNamespace, ``_fail = True`` is expected to make
    # access to terms not declared below an error -- confirm against the rdflib docs.
    _fail = True
    _NS = Namespace("http://thisisneat.io/internal/")

    # Property used to express the type of a subject.
    type: URIRef
    # Type used to express that the type of a subject is unknown.
    UnknownType: URIRef
91
+
92
+
80
93
  def get_default_prefixes_and_namespaces() -> dict[str, Namespace]:
81
94
  return {
82
95
  "owl": OWL._NS,
@@ -154,3 +167,24 @@ READONLY_PROPERTIES_BY_CONTAINER: Mapping[dm.ContainerId, frozenset[str]] = {
154
167
 
155
168
def is_readonly_property(container: dm.ContainerId, property_: str) -> bool:
    """Tell whether ``property_`` is registered as read-only for ``container``.

    Args:
        container: The container to look up in ``READONLY_PROPERTIES_BY_CONTAINER``.
        property_: The property identifier to check.

    Returns:
        True only when the container has an entry and that entry contains
        the property; False for containers without an entry.
    """
    readonly_properties = READONLY_PROPERTIES_BY_CONTAINER.get(container)
    if readonly_properties is None:
        return False
    return property_ in readonly_properties
170
+
171
+
172
# Property identifiers collected as reserved for DMS.
# NOTE(review): presumably these are names a user-defined property must not use --
# confirm against the DMS API documentation.
DMS_RESERVED_PROPERTIES = frozenset(
    (
        # Timestamps
        "createdTime",
        "deletedTime",
        "lastUpdatedTime",
        # Identity
        "externalId",
        "space",
        "version",
        # Edge endpoints
        "startNode",
        "endNode",
        # Remaining reserved names
        "edge_id",
        "extensions",
        "node_id",
        "project_id",
        "property_group",
        "seq",
        "tg_table_name",
    )
)
@@ -7,6 +7,10 @@ MIMETypes: TypeAlias = Literal[
7
7
  RDFTypes: TypeAlias = Literal["xml", "rdf", "owl", "n3", "ttl", "turtle", "nt", "nq", "nquads", "trig"]
8
8
 
9
9
 
10
def quad_formats() -> list[str]:
    """Return the RDF format names that this module groups as quad formats.

    A new list is built on every call, so callers may mutate the result.
    """
    formats = ["trig", "nquads", "nq", "nt"]
    return formats
12
+
13
+
10
14
  def rdflib_to_oxi_type(rdflib_format: str) -> str | None:
11
15
  """Convert an RDFlib format to a MIME type.
12
16
 
@@ -1,6 +1,6 @@
1
1
  from cognite.neat._session.engine._interface import Extractor as EngineExtractor
2
2
 
3
- from ._base import BaseExtractor
3
+ from ._base import BaseExtractor, KnowledgeGraphExtractor
4
4
  from ._classic_cdf._assets import AssetsExtractor
5
5
  from ._classic_cdf._classic import ClassicGraphExtractor
6
6
  from ._classic_cdf._data_sets import DataSetExtractor
@@ -12,6 +12,7 @@ from ._classic_cdf._sequences import SequencesExtractor
12
12
  from ._classic_cdf._timeseries import TimeSeriesExtractor
13
13
  from ._dexpi import DexpiExtractor
14
14
  from ._dms import DMSExtractor
15
+ from ._dms_graph import DMSGraphExtractor
15
16
  from ._iodd import IODDExtractor
16
17
  from ._mock_graph_generator import MockGraphGenerator
17
18
  from ._rdf_file import RdfFileExtractor
@@ -21,11 +22,13 @@ __all__ = [
21
22
  "BaseExtractor",
22
23
  "ClassicGraphExtractor",
23
24
  "DMSExtractor",
25
+ "DMSGraphExtractor",
24
26
  "DataSetExtractor",
25
27
  "DexpiExtractor",
26
28
  "EventsExtractor",
27
29
  "FilesExtractor",
28
30
  "IODDExtractor",
31
+ "KnowledgeGraphExtractor",
29
32
  "LabelsExtractor",
30
33
  "MockGraphGenerator",
31
34
  "RdfFileExtractor",
@@ -51,6 +54,7 @@ TripleExtractors = (
51
54
  | ClassicGraphExtractor
52
55
  | DataSetExtractor
53
56
  | EngineExtractor
57
+ | DMSGraphExtractor
54
58
  )
55
59
 
56
60
 
@@ -1,9 +1,17 @@
1
1
  from abc import abstractmethod
2
2
  from collections.abc import Iterable
3
+ from typing import TYPE_CHECKING
3
4
 
5
+ from rdflib import URIRef
6
+
7
+ from cognite.neat._constants import DEFAULT_NAMESPACE
8
+ from cognite.neat._rules.models import InformationRules
4
9
  from cognite.neat._shared import Triple
5
10
  from cognite.neat._utils.auxiliary import class_html_doc
6
11
 
12
+ if TYPE_CHECKING:
13
+ from cognite.neat._store._provenance import Agent as ProvenanceAgent
14
+
7
15
 
8
16
  class BaseExtractor:
9
17
  """This is the base class for all extractors. It defines the interface that
@@ -24,3 +32,27 @@ class BaseExtractor:
24
32
@classmethod
def _repr_html_(cls) -> str:
    """Return the HTML that ``class_html_doc`` generates for this class."""
    html_doc = class_html_doc(cls)
    return html_doc
35
+
36
+
37
class KnowledgeGraphExtractor(BaseExtractor):
    """A knowledge graph extractor extracts triples with a schema"""

    # NOTE: the class docstring above is read at runtime by ``description``
    # (it returns the docstring's first line), so it is kept verbatim.

    @abstractmethod
    def get_information_rules(self) -> InformationRules:
        """Return the information rules that this extractor uses."""
        raise NotImplementedError()

    @property
    def description(self) -> str:
        """First line of the instance's docstring, or ``"Missing"`` when it has none."""
        doc = self.__doc__
        if not doc:
            return "Missing"
        first_line, _, _ = doc.strip().partition("\n")
        return first_line

    @property
    def source_uri(self) -> URIRef:
        """URI of the source being extracted; this base version always raises."""
        raise NotImplementedError

    @property
    def agent(self) -> "ProvenanceAgent":
        """Provenance agent whose id is derived from the extractor's class name."""
        # Imported here because, at module level, the name is only imported
        # under TYPE_CHECKING.
        from cognite.neat._store._provenance import Agent as ProvenanceAgent

        agent_id = DEFAULT_NAMESPACE[f"agent/{type(self).__name__}"]
        return ProvenanceAgent(id_=agent_id)
@@ -1,6 +1,8 @@
1
1
  import json
2
2
  import re
3
3
  import sys
4
+ import typing
5
+ import urllib.parse
4
6
  import warnings
5
7
  from abc import ABC, abstractmethod
6
8
  from collections.abc import Callable, Iterable, Sequence, Set
@@ -16,7 +18,8 @@ from rdflib import RDF, XSD, Literal, Namespace, URIRef
16
18
 
17
19
  from cognite.neat._constants import DEFAULT_NAMESPACE
18
20
  from cognite.neat._graph.extractors._base import BaseExtractor
19
- from cognite.neat._issues.warnings import CDFAuthWarning
21
+ from cognite.neat._issues.errors import NeatValueError
22
+ from cognite.neat._issues.warnings import CDFAuthWarning, NeatValueWarning
20
23
  from cognite.neat._shared import Triple
21
24
  from cognite.neat._utils.auxiliary import string_to_ideal_type
22
25
  from cognite.neat._utils.collection_ import iterate_progress_bar_if_above_config_threshold
@@ -72,6 +75,8 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
72
75
  camel_case (bool, optional): Whether to use camelCase instead of snake_case for property names.
73
76
  Defaults to True.
74
77
  as_write (bool, optional): Whether to use the write/request format of the items. Defaults to False.
78
+ prefix (str, optional): A prefix to add to the rdf type. Defaults to None.
79
+ identifier (Literal["id", "externalId"], optional): The identifier to use. Defaults to "id".
75
80
  """
76
81
 
77
82
  _default_rdf_type: str
@@ -89,6 +94,8 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
89
94
  skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
90
95
  camel_case: bool = True,
91
96
  as_write: bool = False,
97
+ prefix: str | None = None,
98
+ identifier: typing.Literal["id", "externalId"] = "id",
92
99
  ):
93
100
  self.namespace = namespace or DEFAULT_NAMESPACE
94
101
  self.items = items
@@ -99,9 +106,19 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
99
106
  self.skip_metadata_values = skip_metadata_values
100
107
  self.camel_case = camel_case
101
108
  self.as_write = as_write
109
+ self.prefix = prefix
110
+ self.identifier = identifier
111
+ # If identifier=externalId, we need to keep track of the external ids
112
+ # and use them in linking of Files, Sequences, TimeSeries, and Events.
113
+ self.asset_external_ids_by_id: dict[int, str] = {}
114
+ self.lookup_dataset_external_id: Callable[[int], str] | None = None
115
+ # Used by the ClassicGraphExtractor to log URIRefs
116
+ self._log_urirefs = False
117
+ self._uriref_by_external_id: dict[str, URIRef] = {}
102
118
 
103
119
  def extract(self) -> Iterable[Triple]:
104
120
  """Extracts an asset with the given asset_id."""
121
+ from ._assets import AssetsExtractor
105
122
 
106
123
  if self.total is not None and self.total > 0:
107
124
  to_iterate = iterate_progress_bar_if_above_config_threshold(
@@ -109,21 +126,40 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
109
126
  )
110
127
  else:
111
128
  to_iterate = self.items
129
+ if self.identifier == "externalId" and isinstance(self, AssetsExtractor):
130
+ to_iterate = self._store_asset_external_ids(to_iterate) # type: ignore[attr-defined]
131
+
112
132
  for no, asset in enumerate(to_iterate):
113
133
  yield from self._item2triples(asset)
114
134
  if self.limit and no >= self.limit:
115
135
  break
116
136
 
137
+ def _store_asset_external_ids(self, items: Iterable[T_CogniteResource]) -> Iterable[T_CogniteResource]:
138
+ for item in items:
139
+ if hasattr(item, "id") and hasattr(item, "external_id"):
140
+ self.asset_external_ids_by_id[item.id] = item.external_id
141
+ yield item
142
+
117
143
  def _item2triples(self, item: T_CogniteResource) -> list[Triple]:
118
- id_value: str | None
119
- if hasattr(item, "id"):
120
- id_value = str(item.id)
144
+ if self.identifier == "id":
145
+ id_value: str | None
146
+ if hasattr(item, "id"):
147
+ id_value = str(item.id)
148
+ else:
149
+ id_value = self._fallback_id(item)
150
+ if id_value is None:
151
+ return []
152
+ id_suffix = id_value
153
+ elif self.identifier == "externalId":
154
+ if not hasattr(item, "external_id"):
155
+ return []
156
+ id_suffix = self._external_id_as_uri_suffix(item.external_id)
121
157
  else:
122
- id_value = self._fallback_id(item)
123
- if id_value is None:
124
- return []
158
+ raise NeatValueError(f"Unknown identifier {self.identifier}")
125
159
 
126
- id_ = self.namespace[f"{self._instance_id_prefix}{id_value}"]
160
+ id_ = self.namespace[f"{self._instance_id_prefix}{id_suffix}"]
161
+ if self._log_urirefs and hasattr(item, "external_id"):
162
+ self._uriref_by_external_id[item.external_id] = id_
127
163
 
128
164
  type_ = self._get_rdf_type(item)
129
165
 
@@ -152,10 +188,25 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
152
188
  """This can be overridden to handle special cases for the item."""
153
189
  return []
154
190
 
191
+ @classmethod
192
+ def _external_id_as_uri_suffix(cls, external_id: str | None) -> str:
193
+ if external_id == "":
194
+ warnings.warn(NeatValueWarning(f"Empty external id in {cls._default_rdf_type}"), stacklevel=2)
195
+ return "empty"
196
+ elif external_id == "\x00":
197
+ warnings.warn(NeatValueWarning(f"Null external id in {cls._default_rdf_type}"), stacklevel=2)
198
+ return "null"
199
+ elif external_id is None:
200
+ warnings.warn(NeatValueWarning(f"None external id in {cls._default_rdf_type}"), stacklevel=2)
201
+ return "None"
202
+ # The external ID needs to pass the ^[^\\x00]{1,256}$ regex for the DMS API.
203
+ # In addition, neat internals requires the external ID to be a valid URI.
204
+ return urllib.parse.quote(external_id)
205
+
155
206
  def _fallback_id(self, item: T_CogniteResource) -> str | None:
156
207
  raise AttributeError(
157
208
  f"Item of type {type(item)} does not have an id attribute. "
158
- f"Please implement the _fallback_id method in the extractor."
209
+ "Please implement the _fallback_id method in the extractor."
159
210
  )
160
211
 
161
212
  def _metadata_to_triples(self, id_: URIRef, metadata: dict[str, str]) -> Iterable[Triple]:
@@ -174,13 +225,34 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
174
225
  type_ = self._default_rdf_type
175
226
  if self.to_type:
176
227
  type_ = self.to_type(item) or type_
228
+ if self.prefix:
229
+ type_ = f"{self.prefix}{type_}"
177
230
  return self._SPACE_PATTERN.sub("_", type_)
178
231
 
179
232
  def _as_object(self, raw: Any, key: str) -> Literal | URIRef:
233
+ """Return properly formatted object part of s-p-o triple"""
180
234
  if key in {"data_set_id", "dataSetId"}:
181
- return self.namespace[f"{InstanceIdPrefix.data_set}{raw}"]
235
+ if self.identifier == "externalId" and self.lookup_dataset_external_id:
236
+ try:
237
+ data_set_external_id = self.lookup_dataset_external_id(raw)
238
+ except KeyError:
239
+ return Literal("Unknown data set")
240
+ else:
241
+ return self.namespace[
242
+ f"{InstanceIdPrefix.data_set}{self._external_id_as_uri_suffix(data_set_external_id)}"
243
+ ]
244
+ else:
245
+ return self.namespace[f"{InstanceIdPrefix.data_set}{raw}"]
182
246
  elif key in {"assetId", "asset_id", "assetIds", "asset_ids", "parentId", "rootId", "parent_id", "root_id"}:
183
- return self.namespace[f"{InstanceIdPrefix.asset}{raw}"]
247
+ if self.identifier == "id":
248
+ return self.namespace[f"{InstanceIdPrefix.asset}{raw}"]
249
+ else:
250
+ try:
251
+ asset_external_id = self._external_id_as_uri_suffix(self.asset_external_ids_by_id[raw])
252
+ except KeyError:
253
+ return Literal("Unknown asset", datatype=XSD.string)
254
+ else:
255
+ return self.namespace[f"{InstanceIdPrefix.asset}{asset_external_id}"]
184
256
  elif key in {
185
257
  "startTime",
186
258
  "endTime",
@@ -218,9 +290,23 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
218
290
  skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
219
291
  camel_case: bool = True,
220
292
  as_write: bool = False,
293
+ prefix: str | None = None,
294
+ identifier: typing.Literal["id", "externalId"] = "id",
221
295
  ):
222
296
  total, items = cls._handle_no_access(lambda: cls._from_dataset(client, data_set_external_id))
223
- return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)
297
+ return cls(
298
+ items,
299
+ namespace,
300
+ to_type,
301
+ total,
302
+ limit,
303
+ unpack_metadata,
304
+ skip_metadata_values,
305
+ camel_case,
306
+ as_write,
307
+ prefix,
308
+ identifier,
309
+ )
224
310
 
225
311
  @classmethod
226
312
  @abstractmethod
@@ -241,9 +327,23 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
241
327
  skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
242
328
  camel_case: bool = True,
243
329
  as_write: bool = False,
330
+ prefix: str | None = None,
331
+ identifier: typing.Literal["id", "externalId"] = "id",
244
332
  ):
245
333
  total, items = cls._handle_no_access(lambda: cls._from_hierarchy(client, root_asset_external_id))
246
- return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)
334
+ return cls(
335
+ items,
336
+ namespace,
337
+ to_type,
338
+ total,
339
+ limit,
340
+ unpack_metadata,
341
+ skip_metadata_values,
342
+ camel_case,
343
+ as_write,
344
+ prefix,
345
+ identifier,
346
+ )
247
347
 
248
348
  @classmethod
249
349
  @abstractmethod
@@ -263,9 +363,23 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
263
363
  skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
264
364
  camel_case: bool = True,
265
365
  as_write: bool = False,
366
+ prefix: str | None = None,
367
+ identifier: typing.Literal["id", "externalId"] = "id",
266
368
  ):
267
369
  total, items = cls._from_file(file_path)
268
- return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)
370
+ return cls(
371
+ items,
372
+ namespace,
373
+ to_type,
374
+ total,
375
+ limit,
376
+ unpack_metadata,
377
+ skip_metadata_values,
378
+ camel_case,
379
+ as_write,
380
+ prefix,
381
+ identifier,
382
+ )
269
383
 
270
384
  @classmethod
271
385
  @abstractmethod
@@ -1,18 +1,27 @@
1
+ import typing
2
+ import urllib.parse
1
3
  import warnings
2
4
  from collections import defaultdict
3
5
  from collections.abc import Iterable, Sequence
4
- from typing import ClassVar, NamedTuple
6
+ from typing import ClassVar, NamedTuple, cast
5
7
 
6
8
  from cognite.client import CogniteClient
7
9
  from cognite.client.exceptions import CogniteAPIError
8
- from rdflib import Namespace
9
-
10
- from cognite.neat._constants import CLASSIC_CDF_NAMESPACE
11
- from cognite.neat._graph.extractors._base import BaseExtractor
12
- from cognite.neat._issues.warnings import CDFAuthWarning
10
+ from rdflib import Namespace, URIRef
11
+
12
+ from cognite.neat._constants import CLASSIC_CDF_NAMESPACE, DEFAULT_NAMESPACE, get_default_prefixes_and_namespaces
13
+ from cognite.neat._graph.extractors._base import KnowledgeGraphExtractor
14
+ from cognite.neat._issues.errors import NeatValueError, ResourceNotFoundError
15
+ from cognite.neat._issues.warnings import CDFAuthWarning, NeatValueWarning
16
+ from cognite.neat._rules._shared import ReadRules
17
+ from cognite.neat._rules.catalog import classic_model
18
+ from cognite.neat._rules.models import InformationInputRules, InformationRules
19
+ from cognite.neat._rules.models._rdfpath import Entity as RDFPathEntity
20
+ from cognite.neat._rules.models._rdfpath import RDFPath, SingleProperty
13
21
  from cognite.neat._shared import Triple
14
22
  from cognite.neat._utils.collection_ import chunker, iterate_progress_bar
15
23
  from cognite.neat._utils.rdf_ import remove_namespace_from_uri
24
+ from cognite.neat._utils.text import to_snake
16
25
 
17
26
  from ._assets import AssetsExtractor
18
27
  from ._base import ClassicCDFBaseExtractor, InstanceIdPrefix
@@ -37,7 +46,7 @@ class _ClassicCoreType(NamedTuple):
37
46
  api_name: str
38
47
 
39
48
 
40
- class ClassicGraphExtractor(BaseExtractor):
49
+ class ClassicGraphExtractor(KnowledgeGraphExtractor):
41
50
  """This extractor extracts all classic CDF Resources.
42
51
 
43
52
  The Classic Graph consists of the following core resource type.
@@ -93,6 +102,8 @@ class ClassicGraphExtractor(BaseExtractor):
93
102
  root_asset_external_id: str | None = None,
94
103
  namespace: Namespace | None = None,
95
104
  limit_per_type: int | None = None,
105
+ prefix: str | None = None,
106
+ identifier: typing.Literal["id", "externalId"] = "id",
96
107
  ):
97
108
  self._client = client
98
109
  if sum([bool(data_set_external_id), bool(root_asset_external_id)]) != 1:
@@ -101,16 +112,29 @@ class ClassicGraphExtractor(BaseExtractor):
101
112
  self._data_set_external_id = data_set_external_id
102
113
  self._namespace = namespace or CLASSIC_CDF_NAMESPACE
103
114
  self._extractor_args = dict(
104
- namespace=self._namespace, unpack_metadata=False, as_write=True, camel_case=True, limit=limit_per_type
115
+ namespace=self._namespace,
116
+ unpack_metadata=False,
117
+ as_write=True,
118
+ camel_case=True,
119
+ limit=limit_per_type,
120
+ prefix=prefix,
121
+ identifier=identifier,
105
122
  )
123
+ self._identifier = identifier
124
+ self._prefix = prefix
106
125
  self._limit_per_type = limit_per_type
107
126
 
127
+ self._uris_by_external_id_by_type: dict[InstanceIdPrefix, dict[str, URIRef]] = defaultdict(dict)
108
128
  self._source_external_ids_by_type: dict[InstanceIdPrefix, set[str]] = defaultdict(set)
109
129
  self._target_external_ids_by_type: dict[InstanceIdPrefix, set[str]] = defaultdict(set)
130
+ self._relationship_subject_predicate_type_external_id: list[tuple[URIRef, URIRef, str, str]] = []
110
131
  self._labels: set[str] = set()
111
132
  self._data_set_ids: set[int] = set()
133
+ self._data_set_external_ids: set[str] = set()
112
134
  self._extracted_labels = False
113
135
  self._extracted_data_sets = False
136
+ self._asset_external_ids_by_id: dict[int, str] = {}
137
+ self._dataset_external_ids_by_id: dict[int, str] = {}
114
138
 
115
139
  def _get_activity_names(self) -> list[str]:
116
140
  activities = [data_access_object.extractor_cls.__name__ for data_access_object in self._classic_node_types] + [
@@ -124,12 +148,17 @@ class ClassicGraphExtractor(BaseExtractor):
124
148
 
125
149
  def extract(self) -> Iterable[Triple]:
126
150
  """Extracts all classic CDF Resources."""
151
+ self._validate_exists()
152
+
127
153
  yield from self._extract_core_start_nodes()
128
154
 
129
155
  yield from self._extract_start_node_relationships()
130
156
 
131
157
  yield from self._extract_core_end_nodes()
132
158
 
159
+ if self._identifier == "id":
160
+ yield from self._extract_relationship_target_triples()
161
+
133
162
  try:
134
163
  yield from self._extract_labels()
135
164
  except CogniteAPIError as e:
@@ -144,6 +173,69 @@ class ClassicGraphExtractor(BaseExtractor):
144
173
  else:
145
174
  self._extracted_data_sets = True
146
175
 
176
def get_information_rules(self) -> InformationRules:
    """Build the information rules describing the classic CDF graph.

    The bundled classic model is read with ``ExcelImporter`` and verified.
    The extractor's namespace is registered among the prefixes, and every
    property gets an ``instance_source`` pointing at the matching class and
    property in that namespace.

    Raises:
        NeatValueError: If the classic model could not be read.
    """
    # To avoid circular imports
    from cognite.neat._rules.importers import ExcelImporter

    read_rules = cast(ReadRules[InformationInputRules], ExcelImporter(classic_model).to_rules())
    if read_rules.rules is None:
        raise NeatValueError(f"Could not read the classic model rules from {classic_model}.")
    rules = read_rules.rules.as_verified_rules()

    # Reuse the prefix already bound to our namespace, or register a new one.
    namespace_by_prefix = get_default_prefixes_and_namespaces()
    for candidate, namespace in namespace_by_prefix.items():
        if namespace == self._namespace:
            instance_prefix = candidate
            break
    else:
        instance_prefix = f"prefix_{len(namespace_by_prefix) + 1}"
        namespace_by_prefix[instance_prefix] = self._namespace
    rules.prefixes = namespace_by_prefix

    # Property ids must follow the casing the underlying extractors emit.
    use_snake_case = self._extractor_args["camel_case"] is False
    for prop in rules.properties:
        property_suffix = to_snake(prop.property_) if use_snake_case else prop.property_
        source_class = RDFPathEntity(prefix=instance_prefix, suffix=prop.class_.suffix)
        source_property = RDFPathEntity(prefix=instance_prefix, suffix=property_suffix)
        prop.instance_source = RDFPath(traversal=SingleProperty(class_=source_class, property=source_property))
    return rules
205
+
206
@property
def description(self) -> str:
    """One-sentence, human-readable summary of what is extracted and from where.

    Returns:
        A sentence naming the data set or the root asset the classic graph
        is extracted from, or "unknown source" when neither external id is set.
    """
    if self._data_set_external_id:
        source = f"data set {self._data_set_external_id}"
    elif self._root_asset_external_id:
        source = f"root asset {self._root_asset_external_id}"
    else:
        source = "unknown source"
    # The terminating period is added only here; previously each source
    # string carried its own, which doubled the period in the result.
    return f"Extracting classic CDF Graph (Assets, TimeSeries, Sequences, Events, Files) from {source}."
215
+
216
@property
def source_uri(self) -> URIRef:
    """URI identifying the CDF resource the graph is extracted from.

    The URI is built in ``DEFAULT_NAMESPACE`` as
    ``<project>/<resource>/<percent-encoded external id>``, where resource is
    ``dataset`` or ``asset``; both parts fall back to ``unknown`` when
    neither external id is set.
    """
    resource, external_id = "unknown", "unknown"
    if self._data_set_external_id:
        resource, external_id = "dataset", self._data_set_external_id
    elif self._root_asset_external_id:
        resource, external_id = "asset", self._root_asset_external_id

    project = self._client.config.project
    return DEFAULT_NAMESPACE[f"{project}/{resource}/{urllib.parse.quote(external_id)}"]
228
+
229
+ def _validate_exists(self) -> None:
230
+ if self._data_set_external_id:
231
+ if self._client.data_sets.retrieve(external_id=self._data_set_external_id) is None:
232
+ raise ResourceNotFoundError(self._data_set_external_id, "data set")
233
+ elif self._root_asset_external_id:
234
+ if self._client.assets.retrieve(external_id=self._root_asset_external_id) is None:
235
+ raise ResourceNotFoundError(self._root_asset_external_id, "root asset")
236
+ else:
237
+ raise ValueError("Exactly one of data_set_external_id or root_asset_external_id must be set.")
238
+
147
239
  def _extract_core_start_nodes(self):
148
240
  for core_node in self._classic_node_types:
149
241
  if self._data_set_external_id:
@@ -157,8 +249,20 @@ class ClassicGraphExtractor(BaseExtractor):
157
249
  else:
158
250
  raise ValueError("Exactly one of data_set_external_id or root_asset_external_id must be set.")
159
251
 
252
+ if self._identifier == "externalId":
253
+ if isinstance(extractor, AssetsExtractor):
254
+ self._asset_external_ids_by_id = extractor.asset_external_ids_by_id
255
+ else:
256
+ extractor.asset_external_ids_by_id = self._asset_external_ids_by_id
257
+ extractor.lookup_dataset_external_id = self._lookup_dataset
258
+ elif self._identifier == "id":
259
+ extractor._log_urirefs = True
260
+
160
261
  yield from self._extract_with_logging_label_dataset(extractor, core_node.resource_type)
161
262
 
263
+ if self._identifier == "id":
264
+ self._uris_by_external_id_by_type[core_node.resource_type].update(extractor._uriref_by_external_id)
265
+
162
266
  def _extract_start_node_relationships(self):
163
267
  for start_resource_type, source_external_ids in self._source_external_ids_by_type.items():
164
268
  start_type = start_resource_type.removesuffix("_")
@@ -169,6 +273,8 @@ class ClassicGraphExtractor(BaseExtractor):
169
273
  extractor = RelationshipsExtractor(relationship_iterator, **self._extractor_args)
170
274
  # This is a private attribute, but we need to set it to log the target nodes.
171
275
  extractor._log_target_nodes = True
276
+ if self._identifier == "id":
277
+ extractor._uri_by_external_id_by_by_type = self._uris_by_external_id_by_type
172
278
 
173
279
  yield from extractor.extract()
174
280
 
@@ -187,6 +293,11 @@ class ClassicGraphExtractor(BaseExtractor):
187
293
  ):
188
294
  self._target_external_ids_by_type[end_type].add(external_id)
189
295
 
296
+ if self._identifier == "id":
297
+ # We need to store all future target triples which we will lookup after fetching
298
+ # the target nodes.
299
+ self._relationship_subject_predicate_type_external_id.extend(extractor._target_triples)
300
+
190
301
  def _extract_core_end_nodes(self):
191
302
  for core_node in self._classic_node_types:
192
303
  target_external_ids = self._target_external_ids_by_type[core_node.resource_type]
@@ -197,8 +308,26 @@ class ClassicGraphExtractor(BaseExtractor):
197
308
  ):
198
309
  resource_iterator = api.retrieve_multiple(external_ids=list(chunk), ignore_unknown_ids=True)
199
310
  extractor = core_node.extractor_cls(resource_iterator, **self._extractor_args)
311
+
312
+ extractor.asset_external_ids_by_id = self._asset_external_ids_by_id
313
+ extractor.lookup_dataset_external_id = self._lookup_dataset
314
+ if self._identifier == "id":
315
+ extractor._log_urirefs = True
316
+
200
317
  yield from self._extract_with_logging_label_dataset(extractor)
201
318
 
319
+ if self._identifier == "id":
320
+ self._uris_by_external_id_by_type[core_node.resource_type].update(extractor._uriref_by_external_id)
321
+
322
+ def _extract_relationship_target_triples(self):
323
+ for id_, predicate, type_, external_id in self._relationship_subject_predicate_type_external_id:
324
+ try:
325
+ object_uri = self._uris_by_external_id_by_type[InstanceIdPrefix.from_str(type_)][external_id]
326
+ except KeyError:
327
+ warnings.warn(NeatValueWarning(f"Missing externalId {external_id} for {type_}"), stacklevel=2)
328
+ else:
329
+ yield id_, predicate, object_uri
330
+
202
331
  def _extract_labels(self):
203
332
  for chunk in self._chunk(list(self._labels), description="Extracting labels"):
204
333
  label_iterator = self._client.labels.retrieve(external_id=list(chunk), ignore_unknown_ids=True)
@@ -208,6 +337,11 @@ class ClassicGraphExtractor(BaseExtractor):
208
337
  for chunk in self._chunk(list(self._data_set_ids), description="Extracting data sets"):
209
338
  data_set_iterator = self._client.data_sets.retrieve_multiple(ids=list(chunk), ignore_unknown_ids=True)
210
339
  yield from DataSetExtractor(data_set_iterator, **self._extractor_args).extract()
340
+ for chunk in self._chunk(list(self._data_set_external_ids), description="Extracting data sets"):
341
+ data_set_iterator = self._client.data_sets.retrieve_multiple(
342
+ external_ids=list(chunk), ignore_unknown_ids=True
343
+ )
344
+ yield from DataSetExtractor(data_set_iterator, **self._extractor_args).extract()
211
345
 
212
346
  def _extract_with_logging_label_dataset(
213
347
  self, extractor: ClassicCDFBaseExtractor, resource_type: InstanceIdPrefix | None = None
@@ -217,10 +351,12 @@ class ClassicGraphExtractor(BaseExtractor):
217
351
  self._source_external_ids_by_type[resource_type].add(remove_namespace_from_uri(triple[2]))
218
352
  elif triple[1] == self._namespace.labels:
219
353
  self._labels.add(remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.label))
220
- elif triple[1] == self._namespace.datasetId:
221
- self._data_set_ids.add(
222
- int(remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.data_set))
223
- )
354
+ elif triple[1] == self._namespace.dataSetId:
355
+ identifier = remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.data_set)
356
+ try:
357
+ self._data_set_ids.add(int(identifier))
358
+ except ValueError:
359
+ self._data_set_external_ids.add(identifier)
224
360
  yield triple
225
361
 
226
362
  @staticmethod
@@ -230,3 +366,11 @@ class ClassicGraphExtractor(BaseExtractor):
230
366
  return iterate_progress_bar(to_iterate, (len(items) // 1_000) + 1, description)
231
367
  else:
232
368
  return to_iterate
369
+
370
+ def _lookup_dataset(self, dataset_id: int) -> str:
371
+ if dataset_id not in self._dataset_external_ids_by_id:
372
+ if (dataset := self._client.data_sets.retrieve(id=dataset_id)) and dataset.external_id:
373
+ self._dataset_external_ids_by_id[dataset_id] = dataset.external_id
374
+ else:
375
+ raise KeyError(f"Could not find dataset with id {dataset_id}.")
376
+ return self._dataset_external_ids_by_id[dataset_id]