cognite-neat 0.106.0__py3-none-any.whl → 0.108.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cognite-neat might be problematic. Click here for more details.
- cognite/neat/_constants.py +35 -1
- cognite/neat/_graph/_shared.py +4 -0
- cognite/neat/_graph/extractors/__init__.py +5 -1
- cognite/neat/_graph/extractors/_base.py +32 -0
- cognite/neat/_graph/extractors/_classic_cdf/_base.py +128 -14
- cognite/neat/_graph/extractors/_classic_cdf/_classic.py +156 -12
- cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +50 -12
- cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +26 -1
- cognite/neat/_graph/extractors/_dms.py +196 -47
- cognite/neat/_graph/extractors/_dms_graph.py +199 -0
- cognite/neat/_graph/extractors/_mock_graph_generator.py +1 -1
- cognite/neat/_graph/extractors/_rdf_file.py +33 -5
- cognite/neat/_graph/loaders/__init__.py +1 -3
- cognite/neat/_graph/loaders/_rdf2dms.py +123 -19
- cognite/neat/_graph/queries/_base.py +140 -84
- cognite/neat/_graph/queries/_construct.py +2 -2
- cognite/neat/_graph/transformers/__init__.py +8 -1
- cognite/neat/_graph/transformers/_base.py +9 -1
- cognite/neat/_graph/transformers/_classic_cdf.py +90 -3
- cognite/neat/_graph/transformers/_rdfpath.py +3 -3
- cognite/neat/_graph/transformers/_value_type.py +106 -45
- cognite/neat/_issues/errors/_resources.py +1 -1
- cognite/neat/_issues/warnings/__init__.py +0 -2
- cognite/neat/_issues/warnings/_models.py +1 -1
- cognite/neat/_issues/warnings/_properties.py +0 -8
- cognite/neat/_rules/analysis/_base.py +1 -1
- cognite/neat/_rules/analysis/_information.py +14 -13
- cognite/neat/_rules/catalog/__init__.py +1 -0
- cognite/neat/_rules/catalog/classic_model.xlsx +0 -0
- cognite/neat/_rules/catalog/info-rules-imf.xlsx +0 -0
- cognite/neat/_rules/exporters/_rules2instance_template.py +3 -3
- cognite/neat/_rules/importers/__init__.py +3 -1
- cognite/neat/_rules/importers/_dms2rules.py +7 -5
- cognite/neat/_rules/importers/_dtdl2rules/spec.py +1 -2
- cognite/neat/_rules/importers/_rdf/__init__.py +2 -2
- cognite/neat/_rules/importers/_rdf/_base.py +2 -2
- cognite/neat/_rules/importers/_rdf/_inference2rules.py +242 -19
- cognite/neat/_rules/models/_base_rules.py +13 -15
- cognite/neat/_rules/models/_types.py +5 -0
- cognite/neat/_rules/models/dms/_rules.py +51 -10
- cognite/neat/_rules/models/dms/_rules_input.py +4 -0
- cognite/neat/_rules/models/information/_rules.py +48 -5
- cognite/neat/_rules/models/information/_rules_input.py +6 -1
- cognite/neat/_rules/models/mapping/_classic2core.py +4 -5
- cognite/neat/_rules/transformers/__init__.py +10 -0
- cognite/neat/_rules/transformers/_converters.py +300 -62
- cognite/neat/_session/_base.py +57 -10
- cognite/neat/_session/_drop.py +5 -1
- cognite/neat/_session/_inspect.py +3 -2
- cognite/neat/_session/_mapping.py +17 -6
- cognite/neat/_session/_prepare.py +0 -47
- cognite/neat/_session/_read.py +115 -10
- cognite/neat/_session/_set.py +27 -0
- cognite/neat/_session/_show.py +4 -4
- cognite/neat/_session/_state.py +12 -1
- cognite/neat/_session/_to.py +43 -2
- cognite/neat/_session/_wizard.py +1 -1
- cognite/neat/_session/exceptions.py +8 -3
- cognite/neat/_store/_graph_store.py +331 -136
- cognite/neat/_store/_rules_store.py +130 -1
- cognite/neat/_utils/auth.py +3 -1
- cognite/neat/_version.py +1 -1
- {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/METADATA +2 -2
- {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/RECORD +67 -65
- {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/WHEEL +1 -1
- {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/LICENSE +0 -0
- {cognite_neat-0.106.0.dist-info → cognite_neat-0.108.0.dist-info}/entry_points.txt +0 -0
cognite/neat/_constants.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
from cognite.client import data_modeling as dm
|
|
6
6
|
from cognite.client.data_classes.data_modeling.ids import DataModelId
|
|
7
7
|
from rdflib import DC, DCTERMS, FOAF, OWL, RDF, RDFS, SH, SKOS, XSD, Namespace, URIRef
|
|
8
|
+
from rdflib.namespace import DefinedNamespace
|
|
8
9
|
|
|
9
10
|
from cognite import neat
|
|
10
11
|
|
|
@@ -73,10 +74,22 @@ DEFAULT_NAMESPACE = Namespace("http://purl.org/cognite/neat/")
|
|
|
73
74
|
CDF_NAMESPACE = Namespace("https://cognitedata.com/")
|
|
74
75
|
DEFAULT_BASE_URI = URIRef(DEFAULT_NAMESPACE)
|
|
75
76
|
CLASSIC_CDF_NAMESPACE = Namespace("http://purl.org/cognite/cdf-classic#")
|
|
76
|
-
UNKNOWN_TYPE = DEFAULT_NAMESPACE.UnknownType
|
|
77
77
|
XML_SCHEMA_NAMESPACE = Namespace("http://www.w3.org/2001/XMLSchema#")
|
|
78
78
|
|
|
79
79
|
|
|
80
|
+
class NEAT(DefinedNamespace):
|
|
81
|
+
"""
|
|
82
|
+
NEAT internal data model used for internal purposes of the NEAT library
|
|
83
|
+
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
_fail = True
|
|
87
|
+
_NS = Namespace("http://thisisneat.io/internal/")
|
|
88
|
+
|
|
89
|
+
type: URIRef # type property used to express a type of a subject
|
|
90
|
+
UnknownType: URIRef # Unknown type used to express that the type of a subject is unknown
|
|
91
|
+
|
|
92
|
+
|
|
80
93
|
def get_default_prefixes_and_namespaces() -> dict[str, Namespace]:
|
|
81
94
|
return {
|
|
82
95
|
"owl": OWL._NS,
|
|
@@ -154,3 +167,24 @@ READONLY_PROPERTIES_BY_CONTAINER: Mapping[dm.ContainerId, frozenset[str]] = {
|
|
|
154
167
|
|
|
155
168
|
def is_readonly_property(container: dm.ContainerId, property_: str) -> bool:
|
|
156
169
|
return container in READONLY_PROPERTIES_BY_CONTAINER and property_ in READONLY_PROPERTIES_BY_CONTAINER[container]
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
DMS_RESERVED_PROPERTIES = frozenset(
|
|
173
|
+
{
|
|
174
|
+
"createdTime",
|
|
175
|
+
"deletedTime",
|
|
176
|
+
"edge_id",
|
|
177
|
+
"extensions",
|
|
178
|
+
"externalId",
|
|
179
|
+
"lastUpdatedTime",
|
|
180
|
+
"node_id",
|
|
181
|
+
"project_id",
|
|
182
|
+
"property_group",
|
|
183
|
+
"seq",
|
|
184
|
+
"space",
|
|
185
|
+
"version",
|
|
186
|
+
"tg_table_name",
|
|
187
|
+
"startNode",
|
|
188
|
+
"endNode",
|
|
189
|
+
}
|
|
190
|
+
)
|
cognite/neat/_graph/_shared.py
CHANGED
|
@@ -7,6 +7,10 @@ MIMETypes: TypeAlias = Literal[
|
|
|
7
7
|
RDFTypes: TypeAlias = Literal["xml", "rdf", "owl", "n3", "ttl", "turtle", "nt", "nq", "nquads", "trig"]
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
def quad_formats() -> list[str]:
|
|
11
|
+
return ["trig", "nquads", "nq", "nt"]
|
|
12
|
+
|
|
13
|
+
|
|
10
14
|
def rdflib_to_oxi_type(rdflib_format: str) -> str | None:
|
|
11
15
|
"""Convert an RDFlib format to a MIME type.
|
|
12
16
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from cognite.neat._session.engine._interface import Extractor as EngineExtractor
|
|
2
2
|
|
|
3
|
-
from ._base import BaseExtractor
|
|
3
|
+
from ._base import BaseExtractor, KnowledgeGraphExtractor
|
|
4
4
|
from ._classic_cdf._assets import AssetsExtractor
|
|
5
5
|
from ._classic_cdf._classic import ClassicGraphExtractor
|
|
6
6
|
from ._classic_cdf._data_sets import DataSetExtractor
|
|
@@ -12,6 +12,7 @@ from ._classic_cdf._sequences import SequencesExtractor
|
|
|
12
12
|
from ._classic_cdf._timeseries import TimeSeriesExtractor
|
|
13
13
|
from ._dexpi import DexpiExtractor
|
|
14
14
|
from ._dms import DMSExtractor
|
|
15
|
+
from ._dms_graph import DMSGraphExtractor
|
|
15
16
|
from ._iodd import IODDExtractor
|
|
16
17
|
from ._mock_graph_generator import MockGraphGenerator
|
|
17
18
|
from ._rdf_file import RdfFileExtractor
|
|
@@ -21,11 +22,13 @@ __all__ = [
|
|
|
21
22
|
"BaseExtractor",
|
|
22
23
|
"ClassicGraphExtractor",
|
|
23
24
|
"DMSExtractor",
|
|
25
|
+
"DMSGraphExtractor",
|
|
24
26
|
"DataSetExtractor",
|
|
25
27
|
"DexpiExtractor",
|
|
26
28
|
"EventsExtractor",
|
|
27
29
|
"FilesExtractor",
|
|
28
30
|
"IODDExtractor",
|
|
31
|
+
"KnowledgeGraphExtractor",
|
|
29
32
|
"LabelsExtractor",
|
|
30
33
|
"MockGraphGenerator",
|
|
31
34
|
"RdfFileExtractor",
|
|
@@ -51,6 +54,7 @@ TripleExtractors = (
|
|
|
51
54
|
| ClassicGraphExtractor
|
|
52
55
|
| DataSetExtractor
|
|
53
56
|
| EngineExtractor
|
|
57
|
+
| DMSGraphExtractor
|
|
54
58
|
)
|
|
55
59
|
|
|
56
60
|
|
|
@@ -1,9 +1,17 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
2
|
from collections.abc import Iterable
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
3
4
|
|
|
5
|
+
from rdflib import URIRef
|
|
6
|
+
|
|
7
|
+
from cognite.neat._constants import DEFAULT_NAMESPACE
|
|
8
|
+
from cognite.neat._rules.models import InformationRules
|
|
4
9
|
from cognite.neat._shared import Triple
|
|
5
10
|
from cognite.neat._utils.auxiliary import class_html_doc
|
|
6
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from cognite.neat._store._provenance import Agent as ProvenanceAgent
|
|
14
|
+
|
|
7
15
|
|
|
8
16
|
class BaseExtractor:
|
|
9
17
|
"""This is the base class for all extractors. It defines the interface that
|
|
@@ -24,3 +32,27 @@ class BaseExtractor:
|
|
|
24
32
|
@classmethod
|
|
25
33
|
def _repr_html_(cls) -> str:
|
|
26
34
|
return class_html_doc(cls)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class KnowledgeGraphExtractor(BaseExtractor):
|
|
38
|
+
"""A knowledge graph extractor extracts triples with a schema"""
|
|
39
|
+
|
|
40
|
+
@abstractmethod
|
|
41
|
+
def get_information_rules(self) -> InformationRules:
|
|
42
|
+
"""Returns the information rules that the extractor uses."""
|
|
43
|
+
raise NotImplementedError()
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def description(self) -> str:
|
|
47
|
+
return self.__doc__.strip().split("\n")[0] if self.__doc__ else "Missing"
|
|
48
|
+
|
|
49
|
+
@property
|
|
50
|
+
def source_uri(self) -> URIRef:
|
|
51
|
+
raise NotImplementedError
|
|
52
|
+
|
|
53
|
+
@property
|
|
54
|
+
def agent(self) -> "ProvenanceAgent":
|
|
55
|
+
"""Provenance agent for the importer."""
|
|
56
|
+
from cognite.neat._store._provenance import Agent as ProvenanceAgent
|
|
57
|
+
|
|
58
|
+
return ProvenanceAgent(id_=DEFAULT_NAMESPACE[f"agent/{type(self).__name__}"])
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import re
|
|
3
3
|
import sys
|
|
4
|
+
import typing
|
|
5
|
+
import urllib.parse
|
|
4
6
|
import warnings
|
|
5
7
|
from abc import ABC, abstractmethod
|
|
6
8
|
from collections.abc import Callable, Iterable, Sequence, Set
|
|
@@ -16,7 +18,8 @@ from rdflib import RDF, XSD, Literal, Namespace, URIRef
|
|
|
16
18
|
|
|
17
19
|
from cognite.neat._constants import DEFAULT_NAMESPACE
|
|
18
20
|
from cognite.neat._graph.extractors._base import BaseExtractor
|
|
19
|
-
from cognite.neat._issues.
|
|
21
|
+
from cognite.neat._issues.errors import NeatValueError
|
|
22
|
+
from cognite.neat._issues.warnings import CDFAuthWarning, NeatValueWarning
|
|
20
23
|
from cognite.neat._shared import Triple
|
|
21
24
|
from cognite.neat._utils.auxiliary import string_to_ideal_type
|
|
22
25
|
from cognite.neat._utils.collection_ import iterate_progress_bar_if_above_config_threshold
|
|
@@ -72,6 +75,8 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
72
75
|
camel_case (bool, optional): Whether to use camelCase instead of snake_case for property names.
|
|
73
76
|
Defaults to True.
|
|
74
77
|
as_write (bool, optional): Whether to use the write/request format of the items. Defaults to False.
|
|
78
|
+
prefix (str, optional): A prefix to add to the rdf type. Defaults to None.
|
|
79
|
+
identifier (Literal["id", "externalId"], optional): The identifier to use. Defaults to "id".
|
|
75
80
|
"""
|
|
76
81
|
|
|
77
82
|
_default_rdf_type: str
|
|
@@ -89,6 +94,8 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
89
94
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
90
95
|
camel_case: bool = True,
|
|
91
96
|
as_write: bool = False,
|
|
97
|
+
prefix: str | None = None,
|
|
98
|
+
identifier: typing.Literal["id", "externalId"] = "id",
|
|
92
99
|
):
|
|
93
100
|
self.namespace = namespace or DEFAULT_NAMESPACE
|
|
94
101
|
self.items = items
|
|
@@ -99,9 +106,19 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
99
106
|
self.skip_metadata_values = skip_metadata_values
|
|
100
107
|
self.camel_case = camel_case
|
|
101
108
|
self.as_write = as_write
|
|
109
|
+
self.prefix = prefix
|
|
110
|
+
self.identifier = identifier
|
|
111
|
+
# If identifier=externalId, we need to keep track of the external ids
|
|
112
|
+
# and use them in linking of Files, Sequences, TimeSeries, and Events.
|
|
113
|
+
self.asset_external_ids_by_id: dict[int, str] = {}
|
|
114
|
+
self.lookup_dataset_external_id: Callable[[int], str] | None = None
|
|
115
|
+
# Used by the ClassicGraphExtractor to log URIRefs
|
|
116
|
+
self._log_urirefs = False
|
|
117
|
+
self._uriref_by_external_id: dict[str, URIRef] = {}
|
|
102
118
|
|
|
103
119
|
def extract(self) -> Iterable[Triple]:
|
|
104
120
|
"""Extracts an asset with the given asset_id."""
|
|
121
|
+
from ._assets import AssetsExtractor
|
|
105
122
|
|
|
106
123
|
if self.total is not None and self.total > 0:
|
|
107
124
|
to_iterate = iterate_progress_bar_if_above_config_threshold(
|
|
@@ -109,21 +126,40 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
109
126
|
)
|
|
110
127
|
else:
|
|
111
128
|
to_iterate = self.items
|
|
129
|
+
if self.identifier == "externalId" and isinstance(self, AssetsExtractor):
|
|
130
|
+
to_iterate = self._store_asset_external_ids(to_iterate) # type: ignore[attr-defined]
|
|
131
|
+
|
|
112
132
|
for no, asset in enumerate(to_iterate):
|
|
113
133
|
yield from self._item2triples(asset)
|
|
114
134
|
if self.limit and no >= self.limit:
|
|
115
135
|
break
|
|
116
136
|
|
|
137
|
+
def _store_asset_external_ids(self, items: Iterable[T_CogniteResource]) -> Iterable[T_CogniteResource]:
|
|
138
|
+
for item in items:
|
|
139
|
+
if hasattr(item, "id") and hasattr(item, "external_id"):
|
|
140
|
+
self.asset_external_ids_by_id[item.id] = item.external_id
|
|
141
|
+
yield item
|
|
142
|
+
|
|
117
143
|
def _item2triples(self, item: T_CogniteResource) -> list[Triple]:
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
144
|
+
if self.identifier == "id":
|
|
145
|
+
id_value: str | None
|
|
146
|
+
if hasattr(item, "id"):
|
|
147
|
+
id_value = str(item.id)
|
|
148
|
+
else:
|
|
149
|
+
id_value = self._fallback_id(item)
|
|
150
|
+
if id_value is None:
|
|
151
|
+
return []
|
|
152
|
+
id_suffix = id_value
|
|
153
|
+
elif self.identifier == "externalId":
|
|
154
|
+
if not hasattr(item, "external_id"):
|
|
155
|
+
return []
|
|
156
|
+
id_suffix = self._external_id_as_uri_suffix(item.external_id)
|
|
121
157
|
else:
|
|
122
|
-
|
|
123
|
-
if id_value is None:
|
|
124
|
-
return []
|
|
158
|
+
raise NeatValueError(f"Unknown identifier {self.identifier}")
|
|
125
159
|
|
|
126
|
-
id_ = self.namespace[f"{self._instance_id_prefix}{id_value}"]
|
|
160
|
+
id_ = self.namespace[f"{self._instance_id_prefix}{id_suffix}"]
|
|
161
|
+
if self._log_urirefs and hasattr(item, "external_id"):
|
|
162
|
+
self._uriref_by_external_id[item.external_id] = id_
|
|
127
163
|
|
|
128
164
|
type_ = self._get_rdf_type(item)
|
|
129
165
|
|
|
@@ -152,10 +188,25 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
152
188
|
"""This can be overridden to handle special cases for the item."""
|
|
153
189
|
return []
|
|
154
190
|
|
|
191
|
+
@classmethod
|
|
192
|
+
def _external_id_as_uri_suffix(cls, external_id: str | None) -> str:
|
|
193
|
+
if external_id == "":
|
|
194
|
+
warnings.warn(NeatValueWarning(f"Empty external id in {cls._default_rdf_type}"), stacklevel=2)
|
|
195
|
+
return "empty"
|
|
196
|
+
elif external_id == "\x00":
|
|
197
|
+
warnings.warn(NeatValueWarning(f"Null external id in {cls._default_rdf_type}"), stacklevel=2)
|
|
198
|
+
return "null"
|
|
199
|
+
elif external_id is None:
|
|
200
|
+
warnings.warn(NeatValueWarning(f"None external id in {cls._default_rdf_type}"), stacklevel=2)
|
|
201
|
+
return "None"
|
|
202
|
+
# The external ID needs to pass the ^[^\\x00]{1,256}$ regex for the DMS API.
|
|
203
|
+
# In addition, neat internals requires the external ID to be a valid URI.
|
|
204
|
+
return urllib.parse.quote(external_id)
|
|
205
|
+
|
|
155
206
|
def _fallback_id(self, item: T_CogniteResource) -> str | None:
|
|
156
207
|
raise AttributeError(
|
|
157
208
|
f"Item of type {type(item)} does not have an id attribute. "
|
|
158
|
-
|
|
209
|
+
"Please implement the _fallback_id method in the extractor."
|
|
159
210
|
)
|
|
160
211
|
|
|
161
212
|
def _metadata_to_triples(self, id_: URIRef, metadata: dict[str, str]) -> Iterable[Triple]:
|
|
@@ -174,13 +225,34 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
174
225
|
type_ = self._default_rdf_type
|
|
175
226
|
if self.to_type:
|
|
176
227
|
type_ = self.to_type(item) or type_
|
|
228
|
+
if self.prefix:
|
|
229
|
+
type_ = f"{self.prefix}{type_}"
|
|
177
230
|
return self._SPACE_PATTERN.sub("_", type_)
|
|
178
231
|
|
|
179
232
|
def _as_object(self, raw: Any, key: str) -> Literal | URIRef:
|
|
233
|
+
"""Return properly formatted object part of s-p-o triple"""
|
|
180
234
|
if key in {"data_set_id", "dataSetId"}:
|
|
181
|
-
|
|
235
|
+
if self.identifier == "externalId" and self.lookup_dataset_external_id:
|
|
236
|
+
try:
|
|
237
|
+
data_set_external_id = self.lookup_dataset_external_id(raw)
|
|
238
|
+
except KeyError:
|
|
239
|
+
return Literal("Unknown data set")
|
|
240
|
+
else:
|
|
241
|
+
return self.namespace[
|
|
242
|
+
f"{InstanceIdPrefix.data_set}{self._external_id_as_uri_suffix(data_set_external_id)}"
|
|
243
|
+
]
|
|
244
|
+
else:
|
|
245
|
+
return self.namespace[f"{InstanceIdPrefix.data_set}{raw}"]
|
|
182
246
|
elif key in {"assetId", "asset_id", "assetIds", "asset_ids", "parentId", "rootId", "parent_id", "root_id"}:
|
|
183
|
-
|
|
247
|
+
if self.identifier == "id":
|
|
248
|
+
return self.namespace[f"{InstanceIdPrefix.asset}{raw}"]
|
|
249
|
+
else:
|
|
250
|
+
try:
|
|
251
|
+
asset_external_id = self._external_id_as_uri_suffix(self.asset_external_ids_by_id[raw])
|
|
252
|
+
except KeyError:
|
|
253
|
+
return Literal("Unknown asset", datatype=XSD.string)
|
|
254
|
+
else:
|
|
255
|
+
return self.namespace[f"{InstanceIdPrefix.asset}{asset_external_id}"]
|
|
184
256
|
elif key in {
|
|
185
257
|
"startTime",
|
|
186
258
|
"endTime",
|
|
@@ -218,9 +290,23 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
218
290
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
219
291
|
camel_case: bool = True,
|
|
220
292
|
as_write: bool = False,
|
|
293
|
+
prefix: str | None = None,
|
|
294
|
+
identifier: typing.Literal["id", "externalId"] = "id",
|
|
221
295
|
):
|
|
222
296
|
total, items = cls._handle_no_access(lambda: cls._from_dataset(client, data_set_external_id))
|
|
223
|
-
return cls(
|
|
297
|
+
return cls(
|
|
298
|
+
items,
|
|
299
|
+
namespace,
|
|
300
|
+
to_type,
|
|
301
|
+
total,
|
|
302
|
+
limit,
|
|
303
|
+
unpack_metadata,
|
|
304
|
+
skip_metadata_values,
|
|
305
|
+
camel_case,
|
|
306
|
+
as_write,
|
|
307
|
+
prefix,
|
|
308
|
+
identifier,
|
|
309
|
+
)
|
|
224
310
|
|
|
225
311
|
@classmethod
|
|
226
312
|
@abstractmethod
|
|
@@ -241,9 +327,23 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
241
327
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
242
328
|
camel_case: bool = True,
|
|
243
329
|
as_write: bool = False,
|
|
330
|
+
prefix: str | None = None,
|
|
331
|
+
identifier: typing.Literal["id", "externalId"] = "id",
|
|
244
332
|
):
|
|
245
333
|
total, items = cls._handle_no_access(lambda: cls._from_hierarchy(client, root_asset_external_id))
|
|
246
|
-
return cls(
|
|
334
|
+
return cls(
|
|
335
|
+
items,
|
|
336
|
+
namespace,
|
|
337
|
+
to_type,
|
|
338
|
+
total,
|
|
339
|
+
limit,
|
|
340
|
+
unpack_metadata,
|
|
341
|
+
skip_metadata_values,
|
|
342
|
+
camel_case,
|
|
343
|
+
as_write,
|
|
344
|
+
prefix,
|
|
345
|
+
identifier,
|
|
346
|
+
)
|
|
247
347
|
|
|
248
348
|
@classmethod
|
|
249
349
|
@abstractmethod
|
|
@@ -263,9 +363,23 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
263
363
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
264
364
|
camel_case: bool = True,
|
|
265
365
|
as_write: bool = False,
|
|
366
|
+
prefix: str | None = None,
|
|
367
|
+
identifier: typing.Literal["id", "externalId"] = "id",
|
|
266
368
|
):
|
|
267
369
|
total, items = cls._from_file(file_path)
|
|
268
|
-
return cls(
|
|
370
|
+
return cls(
|
|
371
|
+
items,
|
|
372
|
+
namespace,
|
|
373
|
+
to_type,
|
|
374
|
+
total,
|
|
375
|
+
limit,
|
|
376
|
+
unpack_metadata,
|
|
377
|
+
skip_metadata_values,
|
|
378
|
+
camel_case,
|
|
379
|
+
as_write,
|
|
380
|
+
prefix,
|
|
381
|
+
identifier,
|
|
382
|
+
)
|
|
269
383
|
|
|
270
384
|
@classmethod
|
|
271
385
|
@abstractmethod
|
|
@@ -1,18 +1,27 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
import urllib.parse
|
|
1
3
|
import warnings
|
|
2
4
|
from collections import defaultdict
|
|
3
5
|
from collections.abc import Iterable, Sequence
|
|
4
|
-
from typing import ClassVar, NamedTuple
|
|
6
|
+
from typing import ClassVar, NamedTuple, cast
|
|
5
7
|
|
|
6
8
|
from cognite.client import CogniteClient
|
|
7
9
|
from cognite.client.exceptions import CogniteAPIError
|
|
8
|
-
from rdflib import Namespace
|
|
9
|
-
|
|
10
|
-
from cognite.neat._constants import CLASSIC_CDF_NAMESPACE
|
|
11
|
-
from cognite.neat._graph.extractors._base import BaseExtractor
|
|
12
|
-
from cognite.neat._issues.
|
|
10
|
+
from rdflib import Namespace, URIRef
|
|
11
|
+
|
|
12
|
+
from cognite.neat._constants import CLASSIC_CDF_NAMESPACE, DEFAULT_NAMESPACE, get_default_prefixes_and_namespaces
|
|
13
|
+
from cognite.neat._graph.extractors._base import KnowledgeGraphExtractor
|
|
14
|
+
from cognite.neat._issues.errors import NeatValueError, ResourceNotFoundError
|
|
15
|
+
from cognite.neat._issues.warnings import CDFAuthWarning, NeatValueWarning
|
|
16
|
+
from cognite.neat._rules._shared import ReadRules
|
|
17
|
+
from cognite.neat._rules.catalog import classic_model
|
|
18
|
+
from cognite.neat._rules.models import InformationInputRules, InformationRules
|
|
19
|
+
from cognite.neat._rules.models._rdfpath import Entity as RDFPathEntity
|
|
20
|
+
from cognite.neat._rules.models._rdfpath import RDFPath, SingleProperty
|
|
13
21
|
from cognite.neat._shared import Triple
|
|
14
22
|
from cognite.neat._utils.collection_ import chunker, iterate_progress_bar
|
|
15
23
|
from cognite.neat._utils.rdf_ import remove_namespace_from_uri
|
|
24
|
+
from cognite.neat._utils.text import to_snake
|
|
16
25
|
|
|
17
26
|
from ._assets import AssetsExtractor
|
|
18
27
|
from ._base import ClassicCDFBaseExtractor, InstanceIdPrefix
|
|
@@ -37,7 +46,7 @@ class _ClassicCoreType(NamedTuple):
|
|
|
37
46
|
api_name: str
|
|
38
47
|
|
|
39
48
|
|
|
40
|
-
class ClassicGraphExtractor(BaseExtractor):
|
|
49
|
+
class ClassicGraphExtractor(KnowledgeGraphExtractor):
|
|
41
50
|
"""This extractor extracts all classic CDF Resources.
|
|
42
51
|
|
|
43
52
|
The Classic Graph consists of the following core resource type.
|
|
@@ -93,6 +102,8 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
93
102
|
root_asset_external_id: str | None = None,
|
|
94
103
|
namespace: Namespace | None = None,
|
|
95
104
|
limit_per_type: int | None = None,
|
|
105
|
+
prefix: str | None = None,
|
|
106
|
+
identifier: typing.Literal["id", "externalId"] = "id",
|
|
96
107
|
):
|
|
97
108
|
self._client = client
|
|
98
109
|
if sum([bool(data_set_external_id), bool(root_asset_external_id)]) != 1:
|
|
@@ -101,16 +112,29 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
101
112
|
self._data_set_external_id = data_set_external_id
|
|
102
113
|
self._namespace = namespace or CLASSIC_CDF_NAMESPACE
|
|
103
114
|
self._extractor_args = dict(
|
|
104
|
-
namespace=self._namespace,
|
|
115
|
+
namespace=self._namespace,
|
|
116
|
+
unpack_metadata=False,
|
|
117
|
+
as_write=True,
|
|
118
|
+
camel_case=True,
|
|
119
|
+
limit=limit_per_type,
|
|
120
|
+
prefix=prefix,
|
|
121
|
+
identifier=identifier,
|
|
105
122
|
)
|
|
123
|
+
self._identifier = identifier
|
|
124
|
+
self._prefix = prefix
|
|
106
125
|
self._limit_per_type = limit_per_type
|
|
107
126
|
|
|
127
|
+
self._uris_by_external_id_by_type: dict[InstanceIdPrefix, dict[str, URIRef]] = defaultdict(dict)
|
|
108
128
|
self._source_external_ids_by_type: dict[InstanceIdPrefix, set[str]] = defaultdict(set)
|
|
109
129
|
self._target_external_ids_by_type: dict[InstanceIdPrefix, set[str]] = defaultdict(set)
|
|
130
|
+
self._relationship_subject_predicate_type_external_id: list[tuple[URIRef, URIRef, str, str]] = []
|
|
110
131
|
self._labels: set[str] = set()
|
|
111
132
|
self._data_set_ids: set[int] = set()
|
|
133
|
+
self._data_set_external_ids: set[str] = set()
|
|
112
134
|
self._extracted_labels = False
|
|
113
135
|
self._extracted_data_sets = False
|
|
136
|
+
self._asset_external_ids_by_id: dict[int, str] = {}
|
|
137
|
+
self._dataset_external_ids_by_id: dict[int, str] = {}
|
|
114
138
|
|
|
115
139
|
def _get_activity_names(self) -> list[str]:
|
|
116
140
|
activities = [data_access_object.extractor_cls.__name__ for data_access_object in self._classic_node_types] + [
|
|
@@ -124,12 +148,17 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
124
148
|
|
|
125
149
|
def extract(self) -> Iterable[Triple]:
|
|
126
150
|
"""Extracts all classic CDF Resources."""
|
|
151
|
+
self._validate_exists()
|
|
152
|
+
|
|
127
153
|
yield from self._extract_core_start_nodes()
|
|
128
154
|
|
|
129
155
|
yield from self._extract_start_node_relationships()
|
|
130
156
|
|
|
131
157
|
yield from self._extract_core_end_nodes()
|
|
132
158
|
|
|
159
|
+
if self._identifier == "id":
|
|
160
|
+
yield from self._extract_relationship_target_triples()
|
|
161
|
+
|
|
133
162
|
try:
|
|
134
163
|
yield from self._extract_labels()
|
|
135
164
|
except CogniteAPIError as e:
|
|
@@ -144,6 +173,69 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
144
173
|
else:
|
|
145
174
|
self._extracted_data_sets = True
|
|
146
175
|
|
|
176
|
+
def get_information_rules(self) -> InformationRules:
|
|
177
|
+
# To avoid circular imports
|
|
178
|
+
from cognite.neat._rules.importers import ExcelImporter
|
|
179
|
+
|
|
180
|
+
unverified = cast(ReadRules[InformationInputRules], ExcelImporter(classic_model).to_rules())
|
|
181
|
+
if unverified.rules is None:
|
|
182
|
+
raise NeatValueError(f"Could not read the classic model rules from {classic_model}.")
|
|
183
|
+
|
|
184
|
+
verified = unverified.rules.as_verified_rules()
|
|
185
|
+
prefixes = get_default_prefixes_and_namespaces()
|
|
186
|
+
instance_prefix: str | None = next((k for k, v in prefixes.items() if v == self._namespace), None)
|
|
187
|
+
if instance_prefix is None:
|
|
188
|
+
# We need to add a new prefix
|
|
189
|
+
instance_prefix = f"prefix_{len(prefixes) + 1}"
|
|
190
|
+
prefixes[instance_prefix] = self._namespace
|
|
191
|
+
verified.prefixes = prefixes
|
|
192
|
+
|
|
193
|
+
is_snake_case = self._extractor_args["camel_case"] is False
|
|
194
|
+
for prop in verified.properties:
|
|
195
|
+
prop_id = prop.property_
|
|
196
|
+
if is_snake_case:
|
|
197
|
+
prop_id = to_snake(prop_id)
|
|
198
|
+
prop.instance_source = RDFPath(
|
|
199
|
+
traversal=SingleProperty(
|
|
200
|
+
class_=RDFPathEntity(prefix=instance_prefix, suffix=prop.class_.suffix),
|
|
201
|
+
property=RDFPathEntity(prefix=instance_prefix, suffix=prop_id),
|
|
202
|
+
)
|
|
203
|
+
)
|
|
204
|
+
return verified
|
|
205
|
+
|
|
206
|
+
@property
|
|
207
|
+
def description(self) -> str:
|
|
208
|
+
if self._data_set_external_id:
|
|
209
|
+
source = f"data set {self._data_set_external_id}."
|
|
210
|
+
elif self._root_asset_external_id:
|
|
211
|
+
source = f"root asset {self._root_asset_external_id}."
|
|
212
|
+
else:
|
|
213
|
+
source = "unknown source."
|
|
214
|
+
return f"Extracting clasic CDF Graph (Assets, TimeSeries, Sequences, Events, Files) from {source}."
|
|
215
|
+
|
|
216
|
+
@property
|
|
217
|
+
def source_uri(self) -> URIRef:
|
|
218
|
+
if self._data_set_external_id:
|
|
219
|
+
resource = "dataset"
|
|
220
|
+
external_id = self._data_set_external_id
|
|
221
|
+
elif self._root_asset_external_id:
|
|
222
|
+
resource = "asset"
|
|
223
|
+
external_id = self._root_asset_external_id
|
|
224
|
+
else:
|
|
225
|
+
resource = "unknown"
|
|
226
|
+
external_id = "unknown"
|
|
227
|
+
return DEFAULT_NAMESPACE[f"{self._client.config.project}/{resource}/{urllib.parse.quote(external_id)}"]
|
|
228
|
+
|
|
229
|
+
def _validate_exists(self) -> None:
|
|
230
|
+
if self._data_set_external_id:
|
|
231
|
+
if self._client.data_sets.retrieve(external_id=self._data_set_external_id) is None:
|
|
232
|
+
raise ResourceNotFoundError(self._data_set_external_id, "data set")
|
|
233
|
+
elif self._root_asset_external_id:
|
|
234
|
+
if self._client.assets.retrieve(external_id=self._root_asset_external_id) is None:
|
|
235
|
+
raise ResourceNotFoundError(self._root_asset_external_id, "root asset")
|
|
236
|
+
else:
|
|
237
|
+
raise ValueError("Exactly one of data_set_external_id or root_asset_external_id must be set.")
|
|
238
|
+
|
|
147
239
|
def _extract_core_start_nodes(self):
|
|
148
240
|
for core_node in self._classic_node_types:
|
|
149
241
|
if self._data_set_external_id:
|
|
@@ -157,8 +249,20 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
157
249
|
else:
|
|
158
250
|
raise ValueError("Exactly one of data_set_external_id or root_asset_external_id must be set.")
|
|
159
251
|
|
|
252
|
+
if self._identifier == "externalId":
|
|
253
|
+
if isinstance(extractor, AssetsExtractor):
|
|
254
|
+
self._asset_external_ids_by_id = extractor.asset_external_ids_by_id
|
|
255
|
+
else:
|
|
256
|
+
extractor.asset_external_ids_by_id = self._asset_external_ids_by_id
|
|
257
|
+
extractor.lookup_dataset_external_id = self._lookup_dataset
|
|
258
|
+
elif self._identifier == "id":
|
|
259
|
+
extractor._log_urirefs = True
|
|
260
|
+
|
|
160
261
|
yield from self._extract_with_logging_label_dataset(extractor, core_node.resource_type)
|
|
161
262
|
|
|
263
|
+
if self._identifier == "id":
|
|
264
|
+
self._uris_by_external_id_by_type[core_node.resource_type].update(extractor._uriref_by_external_id)
|
|
265
|
+
|
|
162
266
|
def _extract_start_node_relationships(self):
|
|
163
267
|
for start_resource_type, source_external_ids in self._source_external_ids_by_type.items():
|
|
164
268
|
start_type = start_resource_type.removesuffix("_")
|
|
@@ -169,6 +273,8 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
169
273
|
extractor = RelationshipsExtractor(relationship_iterator, **self._extractor_args)
|
|
170
274
|
# This is a private attribute, but we need to set it to log the target nodes.
|
|
171
275
|
extractor._log_target_nodes = True
|
|
276
|
+
if self._identifier == "id":
|
|
277
|
+
extractor._uri_by_external_id_by_by_type = self._uris_by_external_id_by_type
|
|
172
278
|
|
|
173
279
|
yield from extractor.extract()
|
|
174
280
|
|
|
@@ -187,6 +293,11 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
187
293
|
):
|
|
188
294
|
self._target_external_ids_by_type[end_type].add(external_id)
|
|
189
295
|
|
|
296
|
+
if self._identifier == "id":
|
|
297
|
+
# We need to store all future target triples which we will lookup after fetching
|
|
298
|
+
# the target nodes.
|
|
299
|
+
self._relationship_subject_predicate_type_external_id.extend(extractor._target_triples)
|
|
300
|
+
|
|
190
301
|
def _extract_core_end_nodes(self):
|
|
191
302
|
for core_node in self._classic_node_types:
|
|
192
303
|
target_external_ids = self._target_external_ids_by_type[core_node.resource_type]
|
|
@@ -197,8 +308,26 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
197
308
|
):
|
|
198
309
|
resource_iterator = api.retrieve_multiple(external_ids=list(chunk), ignore_unknown_ids=True)
|
|
199
310
|
extractor = core_node.extractor_cls(resource_iterator, **self._extractor_args)
|
|
311
|
+
|
|
312
|
+
extractor.asset_external_ids_by_id = self._asset_external_ids_by_id
|
|
313
|
+
extractor.lookup_dataset_external_id = self._lookup_dataset
|
|
314
|
+
if self._identifier == "id":
|
|
315
|
+
extractor._log_urirefs = True
|
|
316
|
+
|
|
200
317
|
yield from self._extract_with_logging_label_dataset(extractor)
|
|
201
318
|
|
|
319
|
+
if self._identifier == "id":
|
|
320
|
+
self._uris_by_external_id_by_type[core_node.resource_type].update(extractor._uriref_by_external_id)
|
|
321
|
+
|
|
322
|
+
def _extract_relationship_target_triples(self):
    """Yield the deferred relationship triples with their target resolved to a URI.

    Iterates the ``(subject, predicate, type, external_id)`` entries stored in
    ``self._relationship_subject_predicate_type_external_id`` and looks up the
    object URI in ``self._uris_by_external_id_by_type``.

    Yields:
        ``(subject, predicate, object_uri)`` for every entry whose target is found.
        Entries whose lookup raises ``KeyError`` are skipped with a
        ``NeatValueWarning``.
    """
    pending_triples = self._relationship_subject_predicate_type_external_id
    for subject, predicate, type_, external_id in pending_triples:
        try:
            # Both steps stay inside the ``try`` so a KeyError from either one
            # ends up in the same warning branch.
            uris_by_external_id = self._uris_by_external_id_by_type[InstanceIdPrefix.from_str(type_)]
            target_uri = uris_by_external_id[external_id]
        except KeyError:
            warnings.warn(NeatValueWarning(f"Missing externalId {external_id} for {type_}"), stacklevel=2)
            continue
        yield subject, predicate, target_uri
|
|
330
|
+
|
|
202
331
|
def _extract_labels(self):
|
|
203
332
|
for chunk in self._chunk(list(self._labels), description="Extracting labels"):
|
|
204
333
|
label_iterator = self._client.labels.retrieve(external_id=list(chunk), ignore_unknown_ids=True)
|
|
@@ -208,6 +337,11 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
208
337
|
for chunk in self._chunk(list(self._data_set_ids), description="Extracting data sets"):
|
|
209
338
|
data_set_iterator = self._client.data_sets.retrieve_multiple(ids=list(chunk), ignore_unknown_ids=True)
|
|
210
339
|
yield from DataSetExtractor(data_set_iterator, **self._extractor_args).extract()
|
|
340
|
+
for chunk in self._chunk(list(self._data_set_external_ids), description="Extracting data sets"):
|
|
341
|
+
data_set_iterator = self._client.data_sets.retrieve_multiple(
|
|
342
|
+
external_ids=list(chunk), ignore_unknown_ids=True
|
|
343
|
+
)
|
|
344
|
+
yield from DataSetExtractor(data_set_iterator, **self._extractor_args).extract()
|
|
211
345
|
|
|
212
346
|
def _extract_with_logging_label_dataset(
|
|
213
347
|
self, extractor: ClassicCDFBaseExtractor, resource_type: InstanceIdPrefix | None = None
|
|
@@ -217,10 +351,12 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
217
351
|
self._source_external_ids_by_type[resource_type].add(remove_namespace_from_uri(triple[2]))
|
|
218
352
|
elif triple[1] == self._namespace.labels:
|
|
219
353
|
self._labels.add(remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.label))
|
|
220
|
-
elif triple[1] == self._namespace.
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
354
|
+
elif triple[1] == self._namespace.dataSetId:
|
|
355
|
+
identifier = remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.data_set)
|
|
356
|
+
try:
|
|
357
|
+
self._data_set_ids.add(int(identifier))
|
|
358
|
+
except ValueError:
|
|
359
|
+
self._data_set_external_ids.add(identifier)
|
|
224
360
|
yield triple
|
|
225
361
|
|
|
226
362
|
@staticmethod
|
|
@@ -230,3 +366,11 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
230
366
|
return iterate_progress_bar(to_iterate, (len(items) // 1_000) + 1, description)
|
|
231
367
|
else:
|
|
232
368
|
return to_iterate
|
|
369
|
+
|
|
370
|
+
def _lookup_dataset(self, dataset_id: int) -> str:
    """Return the external id of the data set with the given internal id.

    Successful lookups are stored in ``self._dataset_external_ids_by_id`` so the
    client is only asked once per data set. Failed lookups are not stored.

    Args:
        dataset_id: Internal id of the data set to look up.

    Returns:
        The external id of the data set.

    Raises:
        KeyError: If the client returns no data set for ``dataset_id``, or if the
            returned data set has no external id.
    """
    known_external_ids = self._dataset_external_ids_by_id
    if dataset_id in known_external_ids:
        return known_external_ids[dataset_id]

    dataset = self._client.data_sets.retrieve(id=dataset_id)
    if not dataset:
        raise KeyError(f"Could not find dataset with id {dataset_id}.")
    if not dataset.external_id:
        # The data set exists but cannot be referenced by external id; report that
        # explicitly instead of claiming the data set was not found.
        raise KeyError(f"Dataset with id {dataset_id} has no external id.")

    known_external_ids[dataset_id] = dataset.external_id
    return known_external_ids[dataset_id]
|