cognite-neat 0.107.0__py3-none-any.whl → 0.109.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cognite-neat might be problematic. Click here for more details.
- cognite/neat/_constants.py +35 -1
- cognite/neat/_graph/_shared.py +4 -0
- cognite/neat/_graph/extractors/_classic_cdf/_base.py +115 -14
- cognite/neat/_graph/extractors/_classic_cdf/_classic.py +87 -6
- cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +48 -12
- cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +19 -1
- cognite/neat/_graph/extractors/_dms.py +162 -47
- cognite/neat/_graph/extractors/_dms_graph.py +54 -4
- cognite/neat/_graph/extractors/_mock_graph_generator.py +1 -1
- cognite/neat/_graph/extractors/_rdf_file.py +3 -2
- cognite/neat/_graph/loaders/__init__.py +1 -3
- cognite/neat/_graph/loaders/_rdf2dms.py +20 -10
- cognite/neat/_graph/queries/_base.py +144 -84
- cognite/neat/_graph/queries/_construct.py +1 -1
- cognite/neat/_graph/transformers/__init__.py +3 -1
- cognite/neat/_graph/transformers/_base.py +4 -4
- cognite/neat/_graph/transformers/_classic_cdf.py +13 -13
- cognite/neat/_graph/transformers/_prune_graph.py +3 -3
- cognite/neat/_graph/transformers/_rdfpath.py +3 -4
- cognite/neat/_graph/transformers/_value_type.py +71 -13
- cognite/neat/_issues/errors/__init__.py +2 -0
- cognite/neat/_issues/errors/_external.py +8 -0
- cognite/neat/_issues/errors/_resources.py +1 -1
- cognite/neat/_issues/warnings/__init__.py +0 -2
- cognite/neat/_issues/warnings/_models.py +1 -1
- cognite/neat/_issues/warnings/_properties.py +0 -8
- cognite/neat/_issues/warnings/_resources.py +1 -1
- cognite/neat/_rules/catalog/classic_model.xlsx +0 -0
- cognite/neat/_rules/exporters/_rules2instance_template.py +3 -3
- cognite/neat/_rules/exporters/_rules2yaml.py +1 -1
- cognite/neat/_rules/importers/__init__.py +3 -1
- cognite/neat/_rules/importers/_dtdl2rules/spec.py +1 -2
- cognite/neat/_rules/importers/_rdf/__init__.py +2 -2
- cognite/neat/_rules/importers/_rdf/_base.py +2 -2
- cognite/neat/_rules/importers/_rdf/_inference2rules.py +310 -26
- cognite/neat/_rules/models/_base_rules.py +22 -11
- cognite/neat/_rules/models/dms/_exporter.py +5 -4
- cognite/neat/_rules/models/dms/_rules.py +1 -8
- cognite/neat/_rules/models/dms/_rules_input.py +4 -0
- cognite/neat/_rules/models/information/_rules_input.py +5 -0
- cognite/neat/_rules/transformers/__init__.py +10 -3
- cognite/neat/_rules/transformers/_base.py +6 -1
- cognite/neat/_rules/transformers/_converters.py +530 -364
- cognite/neat/_rules/transformers/_mapping.py +4 -4
- cognite/neat/_session/_base.py +100 -47
- cognite/neat/_session/_create.py +133 -0
- cognite/neat/_session/_drop.py +60 -2
- cognite/neat/_session/_fix.py +28 -0
- cognite/neat/_session/_inspect.py +22 -7
- cognite/neat/_session/_mapping.py +8 -8
- cognite/neat/_session/_prepare.py +3 -247
- cognite/neat/_session/_read.py +138 -17
- cognite/neat/_session/_set.py +50 -1
- cognite/neat/_session/_show.py +16 -43
- cognite/neat/_session/_state.py +53 -52
- cognite/neat/_session/_to.py +11 -4
- cognite/neat/_session/_wizard.py +1 -1
- cognite/neat/_session/exceptions.py +8 -1
- cognite/neat/_store/_graph_store.py +301 -146
- cognite/neat/_store/_provenance.py +36 -20
- cognite/neat/_store/_rules_store.py +253 -267
- cognite/neat/_store/exceptions.py +40 -4
- cognite/neat/_utils/auth.py +5 -3
- cognite/neat/_version.py +1 -1
- {cognite_neat-0.107.0.dist-info → cognite_neat-0.109.0.dist-info}/METADATA +1 -1
- {cognite_neat-0.107.0.dist-info → cognite_neat-0.109.0.dist-info}/RECORD +69 -67
- {cognite_neat-0.107.0.dist-info → cognite_neat-0.109.0.dist-info}/LICENSE +0 -0
- {cognite_neat-0.107.0.dist-info → cognite_neat-0.109.0.dist-info}/WHEEL +0 -0
- {cognite_neat-0.107.0.dist-info → cognite_neat-0.109.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import urllib.parse
|
|
2
|
-
from collections.abc import Iterable, Iterator
|
|
2
|
+
from collections.abc import Iterable, Iterator, Set
|
|
3
|
+
from functools import cached_property
|
|
3
4
|
from typing import cast
|
|
4
5
|
|
|
5
6
|
from cognite.client import CogniteClient
|
|
@@ -9,34 +10,48 @@ from cognite.client.data_classes.data_modeling.instances import Instance, Proper
|
|
|
9
10
|
from cognite.client.utils.useful_types import SequenceNotStr
|
|
10
11
|
from rdflib import RDF, XSD, Literal, Namespace, URIRef
|
|
11
12
|
|
|
12
|
-
from cognite.neat.
|
|
13
|
+
from cognite.neat._config import GLOBAL_CONFIG
|
|
14
|
+
from cognite.neat._constants import DEFAULT_SPACE_URI, is_readonly_property
|
|
13
15
|
from cognite.neat._issues.errors import ResourceRetrievalError
|
|
14
16
|
from cognite.neat._shared import Triple
|
|
17
|
+
from cognite.neat._utils.auxiliary import string_to_ideal_type
|
|
18
|
+
from cognite.neat._utils.collection_ import iterate_progress_bar
|
|
15
19
|
|
|
16
20
|
from ._base import BaseExtractor
|
|
17
21
|
|
|
22
|
+
DEFAULT_EMPTY_VALUES = frozenset({"nan", "null", "none", "", " ", "nil", "n/a", "na", "unknown", "undefined"})
|
|
23
|
+
|
|
18
24
|
|
|
19
25
|
class DMSExtractor(BaseExtractor):
|
|
20
26
|
"""Extract data from Cognite Data Fusion DMS instances into Neat.
|
|
21
27
|
|
|
22
28
|
Args:
|
|
23
|
-
|
|
24
|
-
|
|
29
|
+
total_instances_pair_by_view: A dictionary where the key is the view id and the value is a tuple with the total
|
|
30
|
+
number of instances and an iterable of instances.
|
|
25
31
|
limit: The maximum number of items to extract.
|
|
26
32
|
overwrite_namespace: If provided, this will overwrite the space of the extracted items.
|
|
33
|
+
unpack_json: If True, JSON objects will be unpacked into RDF literals.
|
|
34
|
+
empty_values: If unpack_json is True, when unpacking JSON objects, if a key has a value in this set, it will be
|
|
35
|
+
considered as an empty value and skipped.
|
|
36
|
+
str_to_ideal_type: If unpack_json is True, when unpacking JSON objects, if the value is a string, the extractor
|
|
37
|
+
will try to convert it to the ideal type.
|
|
27
38
|
"""
|
|
28
39
|
|
|
29
40
|
def __init__(
|
|
30
41
|
self,
|
|
31
|
-
|
|
32
|
-
total: int | None = None,
|
|
42
|
+
total_instances_pair_by_view: dict[dm.ViewId, tuple[int | None, Iterable[Instance]]],
|
|
33
43
|
limit: int | None = None,
|
|
34
44
|
overwrite_namespace: Namespace | None = None,
|
|
45
|
+
unpack_json: bool = False,
|
|
46
|
+
empty_values: Set[str] = DEFAULT_EMPTY_VALUES,
|
|
47
|
+
str_to_ideal_type: bool = False,
|
|
35
48
|
) -> None:
|
|
36
|
-
self.
|
|
37
|
-
self.total = total
|
|
49
|
+
self.total_instances_pair_by_view = total_instances_pair_by_view
|
|
38
50
|
self.limit = limit
|
|
39
51
|
self.overwrite_namespace = overwrite_namespace
|
|
52
|
+
self.unpack_json = unpack_json
|
|
53
|
+
self.empty_values = empty_values
|
|
54
|
+
self.str_to_ideal_type = str_to_ideal_type
|
|
40
55
|
|
|
41
56
|
@classmethod
|
|
42
57
|
def from_data_model(
|
|
@@ -46,6 +61,8 @@ class DMSExtractor(BaseExtractor):
|
|
|
46
61
|
limit: int | None = None,
|
|
47
62
|
overwrite_namespace: Namespace | None = None,
|
|
48
63
|
instance_space: str | SequenceNotStr[str] | None = None,
|
|
64
|
+
unpack_json: bool = False,
|
|
65
|
+
str_to_ideal_type: bool = False,
|
|
49
66
|
) -> "DMSExtractor":
|
|
50
67
|
"""Create an extractor from a data model.
|
|
51
68
|
|
|
@@ -55,11 +72,20 @@ class DMSExtractor(BaseExtractor):
|
|
|
55
72
|
limit: The maximum number of instances to extract.
|
|
56
73
|
overwrite_namespace: If provided, this will overwrite the space of the extracted items.
|
|
57
74
|
instance_space: The space to extract instances from.
|
|
75
|
+
unpack_json: If True, JSON objects will be unpacked into RDF literals.
|
|
58
76
|
"""
|
|
59
77
|
retrieved = client.data_modeling.data_models.retrieve(data_model, inline_views=True)
|
|
60
78
|
if not retrieved:
|
|
61
79
|
raise ResourceRetrievalError(dm.DataModelId.load(data_model), "data model", "Data Model is missing in CDF")
|
|
62
|
-
return cls.from_views(
|
|
80
|
+
return cls.from_views(
|
|
81
|
+
client,
|
|
82
|
+
retrieved.latest_version().views,
|
|
83
|
+
limit,
|
|
84
|
+
overwrite_namespace,
|
|
85
|
+
instance_space,
|
|
86
|
+
unpack_json,
|
|
87
|
+
str_to_ideal_type,
|
|
88
|
+
)
|
|
63
89
|
|
|
64
90
|
@classmethod
|
|
65
91
|
def from_views(
|
|
@@ -69,6 +95,8 @@ class DMSExtractor(BaseExtractor):
|
|
|
69
95
|
limit: int | None = None,
|
|
70
96
|
overwrite_namespace: Namespace | None = None,
|
|
71
97
|
instance_space: str | SequenceNotStr[str] | None = None,
|
|
98
|
+
unpack_json: bool = False,
|
|
99
|
+
str_to_ideal_type: bool = False,
|
|
72
100
|
) -> "DMSExtractor":
|
|
73
101
|
"""Create an extractor from a set of views.
|
|
74
102
|
|
|
@@ -78,19 +106,43 @@ class DMSExtractor(BaseExtractor):
|
|
|
78
106
|
limit: The maximum number of instances to extract.
|
|
79
107
|
overwrite_namespace: If provided, this will overwrite the space of the extracted items.
|
|
80
108
|
instance_space: The space to extract instances from.
|
|
109
|
+
unpack_json: If True, JSON objects will be unpacked into RDF literals.
|
|
110
|
+
str_to_ideal_type: If True, when unpacking JSON objects, if the value is a string, the extractor will try to
|
|
111
|
+
convert it to the ideal type.
|
|
81
112
|
"""
|
|
113
|
+
total_instances_pair_by_view: dict[dm.ViewId, tuple[int | None, Iterable[Instance]]] = {}
|
|
114
|
+
for view in views:
|
|
115
|
+
instance_iterator = _ViewInstanceIterator(client, view, instance_space)
|
|
116
|
+
total_instances_pair_by_view[view.as_id()] = (instance_iterator.count, instance_iterator)
|
|
117
|
+
|
|
82
118
|
return cls(
|
|
83
|
-
|
|
84
|
-
total=None,
|
|
119
|
+
total_instances_pair_by_view=total_instances_pair_by_view,
|
|
85
120
|
limit=limit,
|
|
86
121
|
overwrite_namespace=overwrite_namespace,
|
|
122
|
+
unpack_json=unpack_json,
|
|
123
|
+
str_to_ideal_type=str_to_ideal_type,
|
|
87
124
|
)
|
|
88
125
|
|
|
89
126
|
def extract(self) -> Iterable[Triple]:
|
|
90
|
-
for
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
127
|
+
total_instances = sum(total for total, _ in self.total_instances_pair_by_view.values() if total is not None)
|
|
128
|
+
use_progress_bar = (
|
|
129
|
+
GLOBAL_CONFIG.use_iterate_bar_threshold and total_instances > GLOBAL_CONFIG.use_iterate_bar_threshold
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
for view_id, (total, instances) in self.total_instances_pair_by_view.items():
|
|
133
|
+
if total == 0:
|
|
134
|
+
continue
|
|
135
|
+
if use_progress_bar and total is not None:
|
|
136
|
+
instances = iterate_progress_bar(
|
|
137
|
+
instances,
|
|
138
|
+
total,
|
|
139
|
+
f"Extracting instances from {view_id.space}:{view_id.external_id}(version={view_id.version})",
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
for count, item in enumerate(instances, 1):
|
|
143
|
+
if self.limit and count > self.limit:
|
|
144
|
+
break
|
|
145
|
+
yield from self._extract_instance(item)
|
|
94
146
|
|
|
95
147
|
def _extract_instance(self, instance: Instance) -> Iterable[Triple]:
|
|
96
148
|
if isinstance(instance, dm.Edge):
|
|
@@ -105,7 +157,6 @@ class DMSExtractor(BaseExtractor):
|
|
|
105
157
|
# If the edge has properties, we create a node for the edge and connect it to the start and end nodes.
|
|
106
158
|
id_ = self._as_uri_ref(instance)
|
|
107
159
|
yield id_, RDF.type, self._as_uri_ref(instance.type)
|
|
108
|
-
yield id_, RDF.type, self._get_namespace(instance.space).Edge
|
|
109
160
|
yield (
|
|
110
161
|
id_,
|
|
111
162
|
self._as_uri_ref(dm.DirectRelationReference(instance.space, "startNode")),
|
|
@@ -121,6 +172,9 @@ class DMSExtractor(BaseExtractor):
|
|
|
121
172
|
id_ = self._as_uri_ref(instance)
|
|
122
173
|
if instance.type:
|
|
123
174
|
type_ = self._as_uri_ref(cast(dm.DirectRelationReference, instance.type))
|
|
175
|
+
elif len(instance.properties) == 1:
|
|
176
|
+
view_id = next(iter(instance.properties.keys()))
|
|
177
|
+
type_ = self._get_namespace(view_id.space)[urllib.parse.quote(view_id.external_id)]
|
|
124
178
|
else:
|
|
125
179
|
type_ = self._get_namespace(instance.space).Node
|
|
126
180
|
|
|
@@ -135,20 +189,38 @@ class DMSExtractor(BaseExtractor):
|
|
|
135
189
|
for view_id, properties in instance.properties.items():
|
|
136
190
|
namespace = self._get_namespace(view_id.space)
|
|
137
191
|
for key, value in properties.items():
|
|
138
|
-
for object_ in self.
|
|
139
|
-
yield id_, namespace[
|
|
192
|
+
for predicate_str, object_ in self._get_predicate_objects_pair(key, value):
|
|
193
|
+
yield id_, namespace[urllib.parse.quote(predicate_str)], object_
|
|
140
194
|
|
|
141
|
-
def
|
|
195
|
+
def _get_predicate_objects_pair(self, key: str, value: PropertyValue) -> Iterable[tuple[str, Literal | URIRef]]:
|
|
142
196
|
if isinstance(value, str | float | bool | int):
|
|
143
|
-
yield Literal(value)
|
|
197
|
+
yield key, Literal(value)
|
|
144
198
|
elif isinstance(value, dict) and "space" in value and "externalId" in value:
|
|
145
|
-
yield self._as_uri_ref(dm.DirectRelationReference.load(value))
|
|
199
|
+
yield key, self._as_uri_ref(dm.DirectRelationReference.load(value))
|
|
200
|
+
elif isinstance(value, dict) and self.unpack_json:
|
|
201
|
+
for sub_key, sub_value in value.items():
|
|
202
|
+
if isinstance(sub_value, str):
|
|
203
|
+
if sub_value.casefold() in self.empty_values:
|
|
204
|
+
continue
|
|
205
|
+
if self.str_to_ideal_type:
|
|
206
|
+
yield sub_key, Literal(string_to_ideal_type(sub_value))
|
|
207
|
+
else:
|
|
208
|
+
yield sub_key, Literal(sub_value)
|
|
209
|
+
elif isinstance(sub_value, int | float | bool):
|
|
210
|
+
yield sub_key, Literal(sub_value)
|
|
211
|
+
elif isinstance(sub_value, dict):
|
|
212
|
+
yield from self._get_predicate_objects_pair(f"{key}_{sub_key}", sub_value)
|
|
213
|
+
elif isinstance(sub_value, list):
|
|
214
|
+
for item in sub_value:
|
|
215
|
+
yield from self._get_predicate_objects_pair(f"{key}_{sub_key}", item)
|
|
216
|
+
else:
|
|
217
|
+
yield sub_key, Literal(str(sub_value))
|
|
146
218
|
elif isinstance(value, dict):
|
|
147
219
|
# This object is a json object.
|
|
148
|
-
yield Literal(str(value), datatype=XSD._NS["json"])
|
|
220
|
+
yield key, Literal(str(value), datatype=XSD._NS["json"])
|
|
149
221
|
elif isinstance(value, list):
|
|
150
222
|
for item in value:
|
|
151
|
-
yield from self.
|
|
223
|
+
yield from self._get_predicate_objects_pair(key, item)
|
|
152
224
|
|
|
153
225
|
def _as_uri_ref(self, instance: Instance | dm.DirectRelationReference) -> URIRef:
|
|
154
226
|
return self._get_namespace(instance.space)[urllib.parse.quote(instance.external_id)]
|
|
@@ -159,34 +231,77 @@ class DMSExtractor(BaseExtractor):
|
|
|
159
231
|
return Namespace(DEFAULT_SPACE_URI.format(space=urllib.parse.quote(space)))
|
|
160
232
|
|
|
161
233
|
|
|
162
|
-
class
|
|
163
|
-
def __init__(
|
|
164
|
-
self, client: CogniteClient, views: Iterable[dm.View], instance_space: str | SequenceNotStr[str] | None = None
|
|
165
|
-
):
|
|
234
|
+
class _ViewInstanceIterator(Iterable[Instance]):
|
|
235
|
+
def __init__(self, client: CogniteClient, view: dm.View, instance_space: str | SequenceNotStr[str] | None = None):
|
|
166
236
|
self.client = client
|
|
167
|
-
self.
|
|
237
|
+
self.view = view
|
|
168
238
|
self.instance_space = instance_space
|
|
169
239
|
|
|
240
|
+
@cached_property
|
|
241
|
+
def count(self) -> int:
|
|
242
|
+
node_count = edge_count = 0
|
|
243
|
+
if self.view.used_for in ("node", "all"):
|
|
244
|
+
node_count = int(
|
|
245
|
+
self.client.data_modeling.instances.aggregate(
|
|
246
|
+
view=self.view.as_id(),
|
|
247
|
+
aggregates=dm.aggregations.Count("externalId"),
|
|
248
|
+
instance_type="node",
|
|
249
|
+
space=self.instance_space,
|
|
250
|
+
).value
|
|
251
|
+
)
|
|
252
|
+
if self.view.used_for in ("edge", "all"):
|
|
253
|
+
edge_count = int(
|
|
254
|
+
self.client.data_modeling.instances.aggregate(
|
|
255
|
+
view=self.view.as_id(),
|
|
256
|
+
aggregates=dm.aggregations.Count("externalId"),
|
|
257
|
+
instance_type="edge",
|
|
258
|
+
space=self.instance_space,
|
|
259
|
+
).value
|
|
260
|
+
)
|
|
261
|
+
return node_count + edge_count
|
|
262
|
+
|
|
170
263
|
def __iter__(self) -> Iterator[Instance]:
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
264
|
+
view_id = self.view.as_id()
|
|
265
|
+
read_only_properties = {
|
|
266
|
+
prop_id
|
|
267
|
+
for prop_id, prop in self.view.properties.items()
|
|
268
|
+
if isinstance(prop, dm.MappedProperty)
|
|
269
|
+
and is_readonly_property(prop.container, prop.container_property_identifier)
|
|
270
|
+
}
|
|
271
|
+
# All nodes and edges with properties
|
|
272
|
+
if self.view.used_for in ("node", "all"):
|
|
273
|
+
node_iterable: Iterable[Instance] = self.client.data_modeling.instances(
|
|
274
|
+
chunk_size=None, instance_type="node", sources=[view_id], space=self.instance_space
|
|
275
|
+
)
|
|
276
|
+
if read_only_properties:
|
|
277
|
+
node_iterable = self._remove_read_only_properties(node_iterable, read_only_properties, view_id)
|
|
278
|
+
yield from node_iterable
|
|
279
|
+
|
|
280
|
+
if self.view.used_for in ("edge", "all"):
|
|
281
|
+
yield from self.client.data_modeling.instances(
|
|
282
|
+
chunk_size=None, instance_type="edge", sources=[view_id], space=self.instance_space
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
for prop in self.view.properties.values():
|
|
286
|
+
if isinstance(prop, dm.EdgeConnection):
|
|
287
|
+
if prop.edge_source:
|
|
288
|
+
# All edges with properties are extracted from the edge source
|
|
289
|
+
continue
|
|
179
290
|
yield from self.client.data_modeling.instances(
|
|
180
|
-
chunk_size=None,
|
|
291
|
+
chunk_size=None,
|
|
292
|
+
instance_type="edge",
|
|
293
|
+
filter=dm.filters.Equals(
|
|
294
|
+
["edge", "type"], {"space": prop.type.space, "externalId": prop.type.external_id}
|
|
295
|
+
),
|
|
296
|
+
space=self.instance_space,
|
|
181
297
|
)
|
|
182
298
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
)
|
|
299
|
+
@staticmethod
|
|
300
|
+
def _remove_read_only_properties(
|
|
301
|
+
nodes: Iterable[Instance], read_only_properties: Set[str], view_id: dm.ViewId
|
|
302
|
+
) -> Iterable[Instance]:
|
|
303
|
+
for node in nodes:
|
|
304
|
+
if properties := node.properties.get(view_id):
|
|
305
|
+
for read_only in read_only_properties:
|
|
306
|
+
properties.pop(read_only, None)
|
|
307
|
+
yield node
|
|
@@ -6,11 +6,12 @@ from cognite.client.utils.useful_types import SequenceNotStr
|
|
|
6
6
|
from rdflib import Namespace, URIRef
|
|
7
7
|
|
|
8
8
|
from cognite.neat._client import NeatClient
|
|
9
|
-
from cognite.neat._constants import DEFAULT_NAMESPACE
|
|
9
|
+
from cognite.neat._constants import COGNITE_SPACES, DEFAULT_NAMESPACE
|
|
10
10
|
from cognite.neat._issues import IssueList, NeatIssue, catch_warnings
|
|
11
11
|
from cognite.neat._issues.warnings import CDFAuthWarning, ResourceNotFoundWarning, ResourceRetrievalWarning
|
|
12
12
|
from cognite.neat._rules.importers import DMSImporter
|
|
13
13
|
from cognite.neat._rules.models import DMSRules, InformationRules
|
|
14
|
+
from cognite.neat._rules.models.data_types import Json
|
|
14
15
|
from cognite.neat._rules.transformers import DMSToInformation, VerifyDMSRules
|
|
15
16
|
from cognite.neat._shared import Triple
|
|
16
17
|
|
|
@@ -26,12 +27,18 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
|
|
|
26
27
|
namespace: Namespace = DEFAULT_NAMESPACE,
|
|
27
28
|
issues: Sequence[NeatIssue] | None = None,
|
|
28
29
|
instance_space: str | SequenceNotStr[str] | None = None,
|
|
30
|
+
skip_cognite_views: bool = True,
|
|
31
|
+
unpack_json: bool = False,
|
|
32
|
+
str_to_ideal_type: bool = False,
|
|
29
33
|
) -> None:
|
|
30
34
|
self._client = client
|
|
31
35
|
self._data_model = data_model
|
|
32
36
|
self._namespace = namespace or DEFAULT_NAMESPACE
|
|
33
37
|
self._issues = IssueList(issues)
|
|
34
38
|
self._instance_space = instance_space
|
|
39
|
+
self._skip_cognite_views = skip_cognite_views
|
|
40
|
+
self._unpack_json = unpack_json
|
|
41
|
+
self._str_to_ideal_type = str_to_ideal_type
|
|
35
42
|
|
|
36
43
|
self._views: list[dm.View] | None = None
|
|
37
44
|
self._information_rules: InformationRules | None = None
|
|
@@ -44,6 +51,9 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
|
|
|
44
51
|
client: NeatClient,
|
|
45
52
|
namespace: Namespace = DEFAULT_NAMESPACE,
|
|
46
53
|
instance_space: str | SequenceNotStr[str] | None = None,
|
|
54
|
+
skip_cognite_views: bool = True,
|
|
55
|
+
unpack_json: bool = False,
|
|
56
|
+
str_to_ideal_type: bool = False,
|
|
47
57
|
) -> "DMSGraphExtractor":
|
|
48
58
|
issues: list[NeatIssue] = []
|
|
49
59
|
try:
|
|
@@ -51,14 +61,37 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
|
|
|
51
61
|
except CogniteAPIError as e:
|
|
52
62
|
issues.append(CDFAuthWarning("retrieving data model", str(e)))
|
|
53
63
|
return cls(
|
|
54
|
-
cls._create_empty_model(dm.DataModelId.load(data_model_id)),
|
|
64
|
+
cls._create_empty_model(dm.DataModelId.load(data_model_id)),
|
|
65
|
+
client,
|
|
66
|
+
namespace,
|
|
67
|
+
issues,
|
|
68
|
+
instance_space,
|
|
69
|
+
skip_cognite_views,
|
|
70
|
+
unpack_json,
|
|
71
|
+
str_to_ideal_type,
|
|
55
72
|
)
|
|
56
73
|
if not data_model:
|
|
57
74
|
issues.append(ResourceRetrievalWarning(frozenset({data_model_id}), "data model"))
|
|
58
75
|
return cls(
|
|
59
|
-
cls._create_empty_model(dm.DataModelId.load(data_model_id)),
|
|
76
|
+
cls._create_empty_model(dm.DataModelId.load(data_model_id)),
|
|
77
|
+
client,
|
|
78
|
+
namespace,
|
|
79
|
+
issues,
|
|
80
|
+
instance_space,
|
|
81
|
+
skip_cognite_views,
|
|
82
|
+
unpack_json,
|
|
83
|
+
str_to_ideal_type,
|
|
60
84
|
)
|
|
61
|
-
return cls(
|
|
85
|
+
return cls(
|
|
86
|
+
data_model.latest_version(),
|
|
87
|
+
client,
|
|
88
|
+
namespace,
|
|
89
|
+
issues,
|
|
90
|
+
instance_space,
|
|
91
|
+
skip_cognite_views,
|
|
92
|
+
unpack_json,
|
|
93
|
+
str_to_ideal_type,
|
|
94
|
+
)
|
|
62
95
|
|
|
63
96
|
@classmethod
|
|
64
97
|
def _create_empty_model(cls, data_model_id: dm.DataModelId) -> dm.DataModel:
|
|
@@ -92,11 +125,16 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
|
|
|
92
125
|
def extract(self) -> Iterable[Triple]:
|
|
93
126
|
"""Extracts the knowledge graph from the data model."""
|
|
94
127
|
views = self._model_views
|
|
128
|
+
if self._skip_cognite_views:
|
|
129
|
+
views = [view for view in views if view.space not in COGNITE_SPACES]
|
|
130
|
+
|
|
95
131
|
yield from DMSExtractor.from_views(
|
|
96
132
|
self._client,
|
|
97
133
|
views,
|
|
98
134
|
overwrite_namespace=self._namespace,
|
|
99
135
|
instance_space=self._instance_space,
|
|
136
|
+
unpack_json=self._unpack_json,
|
|
137
|
+
str_to_ideal_type=self._str_to_ideal_type,
|
|
100
138
|
).extract()
|
|
101
139
|
|
|
102
140
|
def _get_views(self) -> list[dm.View]:
|
|
@@ -141,6 +179,18 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
|
|
|
141
179
|
# The DMS and Information rules must be created together to link them properly.
|
|
142
180
|
importer = DMSImporter.from_data_model(self._client, self._data_model)
|
|
143
181
|
unverified_dms = importer.to_rules()
|
|
182
|
+
if self._unpack_json and (dms_rules := unverified_dms.rules):
|
|
183
|
+
# Drop the JSON properties from the DMS rules as these are no longer valid.
|
|
184
|
+
json_name = Json().name # To avoid instantiating Json multiple times.
|
|
185
|
+
dms_rules.properties = [
|
|
186
|
+
prop
|
|
187
|
+
for prop in dms_rules.properties
|
|
188
|
+
if not (
|
|
189
|
+
isinstance(prop.value_type, Json)
|
|
190
|
+
or (isinstance(prop.value_type, str) and prop.value_type == json_name)
|
|
191
|
+
)
|
|
192
|
+
]
|
|
193
|
+
|
|
144
194
|
with catch_warnings() as issues:
|
|
145
195
|
# Any errors that occur will be raised and caught outside the extractor.
|
|
146
196
|
verified_dms = VerifyDMSRules(client=self._client).transform(unverified_dms)
|
|
@@ -141,7 +141,7 @@ def generate_triples(
|
|
|
141
141
|
|
|
142
142
|
# pregenerate instance ids for each remaining class
|
|
143
143
|
instance_ids = {
|
|
144
|
-
key: [URIRef(namespace[f"{key.suffix}-{i+1}"]) for i in range(value)] for key, value in class_count.items()
|
|
144
|
+
key: [URIRef(namespace[f"{key.suffix}-{i + 1}"]) for i in range(value)] for key, value in class_count.items()
|
|
145
145
|
}
|
|
146
146
|
|
|
147
147
|
# create triple for each class instance defining its type
|
|
@@ -36,7 +36,6 @@ class RdfFileExtractor(BaseExtractor):
|
|
|
36
36
|
|
|
37
37
|
self.format = guess_format(str(self.filepath) if isinstance(self.filepath, Path) else self.filepath.name)
|
|
38
38
|
|
|
39
|
-
print(self.format)
|
|
40
39
|
if isinstance(self.filepath, Path) and not self.filepath.exists():
|
|
41
40
|
self.issue_list.append(FileNotFoundNeatError(self.filepath))
|
|
42
41
|
|
|
@@ -55,7 +54,7 @@ class RdfFileExtractor(BaseExtractor):
|
|
|
55
54
|
def from_zip(
|
|
56
55
|
cls,
|
|
57
56
|
filepath: Path,
|
|
58
|
-
filename: str = "neat-session/instances/instances.
|
|
57
|
+
filename: str = "neat-session/instances/instances.trig",
|
|
59
58
|
base_uri: URIRef = DEFAULT_BASE_URI,
|
|
60
59
|
issue_list: IssueList | None = None,
|
|
61
60
|
):
|
|
@@ -69,6 +68,8 @@ class RdfFileExtractor(BaseExtractor):
|
|
|
69
68
|
if file_info.filename == filename:
|
|
70
69
|
# We need to open the file in the zip file, and close it upon
|
|
71
70
|
# triple extraction ...
|
|
71
|
+
|
|
72
|
+
print(file_info)
|
|
72
73
|
file = zip_ref.open(file_info)
|
|
73
74
|
return cls(cast(zipfile.ZipExtFile, file), base_uri, issue_list)
|
|
74
75
|
|
|
@@ -20,6 +20,4 @@ def _repr_html_() -> str:
|
|
|
20
20
|
]
|
|
21
21
|
)._repr_html_()
|
|
22
22
|
|
|
23
|
-
return
|
|
24
|
-
"<strong>Loader</strong> A loader writes data from Neat's triple storage into a target system" f"<br />{table}"
|
|
25
|
-
)
|
|
23
|
+
return f"<strong>Loader</strong> A loader writes data from Neat's triple storage into a target system<br />{table}"
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
import json
|
|
3
|
+
import urllib.parse
|
|
3
4
|
import warnings
|
|
4
5
|
from collections import defaultdict
|
|
5
6
|
from collections.abc import Iterable, Sequence
|
|
@@ -70,6 +71,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
70
71
|
tracker: type[Tracker] | None = None,
|
|
71
72
|
rules: DMSRules | None = None,
|
|
72
73
|
client: NeatClient | None = None,
|
|
74
|
+
unquote_external_ids: bool = False,
|
|
73
75
|
):
|
|
74
76
|
super().__init__(graph_store)
|
|
75
77
|
self.data_model = data_model
|
|
@@ -79,6 +81,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
79
81
|
self._tracker: type[Tracker] = tracker or LogTracker
|
|
80
82
|
self.rules = rules
|
|
81
83
|
self._client = client
|
|
84
|
+
self._unquote_external_ids = unquote_external_ids
|
|
82
85
|
|
|
83
86
|
@classmethod
|
|
84
87
|
def from_data_model_id(
|
|
@@ -99,7 +102,12 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
99
102
|
|
|
100
103
|
@classmethod
|
|
101
104
|
def from_rules(
|
|
102
|
-
cls,
|
|
105
|
+
cls,
|
|
106
|
+
rules: DMSRules,
|
|
107
|
+
graph_store: NeatGraphStore,
|
|
108
|
+
instance_space: str,
|
|
109
|
+
client: NeatClient | None = None,
|
|
110
|
+
unquote_external_ids: bool = False,
|
|
103
111
|
) -> "DMSLoader":
|
|
104
112
|
issues: list[NeatIssue] = []
|
|
105
113
|
data_model: dm.DataModel[dm.View] | None = None
|
|
@@ -125,6 +133,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
125
133
|
issues,
|
|
126
134
|
rules=rules,
|
|
127
135
|
client=client,
|
|
136
|
+
unquote_external_ids=unquote_external_ids,
|
|
128
137
|
)
|
|
129
138
|
|
|
130
139
|
def _load(self, stop_on_exception: bool = False) -> Iterable[dm.InstanceApply | NeatIssue | type[_END_OF_CLASS]]:
|
|
@@ -142,7 +151,9 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
142
151
|
if self.rules and self.rules.metadata.logical
|
|
143
152
|
else None
|
|
144
153
|
)
|
|
154
|
+
|
|
145
155
|
view_and_count_by_id = self._select_views_with_instances(self.data_model.views)
|
|
156
|
+
|
|
146
157
|
if self._client:
|
|
147
158
|
view_and_count_by_id, properties_point_to_self = self._sort_by_direct_relation_dependencies(
|
|
148
159
|
view_and_count_by_id
|
|
@@ -425,20 +436,11 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
425
436
|
else:
|
|
426
437
|
raise ValueError(f"Expect valid JSON string or dict for {info.field_name}: {value}")
|
|
427
438
|
|
|
428
|
-
def parse_text(cls, value: Any, info: ValidationInfo) -> Any:
|
|
429
|
-
if isinstance(value, list):
|
|
430
|
-
return [remove_namespace_from_uri(v) for v in value]
|
|
431
|
-
else:
|
|
432
|
-
return remove_namespace_from_uri(value)
|
|
433
|
-
|
|
434
439
|
if json_fields:
|
|
435
440
|
validators["parse_json_string"] = field_validator(*json_fields, mode="before")(parse_json_string) # type: ignore[assignment, arg-type]
|
|
436
441
|
|
|
437
442
|
validators["parse_list"] = field_validator("*", mode="before")(parse_list) # type: ignore[assignment, arg-type]
|
|
438
443
|
|
|
439
|
-
if text_fields:
|
|
440
|
-
validators["parse_text"] = field_validator(*text_fields, mode="before")(parse_text) # type: ignore[assignment, arg-type]
|
|
441
|
-
|
|
442
444
|
if direct_relation_by_property:
|
|
443
445
|
|
|
444
446
|
def parse_direct_relation(cls, value: list, info: ValidationInfo) -> dict | list[dict]:
|
|
@@ -490,6 +492,8 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
490
492
|
) -> dm.InstanceApply:
|
|
491
493
|
type_ = properties.pop(RDF.type, [None])[0]
|
|
492
494
|
created = pydantic_cls.model_validate(properties)
|
|
495
|
+
if self._unquote_external_ids:
|
|
496
|
+
identifier = urllib.parse.unquote(identifier)
|
|
493
497
|
|
|
494
498
|
return dm.NodeApply(
|
|
495
499
|
space=self.instance_space,
|
|
@@ -514,6 +518,9 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
514
518
|
if type_ is None:
|
|
515
519
|
raise ValueError(f"Missing type for edge {identifier}")
|
|
516
520
|
|
|
521
|
+
if self._unquote_external_ids:
|
|
522
|
+
identifier = urllib.parse.unquote(identifier)
|
|
523
|
+
|
|
517
524
|
return dm.EdgeApply(
|
|
518
525
|
space=self.instance_space,
|
|
519
526
|
external_id=identifier,
|
|
@@ -550,6 +557,9 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
|
|
|
550
557
|
yield error
|
|
551
558
|
for target in values:
|
|
552
559
|
external_id = f"{identifier}.{prop_id}.{target}"
|
|
560
|
+
if self._unquote_external_ids:
|
|
561
|
+
external_id = urllib.parse.unquote(external_id)
|
|
562
|
+
|
|
553
563
|
yield dm.EdgeApply(
|
|
554
564
|
space=self.instance_space,
|
|
555
565
|
external_id=(external_id if len(external_id) < 256 else create_sha256_hash(external_id)),
|