cognite-neat 0.107.0__py3-none-any.whl → 0.108.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-neat might be problematic. Click here for more details.

Files changed (52)
  1. cognite/neat/_constants.py +35 -1
  2. cognite/neat/_graph/_shared.py +4 -0
  3. cognite/neat/_graph/extractors/_classic_cdf/_base.py +115 -14
  4. cognite/neat/_graph/extractors/_classic_cdf/_classic.py +83 -6
  5. cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +48 -12
  6. cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +19 -1
  7. cognite/neat/_graph/extractors/_dms.py +162 -47
  8. cognite/neat/_graph/extractors/_dms_graph.py +54 -4
  9. cognite/neat/_graph/extractors/_mock_graph_generator.py +1 -1
  10. cognite/neat/_graph/extractors/_rdf_file.py +3 -2
  11. cognite/neat/_graph/loaders/__init__.py +1 -3
  12. cognite/neat/_graph/loaders/_rdf2dms.py +20 -10
  13. cognite/neat/_graph/queries/_base.py +140 -84
  14. cognite/neat/_graph/queries/_construct.py +1 -1
  15. cognite/neat/_graph/transformers/__init__.py +3 -1
  16. cognite/neat/_graph/transformers/_value_type.py +54 -3
  17. cognite/neat/_issues/errors/_resources.py +1 -1
  18. cognite/neat/_issues/warnings/__init__.py +0 -2
  19. cognite/neat/_issues/warnings/_models.py +1 -1
  20. cognite/neat/_issues/warnings/_properties.py +0 -8
  21. cognite/neat/_rules/catalog/classic_model.xlsx +0 -0
  22. cognite/neat/_rules/exporters/_rules2instance_template.py +3 -3
  23. cognite/neat/_rules/importers/__init__.py +3 -1
  24. cognite/neat/_rules/importers/_dtdl2rules/spec.py +1 -2
  25. cognite/neat/_rules/importers/_rdf/__init__.py +2 -2
  26. cognite/neat/_rules/importers/_rdf/_base.py +2 -2
  27. cognite/neat/_rules/importers/_rdf/_inference2rules.py +241 -18
  28. cognite/neat/_rules/models/_base_rules.py +13 -3
  29. cognite/neat/_rules/models/dms/_rules.py +1 -8
  30. cognite/neat/_rules/models/dms/_rules_input.py +4 -0
  31. cognite/neat/_rules/models/information/_rules_input.py +5 -0
  32. cognite/neat/_rules/transformers/__init__.py +6 -0
  33. cognite/neat/_rules/transformers/_converters.py +98 -7
  34. cognite/neat/_session/_base.py +55 -4
  35. cognite/neat/_session/_drop.py +5 -1
  36. cognite/neat/_session/_inspect.py +3 -2
  37. cognite/neat/_session/_read.py +61 -14
  38. cognite/neat/_session/_set.py +27 -0
  39. cognite/neat/_session/_show.py +4 -4
  40. cognite/neat/_session/_state.py +8 -4
  41. cognite/neat/_session/_to.py +4 -1
  42. cognite/neat/_session/_wizard.py +1 -1
  43. cognite/neat/_session/exceptions.py +2 -1
  44. cognite/neat/_store/_graph_store.py +287 -133
  45. cognite/neat/_store/_rules_store.py +108 -1
  46. cognite/neat/_utils/auth.py +1 -1
  47. cognite/neat/_version.py +1 -1
  48. {cognite_neat-0.107.0.dist-info → cognite_neat-0.108.0.dist-info}/METADATA +1 -1
  49. {cognite_neat-0.107.0.dist-info → cognite_neat-0.108.0.dist-info}/RECORD +52 -52
  50. {cognite_neat-0.107.0.dist-info → cognite_neat-0.108.0.dist-info}/LICENSE +0 -0
  51. {cognite_neat-0.107.0.dist-info → cognite_neat-0.108.0.dist-info}/WHEEL +0 -0
  52. {cognite_neat-0.107.0.dist-info → cognite_neat-0.108.0.dist-info}/entry_points.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import urllib.parse
2
- from collections.abc import Iterable, Iterator
2
+ from collections.abc import Iterable, Iterator, Set
3
+ from functools import cached_property
3
4
  from typing import cast
4
5
 
5
6
  from cognite.client import CogniteClient
@@ -9,34 +10,48 @@ from cognite.client.data_classes.data_modeling.instances import Instance, Proper
9
10
  from cognite.client.utils.useful_types import SequenceNotStr
10
11
  from rdflib import RDF, XSD, Literal, Namespace, URIRef
11
12
 
12
- from cognite.neat._constants import DEFAULT_SPACE_URI
13
+ from cognite.neat._config import GLOBAL_CONFIG
14
+ from cognite.neat._constants import DEFAULT_SPACE_URI, is_readonly_property
13
15
  from cognite.neat._issues.errors import ResourceRetrievalError
14
16
  from cognite.neat._shared import Triple
17
+ from cognite.neat._utils.auxiliary import string_to_ideal_type
18
+ from cognite.neat._utils.collection_ import iterate_progress_bar
15
19
 
16
20
  from ._base import BaseExtractor
17
21
 
22
+ DEFAULT_EMPTY_VALUES = frozenset({"nan", "null", "none", "", " ", "nil", "n/a", "na", "unknown", "undefined"})
23
+
18
24
 
19
25
  class DMSExtractor(BaseExtractor):
20
26
  """Extract data from Cognite Data Fusion DMS instances into Neat.
21
27
 
22
28
  Args:
23
- items: The items to extract.
24
- total: The total number of items to extract. If provided, this will be used to estimate the progress.
29
+ total_instances_pair_by_view: A dictionary where the key is the view id and the value is a tuple with the total
30
+ number of instances and an iterable of instances.
25
31
  limit: The maximum number of items to extract.
26
32
  overwrite_namespace: If provided, this will overwrite the space of the extracted items.
33
+ unpack_json: If True, JSON objects will be unpacked into RDF literals.
34
+ empty_values: If unpack_json is True, when unpacking JSON objects, if a key has a value in this set, it will be
35
+ considered as an empty value and skipped.
36
+ str_to_ideal_type: If unpack_json is True, when unpacking JSON objects, if the value is a string, the extractor
37
+ will try to convert it to the ideal type.
27
38
  """
28
39
 
29
40
  def __init__(
30
41
  self,
31
- items: Iterable[Instance],
32
- total: int | None = None,
42
+ total_instances_pair_by_view: dict[dm.ViewId, tuple[int | None, Iterable[Instance]]],
33
43
  limit: int | None = None,
34
44
  overwrite_namespace: Namespace | None = None,
45
+ unpack_json: bool = False,
46
+ empty_values: Set[str] = DEFAULT_EMPTY_VALUES,
47
+ str_to_ideal_type: bool = False,
35
48
  ) -> None:
36
- self.items = items
37
- self.total = total
49
+ self.total_instances_pair_by_view = total_instances_pair_by_view
38
50
  self.limit = limit
39
51
  self.overwrite_namespace = overwrite_namespace
52
+ self.unpack_json = unpack_json
53
+ self.empty_values = empty_values
54
+ self.str_to_ideal_type = str_to_ideal_type
40
55
 
41
56
  @classmethod
42
57
  def from_data_model(
@@ -46,6 +61,8 @@ class DMSExtractor(BaseExtractor):
46
61
  limit: int | None = None,
47
62
  overwrite_namespace: Namespace | None = None,
48
63
  instance_space: str | SequenceNotStr[str] | None = None,
64
+ unpack_json: bool = False,
65
+ str_to_ideal_type: bool = False,
49
66
  ) -> "DMSExtractor":
50
67
  """Create an extractor from a data model.
51
68
 
@@ -55,11 +72,20 @@ class DMSExtractor(BaseExtractor):
55
72
  limit: The maximum number of instances to extract.
56
73
  overwrite_namespace: If provided, this will overwrite the space of the extracted items.
57
74
  instance_space: The space to extract instances from.
75
+ unpack_json: If True, JSON objects will be unpacked into RDF literals.
58
76
  """
59
77
  retrieved = client.data_modeling.data_models.retrieve(data_model, inline_views=True)
60
78
  if not retrieved:
61
79
  raise ResourceRetrievalError(dm.DataModelId.load(data_model), "data model", "Data Model is missing in CDF")
62
- return cls.from_views(client, retrieved.latest_version().views, limit, overwrite_namespace, instance_space)
80
+ return cls.from_views(
81
+ client,
82
+ retrieved.latest_version().views,
83
+ limit,
84
+ overwrite_namespace,
85
+ instance_space,
86
+ unpack_json,
87
+ str_to_ideal_type,
88
+ )
63
89
 
64
90
  @classmethod
65
91
  def from_views(
@@ -69,6 +95,8 @@ class DMSExtractor(BaseExtractor):
69
95
  limit: int | None = None,
70
96
  overwrite_namespace: Namespace | None = None,
71
97
  instance_space: str | SequenceNotStr[str] | None = None,
98
+ unpack_json: bool = False,
99
+ str_to_ideal_type: bool = False,
72
100
  ) -> "DMSExtractor":
73
101
  """Create an extractor from a set of views.
74
102
 
@@ -78,19 +106,43 @@ class DMSExtractor(BaseExtractor):
78
106
  limit: The maximum number of instances to extract.
79
107
  overwrite_namespace: If provided, this will overwrite the space of the extracted items.
80
108
  instance_space: The space to extract instances from.
109
+ unpack_json: If True, JSON objects will be unpacked into RDF literals.
110
+ str_to_ideal_type: If True, when unpacking JSON objects, if the value is a string, the extractor will try to
111
+ convert it to the ideal type.
81
112
  """
113
+ total_instances_pair_by_view: dict[dm.ViewId, tuple[int | None, Iterable[Instance]]] = {}
114
+ for view in views:
115
+ instance_iterator = _ViewInstanceIterator(client, view, instance_space)
116
+ total_instances_pair_by_view[view.as_id()] = (instance_iterator.count, instance_iterator)
117
+
82
118
  return cls(
83
- _InstanceIterator(client, views, instance_space),
84
- total=None,
119
+ total_instances_pair_by_view=total_instances_pair_by_view,
85
120
  limit=limit,
86
121
  overwrite_namespace=overwrite_namespace,
122
+ unpack_json=unpack_json,
123
+ str_to_ideal_type=str_to_ideal_type,
87
124
  )
88
125
 
89
126
  def extract(self) -> Iterable[Triple]:
90
- for count, item in enumerate(self.items, 1):
91
- if self.limit and count > self.limit:
92
- break
93
- yield from self._extract_instance(item)
127
+ total_instances = sum(total for total, _ in self.total_instances_pair_by_view.values() if total is not None)
128
+ use_progress_bar = (
129
+ GLOBAL_CONFIG.use_iterate_bar_threshold and total_instances > GLOBAL_CONFIG.use_iterate_bar_threshold
130
+ )
131
+
132
+ for view_id, (total, instances) in self.total_instances_pair_by_view.items():
133
+ if total == 0:
134
+ continue
135
+ if use_progress_bar and total is not None:
136
+ instances = iterate_progress_bar(
137
+ instances,
138
+ total,
139
+ f"Extracting instances from {view_id.space}:{view_id.external_id}(version={view_id.version})",
140
+ )
141
+
142
+ for count, item in enumerate(instances, 1):
143
+ if self.limit and count > self.limit:
144
+ break
145
+ yield from self._extract_instance(item)
94
146
 
95
147
  def _extract_instance(self, instance: Instance) -> Iterable[Triple]:
96
148
  if isinstance(instance, dm.Edge):
@@ -105,7 +157,6 @@ class DMSExtractor(BaseExtractor):
105
157
  # If the edge has properties, we create a node for the edge and connect it to the start and end nodes.
106
158
  id_ = self._as_uri_ref(instance)
107
159
  yield id_, RDF.type, self._as_uri_ref(instance.type)
108
- yield id_, RDF.type, self._get_namespace(instance.space).Edge
109
160
  yield (
110
161
  id_,
111
162
  self._as_uri_ref(dm.DirectRelationReference(instance.space, "startNode")),
@@ -121,6 +172,9 @@ class DMSExtractor(BaseExtractor):
121
172
  id_ = self._as_uri_ref(instance)
122
173
  if instance.type:
123
174
  type_ = self._as_uri_ref(cast(dm.DirectRelationReference, instance.type))
175
+ elif len(instance.properties) == 1:
176
+ view_id = next(iter(instance.properties.keys()))
177
+ type_ = self._get_namespace(view_id.space)[urllib.parse.quote(view_id.external_id)]
124
178
  else:
125
179
  type_ = self._get_namespace(instance.space).Node
126
180
 
@@ -135,20 +189,38 @@ class DMSExtractor(BaseExtractor):
135
189
  for view_id, properties in instance.properties.items():
136
190
  namespace = self._get_namespace(view_id.space)
137
191
  for key, value in properties.items():
138
- for object_ in self._get_objects(value):
139
- yield id_, namespace[key], object_
192
+ for predicate_str, object_ in self._get_predicate_objects_pair(key, value):
193
+ yield id_, namespace[urllib.parse.quote(predicate_str)], object_
140
194
 
141
- def _get_objects(self, value: PropertyValue) -> Iterable[Literal | URIRef]:
195
+ def _get_predicate_objects_pair(self, key: str, value: PropertyValue) -> Iterable[tuple[str, Literal | URIRef]]:
142
196
  if isinstance(value, str | float | bool | int):
143
- yield Literal(value)
197
+ yield key, Literal(value)
144
198
  elif isinstance(value, dict) and "space" in value and "externalId" in value:
145
- yield self._as_uri_ref(dm.DirectRelationReference.load(value))
199
+ yield key, self._as_uri_ref(dm.DirectRelationReference.load(value))
200
+ elif isinstance(value, dict) and self.unpack_json:
201
+ for sub_key, sub_value in value.items():
202
+ if isinstance(sub_value, str):
203
+ if sub_value.casefold() in self.empty_values:
204
+ continue
205
+ if self.str_to_ideal_type:
206
+ yield sub_key, Literal(string_to_ideal_type(sub_value))
207
+ else:
208
+ yield sub_key, Literal(sub_value)
209
+ elif isinstance(sub_value, int | float | bool):
210
+ yield sub_key, Literal(sub_value)
211
+ elif isinstance(sub_value, dict):
212
+ yield from self._get_predicate_objects_pair(f"{key}_{sub_key}", sub_value)
213
+ elif isinstance(sub_value, list):
214
+ for item in sub_value:
215
+ yield from self._get_predicate_objects_pair(f"{key}_{sub_key}", item)
216
+ else:
217
+ yield sub_key, Literal(str(sub_value))
146
218
  elif isinstance(value, dict):
147
219
  # This object is a json object.
148
- yield Literal(str(value), datatype=XSD._NS["json"])
220
+ yield key, Literal(str(value), datatype=XSD._NS["json"])
149
221
  elif isinstance(value, list):
150
222
  for item in value:
151
- yield from self._get_objects(item)
223
+ yield from self._get_predicate_objects_pair(key, item)
152
224
 
153
225
  def _as_uri_ref(self, instance: Instance | dm.DirectRelationReference) -> URIRef:
154
226
  return self._get_namespace(instance.space)[urllib.parse.quote(instance.external_id)]
@@ -159,34 +231,77 @@ class DMSExtractor(BaseExtractor):
159
231
  return Namespace(DEFAULT_SPACE_URI.format(space=urllib.parse.quote(space)))
160
232
 
161
233
 
162
- class _InstanceIterator(Iterable[Instance]):
163
- def __init__(
164
- self, client: CogniteClient, views: Iterable[dm.View], instance_space: str | SequenceNotStr[str] | None = None
165
- ):
234
+ class _ViewInstanceIterator(Iterable[Instance]):
235
+ def __init__(self, client: CogniteClient, view: dm.View, instance_space: str | SequenceNotStr[str] | None = None):
166
236
  self.client = client
167
- self.views = views
237
+ self.view = view
168
238
  self.instance_space = instance_space
169
239
 
240
+ @cached_property
241
+ def count(self) -> int:
242
+ node_count = edge_count = 0
243
+ if self.view.used_for in ("node", "all"):
244
+ node_count = int(
245
+ self.client.data_modeling.instances.aggregate(
246
+ view=self.view.as_id(),
247
+ aggregates=dm.aggregations.Count("externalId"),
248
+ instance_type="node",
249
+ space=self.instance_space,
250
+ ).value
251
+ )
252
+ if self.view.used_for in ("edge", "all"):
253
+ edge_count = int(
254
+ self.client.data_modeling.instances.aggregate(
255
+ view=self.view.as_id(),
256
+ aggregates=dm.aggregations.Count("externalId"),
257
+ instance_type="edge",
258
+ space=self.instance_space,
259
+ ).value
260
+ )
261
+ return node_count + edge_count
262
+
170
263
  def __iter__(self) -> Iterator[Instance]:
171
- for view in self.views:
172
- view_id = view.as_id()
173
- # All nodes and edges with properties
174
- if view.used_for in ("node", "all"):
175
- yield from self.client.data_modeling.instances(
176
- chunk_size=None, instance_type="node", sources=[view_id], space=self.instance_space
177
- )
178
- if view.used_for in ("edge", "all"):
264
+ view_id = self.view.as_id()
265
+ read_only_properties = {
266
+ prop_id
267
+ for prop_id, prop in self.view.properties.items()
268
+ if isinstance(prop, dm.MappedProperty)
269
+ and is_readonly_property(prop.container, prop.container_property_identifier)
270
+ }
271
+ # All nodes and edges with properties
272
+ if self.view.used_for in ("node", "all"):
273
+ node_iterable: Iterable[Instance] = self.client.data_modeling.instances(
274
+ chunk_size=None, instance_type="node", sources=[view_id], space=self.instance_space
275
+ )
276
+ if read_only_properties:
277
+ node_iterable = self._remove_read_only_properties(node_iterable, read_only_properties, view_id)
278
+ yield from node_iterable
279
+
280
+ if self.view.used_for in ("edge", "all"):
281
+ yield from self.client.data_modeling.instances(
282
+ chunk_size=None, instance_type="edge", sources=[view_id], space=self.instance_space
283
+ )
284
+
285
+ for prop in self.view.properties.values():
286
+ if isinstance(prop, dm.EdgeConnection):
287
+ if prop.edge_source:
288
+ # All edges with properties are extracted from the edge source
289
+ continue
179
290
  yield from self.client.data_modeling.instances(
180
- chunk_size=None, instance_type="edge", sources=[view_id], space=self.instance_space
291
+ chunk_size=None,
292
+ instance_type="edge",
293
+ filter=dm.filters.Equals(
294
+ ["edge", "type"], {"space": prop.type.space, "externalId": prop.type.external_id}
295
+ ),
296
+ space=self.instance_space,
181
297
  )
182
298
 
183
- for prop in view.properties.values():
184
- if isinstance(prop, dm.EdgeConnection):
185
- yield from self.client.data_modeling.instances(
186
- chunk_size=None,
187
- instance_type="edge",
188
- filter=dm.filters.Equals(
189
- ["edge", "type"], {"space": prop.type.space, "externalId": prop.type.external_id}
190
- ),
191
- space=self.instance_space,
192
- )
299
+ @staticmethod
300
+ def _remove_read_only_properties(
301
+ nodes: Iterable[Instance], read_only_properties: Set[str], view_id: dm.ViewId
302
+ ) -> Iterable[Instance]:
303
+ for node in nodes:
304
+ if properties := node.properties.get(view_id):
305
+ for read_only in read_only_properties:
306
+ properties.pop(read_only, None)
307
+ yield node
@@ -6,11 +6,12 @@ from cognite.client.utils.useful_types import SequenceNotStr
6
6
  from rdflib import Namespace, URIRef
7
7
 
8
8
  from cognite.neat._client import NeatClient
9
- from cognite.neat._constants import DEFAULT_NAMESPACE
9
+ from cognite.neat._constants import COGNITE_SPACES, DEFAULT_NAMESPACE
10
10
  from cognite.neat._issues import IssueList, NeatIssue, catch_warnings
11
11
  from cognite.neat._issues.warnings import CDFAuthWarning, ResourceNotFoundWarning, ResourceRetrievalWarning
12
12
  from cognite.neat._rules.importers import DMSImporter
13
13
  from cognite.neat._rules.models import DMSRules, InformationRules
14
+ from cognite.neat._rules.models.data_types import Json
14
15
  from cognite.neat._rules.transformers import DMSToInformation, VerifyDMSRules
15
16
  from cognite.neat._shared import Triple
16
17
 
@@ -26,12 +27,18 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
26
27
  namespace: Namespace = DEFAULT_NAMESPACE,
27
28
  issues: Sequence[NeatIssue] | None = None,
28
29
  instance_space: str | SequenceNotStr[str] | None = None,
30
+ skip_cognite_views: bool = True,
31
+ unpack_json: bool = False,
32
+ str_to_ideal_type: bool = False,
29
33
  ) -> None:
30
34
  self._client = client
31
35
  self._data_model = data_model
32
36
  self._namespace = namespace or DEFAULT_NAMESPACE
33
37
  self._issues = IssueList(issues)
34
38
  self._instance_space = instance_space
39
+ self._skip_cognite_views = skip_cognite_views
40
+ self._unpack_json = unpack_json
41
+ self._str_to_ideal_type = str_to_ideal_type
35
42
 
36
43
  self._views: list[dm.View] | None = None
37
44
  self._information_rules: InformationRules | None = None
@@ -44,6 +51,9 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
44
51
  client: NeatClient,
45
52
  namespace: Namespace = DEFAULT_NAMESPACE,
46
53
  instance_space: str | SequenceNotStr[str] | None = None,
54
+ skip_cognite_views: bool = True,
55
+ unpack_json: bool = False,
56
+ str_to_ideal_type: bool = False,
47
57
  ) -> "DMSGraphExtractor":
48
58
  issues: list[NeatIssue] = []
49
59
  try:
@@ -51,14 +61,37 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
51
61
  except CogniteAPIError as e:
52
62
  issues.append(CDFAuthWarning("retrieving data model", str(e)))
53
63
  return cls(
54
- cls._create_empty_model(dm.DataModelId.load(data_model_id)), client, namespace, issues, instance_space
64
+ cls._create_empty_model(dm.DataModelId.load(data_model_id)),
65
+ client,
66
+ namespace,
67
+ issues,
68
+ instance_space,
69
+ skip_cognite_views,
70
+ unpack_json,
71
+ str_to_ideal_type,
55
72
  )
56
73
  if not data_model:
57
74
  issues.append(ResourceRetrievalWarning(frozenset({data_model_id}), "data model"))
58
75
  return cls(
59
- cls._create_empty_model(dm.DataModelId.load(data_model_id)), client, namespace, issues, instance_space
76
+ cls._create_empty_model(dm.DataModelId.load(data_model_id)),
77
+ client,
78
+ namespace,
79
+ issues,
80
+ instance_space,
81
+ skip_cognite_views,
82
+ unpack_json,
83
+ str_to_ideal_type,
60
84
  )
61
- return cls(data_model.latest_version(), client, namespace, issues, instance_space)
85
+ return cls(
86
+ data_model.latest_version(),
87
+ client,
88
+ namespace,
89
+ issues,
90
+ instance_space,
91
+ skip_cognite_views,
92
+ unpack_json,
93
+ str_to_ideal_type,
94
+ )
62
95
 
63
96
  @classmethod
64
97
  def _create_empty_model(cls, data_model_id: dm.DataModelId) -> dm.DataModel:
@@ -92,11 +125,16 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
92
125
  def extract(self) -> Iterable[Triple]:
93
126
  """Extracts the knowledge graph from the data model."""
94
127
  views = self._model_views
128
+ if self._skip_cognite_views:
129
+ views = [view for view in views if view.space not in COGNITE_SPACES]
130
+
95
131
  yield from DMSExtractor.from_views(
96
132
  self._client,
97
133
  views,
98
134
  overwrite_namespace=self._namespace,
99
135
  instance_space=self._instance_space,
136
+ unpack_json=self._unpack_json,
137
+ str_to_ideal_type=self._str_to_ideal_type,
100
138
  ).extract()
101
139
 
102
140
  def _get_views(self) -> list[dm.View]:
@@ -141,6 +179,18 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
141
179
  # The DMS and Information rules must be created together to link them properly.
142
180
  importer = DMSImporter.from_data_model(self._client, self._data_model)
143
181
  unverified_dms = importer.to_rules()
182
+ if self._unpack_json and (dms_rules := unverified_dms.rules):
183
+ # Drop the JSON properties from the DMS rules as these are no longer valid.
184
+ json_name = Json().name # To avoid instantiating Json multiple times.
185
+ dms_rules.properties = [
186
+ prop
187
+ for prop in dms_rules.properties
188
+ if not (
189
+ isinstance(prop.value_type, Json)
190
+ or (isinstance(prop.value_type, str) and prop.value_type == json_name)
191
+ )
192
+ ]
193
+
144
194
  with catch_warnings() as issues:
145
195
  # Any errors that occur will be raised and caught outside the extractor.
146
196
  verified_dms = VerifyDMSRules(client=self._client).transform(unverified_dms)
@@ -141,7 +141,7 @@ def generate_triples(
141
141
 
142
142
  # pregenerate instance ids for each remaining class
143
143
  instance_ids = {
144
- key: [URIRef(namespace[f"{key.suffix}-{i+1}"]) for i in range(value)] for key, value in class_count.items()
144
+ key: [URIRef(namespace[f"{key.suffix}-{i + 1}"]) for i in range(value)] for key, value in class_count.items()
145
145
  }
146
146
 
147
147
  # create triple for each class instance defining its type
@@ -36,7 +36,6 @@ class RdfFileExtractor(BaseExtractor):
36
36
 
37
37
  self.format = guess_format(str(self.filepath) if isinstance(self.filepath, Path) else self.filepath.name)
38
38
 
39
- print(self.format)
40
39
  if isinstance(self.filepath, Path) and not self.filepath.exists():
41
40
  self.issue_list.append(FileNotFoundNeatError(self.filepath))
42
41
 
@@ -55,7 +54,7 @@ class RdfFileExtractor(BaseExtractor):
55
54
  def from_zip(
56
55
  cls,
57
56
  filepath: Path,
58
- filename: str = "neat-session/instances/instances.ttl",
57
+ filename: str = "neat-session/instances/instances.trig",
59
58
  base_uri: URIRef = DEFAULT_BASE_URI,
60
59
  issue_list: IssueList | None = None,
61
60
  ):
@@ -69,6 +68,8 @@ class RdfFileExtractor(BaseExtractor):
69
68
  if file_info.filename == filename:
70
69
  # We need to open the file in the zip file, and close it upon
71
70
  # triple extraction ...
71
+
72
+ print(file_info)
72
73
  file = zip_ref.open(file_info)
73
74
  return cls(cast(zipfile.ZipExtFile, file), base_uri, issue_list)
74
75
 
@@ -20,6 +20,4 @@ def _repr_html_() -> str:
20
20
  ]
21
21
  )._repr_html_()
22
22
 
23
- return (
24
- "<strong>Loader</strong> A loader writes data from Neat's triple storage into a target system" f"<br />{table}"
25
- )
23
+ return f"<strong>Loader</strong> A loader writes data from Neat's triple storage into a target system<br />{table}"
@@ -1,5 +1,6 @@
1
1
  import itertools
2
2
  import json
3
+ import urllib.parse
3
4
  import warnings
4
5
  from collections import defaultdict
5
6
  from collections.abc import Iterable, Sequence
@@ -70,6 +71,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
70
71
  tracker: type[Tracker] | None = None,
71
72
  rules: DMSRules | None = None,
72
73
  client: NeatClient | None = None,
74
+ unquote_external_ids: bool = False,
73
75
  ):
74
76
  super().__init__(graph_store)
75
77
  self.data_model = data_model
@@ -79,6 +81,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
79
81
  self._tracker: type[Tracker] = tracker or LogTracker
80
82
  self.rules = rules
81
83
  self._client = client
84
+ self._unquote_external_ids = unquote_external_ids
82
85
 
83
86
  @classmethod
84
87
  def from_data_model_id(
@@ -99,7 +102,12 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
99
102
 
100
103
  @classmethod
101
104
  def from_rules(
102
- cls, rules: DMSRules, graph_store: NeatGraphStore, instance_space: str, client: NeatClient | None = None
105
+ cls,
106
+ rules: DMSRules,
107
+ graph_store: NeatGraphStore,
108
+ instance_space: str,
109
+ client: NeatClient | None = None,
110
+ unquote_external_ids: bool = False,
103
111
  ) -> "DMSLoader":
104
112
  issues: list[NeatIssue] = []
105
113
  data_model: dm.DataModel[dm.View] | None = None
@@ -125,6 +133,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
125
133
  issues,
126
134
  rules=rules,
127
135
  client=client,
136
+ unquote_external_ids=unquote_external_ids,
128
137
  )
129
138
 
130
139
  def _load(self, stop_on_exception: bool = False) -> Iterable[dm.InstanceApply | NeatIssue | type[_END_OF_CLASS]]:
@@ -142,7 +151,9 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
142
151
  if self.rules and self.rules.metadata.logical
143
152
  else None
144
153
  )
154
+
145
155
  view_and_count_by_id = self._select_views_with_instances(self.data_model.views)
156
+
146
157
  if self._client:
147
158
  view_and_count_by_id, properties_point_to_self = self._sort_by_direct_relation_dependencies(
148
159
  view_and_count_by_id
@@ -425,20 +436,11 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
425
436
  else:
426
437
  raise ValueError(f"Expect valid JSON string or dict for {info.field_name}: {value}")
427
438
 
428
- def parse_text(cls, value: Any, info: ValidationInfo) -> Any:
429
- if isinstance(value, list):
430
- return [remove_namespace_from_uri(v) for v in value]
431
- else:
432
- return remove_namespace_from_uri(value)
433
-
434
439
  if json_fields:
435
440
  validators["parse_json_string"] = field_validator(*json_fields, mode="before")(parse_json_string) # type: ignore[assignment, arg-type]
436
441
 
437
442
  validators["parse_list"] = field_validator("*", mode="before")(parse_list) # type: ignore[assignment, arg-type]
438
443
 
439
- if text_fields:
440
- validators["parse_text"] = field_validator(*text_fields, mode="before")(parse_text) # type: ignore[assignment, arg-type]
441
-
442
444
  if direct_relation_by_property:
443
445
 
444
446
  def parse_direct_relation(cls, value: list, info: ValidationInfo) -> dict | list[dict]:
@@ -490,6 +492,8 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
490
492
  ) -> dm.InstanceApply:
491
493
  type_ = properties.pop(RDF.type, [None])[0]
492
494
  created = pydantic_cls.model_validate(properties)
495
+ if self._unquote_external_ids:
496
+ identifier = urllib.parse.unquote(identifier)
493
497
 
494
498
  return dm.NodeApply(
495
499
  space=self.instance_space,
@@ -514,6 +518,9 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
514
518
  if type_ is None:
515
519
  raise ValueError(f"Missing type for edge {identifier}")
516
520
 
521
+ if self._unquote_external_ids:
522
+ identifier = urllib.parse.unquote(identifier)
523
+
517
524
  return dm.EdgeApply(
518
525
  space=self.instance_space,
519
526
  external_id=identifier,
@@ -550,6 +557,9 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
550
557
  yield error
551
558
  for target in values:
552
559
  external_id = f"{identifier}.{prop_id}.{target}"
560
+ if self._unquote_external_ids:
561
+ external_id = urllib.parse.unquote(external_id)
562
+
553
563
  yield dm.EdgeApply(
554
564
  space=self.instance_space,
555
565
  external_id=(external_id if len(external_id) < 256 else create_sha256_hash(external_id)),