cognite-neat 0.110.0__py3-none-any.whl → 0.111.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-neat might be problematic. Click here for more details.

Files changed (53)
  1. cognite/neat/_alpha.py +6 -0
  2. cognite/neat/_client/_api/schema.py +26 -0
  3. cognite/neat/_client/data_classes/schema.py +1 -1
  4. cognite/neat/_constants.py +4 -1
  5. cognite/neat/_graph/extractors/__init__.py +4 -0
  6. cognite/neat/_graph/extractors/_classic_cdf/_base.py +8 -16
  7. cognite/neat/_graph/extractors/_classic_cdf/_classic.py +39 -9
  8. cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +23 -17
  9. cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +15 -17
  10. cognite/neat/_graph/extractors/_dict.py +102 -0
  11. cognite/neat/_graph/extractors/_dms.py +27 -40
  12. cognite/neat/_graph/extractors/_dms_graph.py +30 -3
  13. cognite/neat/_graph/extractors/_raw.py +67 -0
  14. cognite/neat/_graph/loaders/_base.py +20 -4
  15. cognite/neat/_graph/loaders/_rdf2dms.py +243 -89
  16. cognite/neat/_graph/queries/_base.py +137 -43
  17. cognite/neat/_graph/transformers/_classic_cdf.py +6 -22
  18. cognite/neat/_issues/_factory.py +9 -1
  19. cognite/neat/_issues/errors/__init__.py +2 -0
  20. cognite/neat/_issues/errors/_external.py +7 -0
  21. cognite/neat/_issues/warnings/user_modeling.py +12 -0
  22. cognite/neat/_rules/_constants.py +3 -0
  23. cognite/neat/_rules/analysis/_base.py +29 -50
  24. cognite/neat/_rules/exporters/_rules2excel.py +1 -1
  25. cognite/neat/_rules/importers/_rdf/_inference2rules.py +16 -10
  26. cognite/neat/_rules/models/_base_rules.py +0 -2
  27. cognite/neat/_rules/models/data_types.py +7 -0
  28. cognite/neat/_rules/models/dms/_exporter.py +9 -8
  29. cognite/neat/_rules/models/dms/_rules.py +26 -1
  30. cognite/neat/_rules/models/dms/_rules_input.py +5 -1
  31. cognite/neat/_rules/models/dms/_validation.py +101 -1
  32. cognite/neat/_rules/models/entities/_single_value.py +8 -3
  33. cognite/neat/_rules/models/entities/_wrapped.py +2 -2
  34. cognite/neat/_rules/models/information/_rules_input.py +1 -0
  35. cognite/neat/_rules/models/information/_validation.py +64 -17
  36. cognite/neat/_rules/transformers/_converters.py +7 -2
  37. cognite/neat/_session/_base.py +2 -0
  38. cognite/neat/_session/_explore.py +39 -0
  39. cognite/neat/_session/_inspect.py +25 -6
  40. cognite/neat/_session/_read.py +67 -3
  41. cognite/neat/_session/_set.py +7 -1
  42. cognite/neat/_session/_state.py +6 -0
  43. cognite/neat/_session/_to.py +115 -8
  44. cognite/neat/_store/_graph_store.py +8 -4
  45. cognite/neat/_utils/rdf_.py +34 -3
  46. cognite/neat/_utils/text.py +72 -4
  47. cognite/neat/_utils/upload.py +2 -0
  48. cognite/neat/_version.py +2 -2
  49. {cognite_neat-0.110.0.dist-info → cognite_neat-0.111.1.dist-info}/METADATA +1 -1
  50. {cognite_neat-0.110.0.dist-info → cognite_neat-0.111.1.dist-info}/RECORD +53 -50
  51. {cognite_neat-0.110.0.dist-info → cognite_neat-0.111.1.dist-info}/LICENSE +0 -0
  52. {cognite_neat-0.110.0.dist-info → cognite_neat-0.111.1.dist-info}/WHEEL +0 -0
  53. {cognite_neat-0.110.0.dist-info → cognite_neat-0.111.1.dist-info}/entry_points.txt +0 -0
@@ -6,20 +6,18 @@ from typing import cast
6
6
  from cognite.client import CogniteClient
7
7
  from cognite.client import data_modeling as dm
8
8
  from cognite.client.data_classes.data_modeling import DataModelIdentifier
9
- from cognite.client.data_classes.data_modeling.instances import Instance, PropertyValue
9
+ from cognite.client.data_classes.data_modeling.instances import Instance, InstanceSort
10
10
  from cognite.client.utils.useful_types import SequenceNotStr
11
- from rdflib import RDF, XSD, Literal, Namespace, URIRef
11
+ from rdflib import RDF, Literal, Namespace, URIRef
12
12
 
13
13
  from cognite.neat._config import GLOBAL_CONFIG
14
14
  from cognite.neat._constants import DEFAULT_SPACE_URI, is_readonly_property
15
15
  from cognite.neat._issues.errors import ResourceRetrievalError
16
16
  from cognite.neat._shared import Triple
17
- from cognite.neat._utils.auxiliary import string_to_ideal_type
18
17
  from cognite.neat._utils.collection_ import iterate_progress_bar
19
18
 
20
19
  from ._base import BaseExtractor
21
-
22
- DEFAULT_EMPTY_VALUES = frozenset({"nan", "null", "none", "", " ", "nil", "n/a", "na", "unknown", "undefined"})
20
+ from ._dict import DEFAULT_EMPTY_VALUES, DMSPropertyExtractor
23
21
 
24
22
 
25
23
  class DMSExtractor(BaseExtractor):
@@ -188,39 +186,15 @@ class DMSExtractor(BaseExtractor):
188
186
 
189
187
  for view_id, properties in instance.properties.items():
190
188
  namespace = self._get_namespace(view_id.space)
191
- for key, value in properties.items():
192
- for predicate_str, object_ in self._get_predicate_objects_pair(key, value):
193
- yield id_, namespace[urllib.parse.quote(predicate_str)], object_
194
-
195
- def _get_predicate_objects_pair(self, key: str, value: PropertyValue) -> Iterable[tuple[str, Literal | URIRef]]:
196
- if isinstance(value, str | float | bool | int):
197
- yield key, Literal(value)
198
- elif isinstance(value, dict) and "space" in value and "externalId" in value:
199
- yield key, self._as_uri_ref(dm.DirectRelationReference.load(value))
200
- elif isinstance(value, dict) and self.unpack_json:
201
- for sub_key, sub_value in value.items():
202
- if isinstance(sub_value, str):
203
- if sub_value.casefold() in self.empty_values:
204
- continue
205
- if self.str_to_ideal_type:
206
- yield sub_key, Literal(string_to_ideal_type(sub_value))
207
- else:
208
- yield sub_key, Literal(sub_value)
209
- elif isinstance(sub_value, int | float | bool):
210
- yield sub_key, Literal(sub_value)
211
- elif isinstance(sub_value, dict):
212
- yield from self._get_predicate_objects_pair(f"{key}_{sub_key}", sub_value)
213
- elif isinstance(sub_value, list):
214
- for item in sub_value:
215
- yield from self._get_predicate_objects_pair(f"{key}_{sub_key}", item)
216
- else:
217
- yield sub_key, Literal(str(sub_value))
218
- elif isinstance(value, dict):
219
- # This object is a json object.
220
- yield key, Literal(str(value), datatype=XSD._NS["json"])
221
- elif isinstance(value, list):
222
- for item in value:
223
- yield from self._get_predicate_objects_pair(key, item)
189
+ yield from DMSPropertyExtractor(
190
+ id_,
191
+ properties,
192
+ namespace,
193
+ self._as_uri_ref,
194
+ self.empty_values,
195
+ self.str_to_ideal_type,
196
+ self.unpack_json,
197
+ ).extract()
224
198
 
225
199
  def _as_uri_ref(self, instance: Instance | dm.DirectRelationReference) -> URIRef:
226
200
  return self._get_namespace(instance.space)[urllib.parse.quote(instance.external_id)]
@@ -270,8 +244,16 @@ class _ViewInstanceIterator(Iterable[Instance]):
270
244
  }
271
245
  # All nodes and edges with properties
272
246
  if self.view.used_for in ("node", "all"):
247
+ # Without a sort, the sort is implicitly by the internal id, as cursoring needs a stable sort.
248
+ # By making the sort be on external_id, Postgres should pick the index
249
+ # that's on (project_id, space, external_id)
250
+ # WHERE deleted_at IS NULL. In other words, avoiding soft deleted instances.
273
251
  node_iterable: Iterable[Instance] = self.client.data_modeling.instances(
274
- chunk_size=None, instance_type="node", sources=[view_id], space=self.instance_space
252
+ chunk_size=None,
253
+ instance_type="node",
254
+ sources=[view_id],
255
+ space=self.instance_space,
256
+ sort=InstanceSort(["node", "externalId"]),
275
257
  )
276
258
  if read_only_properties:
277
259
  node_iterable = self._remove_read_only_properties(node_iterable, read_only_properties, view_id)
@@ -279,7 +261,11 @@ class _ViewInstanceIterator(Iterable[Instance]):
279
261
 
280
262
  if self.view.used_for in ("edge", "all"):
281
263
  yield from self.client.data_modeling.instances(
282
- chunk_size=None, instance_type="edge", sources=[view_id], space=self.instance_space
264
+ chunk_size=None,
265
+ instance_type="edge",
266
+ sources=[view_id],
267
+ space=self.instance_space,
268
+ sort=InstanceSort(["edge", "externalId"]),
283
269
  )
284
270
 
285
271
  for prop in self.view.properties.values():
@@ -294,6 +280,7 @@ class _ViewInstanceIterator(Iterable[Instance]):
294
280
  ["edge", "type"], {"space": prop.type.space, "externalId": prop.type.external_id}
295
281
  ),
296
282
  space=self.instance_space,
283
+ sort=InstanceSort(["edge", "externalId"]),
297
284
  )
298
285
 
299
286
  @staticmethod
@@ -12,6 +12,8 @@ from cognite.neat._issues.warnings import CDFAuthWarning, ResourceNotFoundWarnin
12
12
  from cognite.neat._rules.importers import DMSImporter
13
13
  from cognite.neat._rules.models import DMSRules, InformationRules
14
14
  from cognite.neat._rules.models.data_types import Json
15
+ from cognite.neat._rules.models.entities import UnknownEntity
16
+ from cognite.neat._rules.models.information import InformationProperty
15
17
  from cognite.neat._rules.transformers import DMSToInformation, VerifyDMSRules
16
18
  from cognite.neat._shared import Triple
17
19
 
@@ -131,7 +133,6 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
131
133
  yield from DMSExtractor.from_views(
132
134
  self._client,
133
135
  views,
134
- overwrite_namespace=self._namespace,
135
136
  instance_space=self._instance_space,
136
137
  unpack_json=self._unpack_json,
137
138
  str_to_ideal_type=self._str_to_ideal_type,
@@ -186,8 +187,12 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
186
187
  prop
187
188
  for prop in dms_rules.properties
188
189
  if not (
189
- isinstance(prop.value_type, Json)
190
- or (isinstance(prop.value_type, str) and prop.value_type == json_name)
190
+ (
191
+ isinstance(prop.value_type, Json)
192
+ or (isinstance(prop.value_type, str) and prop.value_type == json_name)
193
+ )
194
+ # We are not unpacking list of JSONs.
195
+ and prop.is_list is not True
191
196
  )
192
197
  ]
193
198
 
@@ -195,5 +200,27 @@ class DMSGraphExtractor(KnowledgeGraphExtractor):
195
200
  # Any errors occur will be raised and caught outside the extractor.
196
201
  verified_dms = VerifyDMSRules(client=self._client).transform(unverified_dms)
197
202
  information_rules = DMSToInformation(self._namespace).transform(verified_dms)
203
+
204
+ # We need to sync the metadata between the two rules, such that the `.sync_with_info_rules` method works.
205
+ information_rules.metadata.physical = verified_dms.metadata.identifier
206
+ verified_dms.metadata.logical = information_rules.metadata.identifier
207
+ verified_dms.sync_with_info_rules(information_rules)
208
+
209
+ # Adding startNode and endNode to the information rules for views that are used for edges.
210
+ classes_by_prefix = {cls_.class_.prefix: cls_ for cls_ in information_rules.classes}
211
+ for view in self._model_views:
212
+ if view.used_for == "edge" and view.external_id in classes_by_prefix:
213
+ cls_ = classes_by_prefix[view.external_id]
214
+ for property_ in ("startNode", "endNode"):
215
+ information_rules.properties.append(
216
+ InformationProperty(
217
+ class_=cls_.class_,
218
+ property_=property_,
219
+ value_type=UnknownEntity(),
220
+ min_count=0,
221
+ max_count=1,
222
+ )
223
+ )
224
+
198
225
  self._issues.extend(issues)
199
226
  return information_rules, verified_dms
@@ -0,0 +1,67 @@
1
+ import urllib.parse
2
+ from collections.abc import Iterable, Set
3
+ from typing import Any, cast
4
+
5
+ from cognite.client.data_classes import Row, RowList
6
+ from cognite.client.utils.useful_types import SequenceNotStr
7
+ from rdflib import RDF, Namespace, URIRef
8
+
9
+ from cognite.neat._client import NeatClient
10
+ from cognite.neat._constants import DEFAULT_RAW_URI
11
+ from cognite.neat._shared import Triple
12
+
13
+ from ._base import BaseExtractor
14
+ from ._dict import DEFAULT_EMPTY_VALUES, DictExtractor
15
+
16
+
17
+ class RAWExtractor(BaseExtractor):
18
+ def __init__(
19
+ self,
20
+ client: NeatClient,
21
+ db_name: str,
22
+ table_name: str,
23
+ table_type: str | None = None,
24
+ foreign_keys: str | SequenceNotStr[str] | None = None,
25
+ namespace: Namespace | None = None,
26
+ empty_values: Set[str] = DEFAULT_EMPTY_VALUES,
27
+ str_to_ideal_type: bool = False,
28
+ unpack_json: bool = False,
29
+ ) -> None:
30
+ self.client = client
31
+ self.db_name = db_name
32
+ self.table_name = table_name
33
+ self.table_type = table_type
34
+ self.foreign_keys = {foreign_keys} if isinstance(foreign_keys, str) else set(foreign_keys or [])
35
+ self.namespace = namespace or Namespace(DEFAULT_RAW_URI)
36
+ self.empty_values = empty_values
37
+ self.str_to_ideal_type = str_to_ideal_type
38
+ self.unpack_json = unpack_json
39
+
40
+ @property
41
+ def _rdf_type(self) -> URIRef:
42
+ return self.namespace[urllib.parse.quote(self.table_type or self.table_name)]
43
+
44
+ def extract(self) -> Iterable[Triple]:
45
+ for row in self.client.raw.rows(self.db_name, self.table_name, partitions=10, chunk_size=None):
46
+ if isinstance(row, Row):
47
+ yield from self._row2triples(row)
48
+ elif isinstance(row, RowList):
49
+ # Bug in SDK returning row list with chunk_size= None
50
+ for item in row:
51
+ yield from self._row2triples(item)
52
+
53
+ def _row2triples(self, row: Row) -> Iterable[Triple]:
54
+ # The row is always set. It is just the PySDK that have it as str | None
55
+ key, data = cast(tuple[str, dict[str, Any]], (row.key, row.columns))
56
+ identifier = self.namespace[urllib.parse.quote(key)]
57
+ yield identifier, RDF.type, self._rdf_type
58
+
59
+ yield from DictExtractor(
60
+ identifier,
61
+ data,
62
+ self.namespace,
63
+ self.foreign_keys,
64
+ self.empty_values,
65
+ self.str_to_ideal_type,
66
+ self.unpack_json,
67
+ ).extract()
@@ -20,6 +20,11 @@ T_Output = TypeVar("T_Output")
20
20
  class _END_OF_CLASS: ...
21
21
 
22
22
 
23
+ class _START_OF_CLASS:
24
+ def __init__(self, class_name: str | None = None):
25
+ self.class_name = class_name
26
+
27
+
23
28
  class BaseLoader(ABC, Generic[T_Output]):
24
29
  _new_line = "\n"
25
30
  _encoding = "utf-8"
@@ -33,10 +38,16 @@ class BaseLoader(ABC, Generic[T_Output]):
33
38
 
34
39
  def load(self, stop_on_exception: bool = False) -> Iterable[T_Output | NeatIssue]:
35
40
  """Load the graph with data."""
36
- return (item for item in self._load(stop_on_exception) if item is not _END_OF_CLASS) # type: ignore[misc]
41
+ return (
42
+ item # type: ignore[misc]
43
+ for item in self._load(stop_on_exception)
44
+ if not (item is _END_OF_CLASS or isinstance(item, _START_OF_CLASS))
45
+ )
37
46
 
38
47
  @abstractmethod
39
- def _load(self, stop_on_exception: bool = False) -> Iterable[T_Output | NeatIssue | type[_END_OF_CLASS]]:
48
+ def _load(
49
+ self, stop_on_exception: bool = False
50
+ ) -> Iterable[T_Output | NeatIssue | type[_END_OF_CLASS] | _START_OF_CLASS]:
40
51
  """Load the graph with data."""
41
52
  pass
42
53
 
@@ -75,21 +86,25 @@ class CDFLoader(BaseLoader[T_Output]):
75
86
 
76
87
  issues = IssueList()
77
88
  items: list[T_Output] = []
89
+ last_class_name: str | None = None
78
90
  for result in self._load(stop_on_exception=False):
79
91
  if isinstance(result, NeatIssue):
80
92
  issues.append(result)
81
93
  elif result is _END_OF_CLASS:
82
94
  ...
95
+ elif isinstance(result, _START_OF_CLASS):
96
+ last_class_name = result.class_name
97
+ continue
83
98
  else:
84
99
  # MyPy does not understand that 'else' means the item will be of type T_Output
85
100
  items.append(result) # type: ignore[arg-type]
86
101
 
87
102
  if len(items) >= self._UPLOAD_BATCH_SIZE or result is _END_OF_CLASS:
88
- yield from self._upload_to_cdf(client, items, dry_run, issues)
103
+ yield from self._upload_to_cdf(client, items, dry_run, issues, last_class_name)
89
104
  issues = IssueList()
90
105
  items = []
91
106
  if items:
92
- yield from self._upload_to_cdf(client, items, dry_run, issues)
107
+ yield from self._upload_to_cdf(client, items, dry_run, issues, last_class_name)
93
108
 
94
109
  @abstractmethod
95
110
  def _get_required_capabilities(self) -> list[Capability]:
@@ -102,5 +117,6 @@ class CDFLoader(BaseLoader[T_Output]):
102
117
  items: list[T_Output],
103
118
  dry_run: bool,
104
119
  read_issues: IssueList,
120
+ class_name: str | None = None,
105
121
  ) -> Iterable[UploadResult]:
106
122
  raise NotImplementedError