cognite-neat 0.105.1__py3-none-any.whl → 0.106.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. cognite/neat/_config.py +6 -260
  2. cognite/neat/_graph/extractors/_classic_cdf/_base.py +26 -13
  3. cognite/neat/_graph/extractors/_classic_cdf/_classic.py +4 -1
  4. cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +2 -2
  5. cognite/neat/_graph/loaders/_rdf2dms.py +7 -2
  6. cognite/neat/_graph/transformers/_base.py +4 -8
  7. cognite/neat/_graph/transformers/_classic_cdf.py +164 -80
  8. cognite/neat/_graph/transformers/_rdfpath.py +1 -1
  9. cognite/neat/_issues/warnings/_external.py +1 -1
  10. cognite/neat/_rules/importers/_rdf/_inference2rules.py +4 -2
  11. cognite/neat/_rules/models/mapping/_classic2core.yaml +70 -58
  12. cognite/neat/_rules/transformers/_mapping.py +3 -2
  13. cognite/neat/_session/_base.py +6 -7
  14. cognite/neat/_session/_inspect.py +6 -2
  15. cognite/neat/_session/_mapping.py +6 -8
  16. cognite/neat/_session/_prepare.py +9 -10
  17. cognite/neat/_session/_read.py +35 -26
  18. cognite/neat/_session/_set.py +9 -0
  19. cognite/neat/_session/_state.py +3 -1
  20. cognite/neat/_session/_to.py +11 -13
  21. cognite/neat/_store/_graph_store.py +33 -28
  22. cognite/neat/_utils/auth.py +35 -15
  23. cognite/neat/_utils/collection_.py +32 -11
  24. cognite/neat/_version.py +1 -1
  25. {cognite_neat-0.105.1.dist-info → cognite_neat-0.106.0.dist-info}/METADATA +1 -7
  26. {cognite_neat-0.105.1.dist-info → cognite_neat-0.106.0.dist-info}/RECORD +29 -29
  27. {cognite_neat-0.105.1.dist-info → cognite_neat-0.106.0.dist-info}/LICENSE +0 -0
  28. {cognite_neat-0.105.1.dist-info → cognite_neat-0.106.0.dist-info}/WHEEL +0 -0
  29. {cognite_neat-0.105.1.dist-info → cognite_neat-0.106.0.dist-info}/entry_points.txt +0 -0
cognite/neat/_config.py CHANGED
@@ -1,265 +1,11 @@
- import json
- import logging
- import os
- import shutil
- import sys
- from pathlib import Path
- from typing import Any, Literal, cast
+ from typing import Literal

- import yaml
- from pydantic import BaseModel, Field, model_validator
- from yaml import safe_load
+ from pydantic import BaseModel

- from cognite.neat._constants import EXAMPLE_GRAPHS, EXAMPLE_RULES, EXAMPLE_WORKFLOWS
- from cognite.neat._utils.auth import EnvironmentVariables

- if sys.version_info >= (3, 11):
-     from enum import StrEnum
-     from typing import Self
- else:
-     from backports.strenum import StrEnum
-     from typing_extensions import Self
+ class NeatConfig(BaseModel, validate_assignment=True):
+     progress_bar: Literal["tqdm", "rich", "tqdm-notebook", "infer"] | None = "infer"
+     use_iterate_bar_threshold: int | None = 500

- LOG_FORMAT = "%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s"
- LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

-
- class RulesStoreType(StrEnum):
-     """Rules Store type"""
-
-     CDF = "cdf"
-     FILE = "file"
-     URL = "url"
-     GOOGLE_SHEET = "google_sheet"
-
-
- class WorkflowsStoreType(StrEnum):
-     """Workflows Store type"""
-
-     CDF = "cdf"
-     FILE = "file"
-     URL = "url"
-
-
- class Config(BaseModel, arbitrary_types_allowed=True):
-     workflows_store_type: WorkflowsStoreType = WorkflowsStoreType.FILE
-     data_store_path: Path = Field(default_factory=lambda: Path.cwd() / "data")
-
-     workflow_downloader_filter: list[str] | None = Field(
-         description="List of workflow names+tags to filter on when downloading workflows from CDF. "
-         "Example name:workflow_name=version,tag:tag_name",
-         default=None,
-     )
-
-     cdf_auth_config: EnvironmentVariables = Field(default_factory=EnvironmentVariables.default)
-     cdf_default_dataset_id: int = 0
-     load_examples: bool = True
-
-     log_level: Literal["ERROR", "WARNING", "INFO", "DEBUG"] = "INFO"
-     log_format: str = LOG_FORMAT
-     download_workflows_from_cdf: bool = Field(
-         default=False,
-         description="Downloads all workflows from CDF automatically and stores them locally",
-     )
-     stop_on_error: bool = False
-
-     @model_validator(mode="before")
-     def backwards_compatible(cls, data: Any):
-         if not isinstance(data, dict):
-             return data
-         if "cdf_client" in data:
-             cdf_client = data["cdf_client"]
-             if isinstance(cdf_client, dict):
-                 if "base_url" in cdf_client:
-                     base_url = cdf_client["base_url"]
-                     cluster = base_url.removeprefix("https://").removesuffix(".cognitedata.com")
-                 else:
-                     base_url, cluster = "Missing", "Missing"
-                 if "scopes" in cdf_client:
-                     scopes = cdf_client["scopes"]
-                     if isinstance(scopes, list):
-                         scopes = ",".join(scopes)
-                 else:
-                     scopes = "Missing"
-                 data["cdf_auth_config"] = EnvironmentVariables(
-                     CDF_PROJECT=cdf_client.get("project", "Missing"),
-                     CDF_CLUSTER=cluster,
-                     CDF_URL=base_url,
-                     IDP_CLIENT_ID=cdf_client.get("client_id", "Missing"),
-                     IDP_CLIENT_SECRET=cdf_client.get("client_secret", "Missing"),
-                     IDP_TOKEN_URL=cdf_client.get("token_url", "Missing"),
-                     IDP_SCOPES=scopes,
-                     CDF_TIMEOUT=int(cdf_client.get("timeout", 60)),
-                     CDF_MAX_WORKERS=int(cdf_client.get("max_workers", 3)),
-                 )
-         return data
-
-     def as_legacy_config(
-         self,
-     ) -> dict[str, Any]:
-         config: dict[str, Any] = {}
-
-         config["workflows_store_type"] = self.workflows_store_type
-         config["data_store_path"] = str(self.data_store_path)
-         config["workflows_downloader_filter"] = self.workflow_downloader_filter
-
-         config["cdf_client"] = {}
-         if self.cdf_auth_config.CDF_PROJECT not in {"Missing", "NOT SET"}:
-             config["cdf_client"]["project"] = self.cdf_auth_config.CDF_PROJECT
-         if self.cdf_auth_config.CDF_CLUSTER not in {"Missing", "NOT SET"}:
-             config["cdf_client"]["cluster"] = self.cdf_auth_config.CDF_CLUSTER
-         if self.cdf_auth_config.CDF_URL:
-             config["cdf_client"]["base_url"] = self.cdf_auth_config.CDF_URL
-         if self.cdf_auth_config.IDP_CLIENT_ID:
-             config["cdf_client"]["client_id"] = self.cdf_auth_config.IDP_CLIENT_ID
-         if self.cdf_auth_config.IDP_CLIENT_SECRET:
-             config["cdf_client"]["client_secret"] = self.cdf_auth_config.IDP_CLIENT_SECRET
-         if self.cdf_auth_config.IDP_TOKEN_URL:
-             config["cdf_client"]["token_url"] = self.cdf_auth_config.IDP_TOKEN_URL
-         if self.cdf_auth_config.IDP_SCOPES:
-             config["cdf_client"]["scopes"] = self.cdf_auth_config.idp_scopes
-         if self.cdf_auth_config.CDF_TIMEOUT:
-             config["cdf_client"]["timeout"] = self.cdf_auth_config.CDF_TIMEOUT
-         if self.cdf_auth_config.CDF_MAX_WORKERS:
-             config["cdf_client"]["max_workers"] = self.cdf_auth_config.CDF_MAX_WORKERS
-
-         config["cdf_default_dataset_id"] = self.cdf_default_dataset_id
-         config["load_examples"] = self.load_examples
-         config["log_level"] = self.log_level
-         config["log_format"] = self.log_format
-         config["download_workflows_from_cdf"] = self.download_workflows_from_cdf
-         config["stop_on_error"] = self.stop_on_error
-
-         return config
-
-     @property
-     def _dir_suffix(self) -> str:
-         is_test_running = "pytest" in sys.modules
-         if is_test_running:
-             # Todo change the below to f"-{os.getpid()}" when all tests supports parallel execution.
-             return ""
-         return ""
-
-     @property
-     def rules_store_path(self) -> Path:
-         return self.data_store_path / f"rules{self._dir_suffix}"
-
-     @property
-     def workflows_store_path(self) -> Path:
-         return self.data_store_path / f"workflows{self._dir_suffix}"
-
-     @property
-     def source_graph_path(self) -> Path:
-         return self.data_store_path / f"source-graphs{self._dir_suffix}"
-
-     @property
-     def staging_path(self) -> Path:
-         return self.data_store_path / f"staging{self._dir_suffix}"
-
-     @classmethod
-     def from_yaml(cls, filepath: Path) -> Self:
-         return cls(**safe_load(filepath.read_text()))
-
-     def to_yaml(self, filepath: Path):
-         # Parse as json to avoid Path and Enum objects
-         dump = json.loads(self.model_dump_json())
-
-         with filepath.open("w") as f:
-             yaml.safe_dump(dump, f)
-
-     @classmethod
-     def from_env(cls) -> Self:
-         missing = "Missing"
-         # This is to be backwards compatible with the old config
-
-         base_url: str | None = None
-         if "NEAT_CDF_BASE_URL" in os.environ:
-             base_url = os.environ["NEAT_CDF_BASE_URL"]
-         if isinstance(base_url, str):
-             cluster = base_url.removeprefix("https://").removesuffix(".cognitedata.com")
-         else:
-             cluster = missing
-         variables = EnvironmentVariables(
-             CDF_PROJECT=os.environ.get("NEAT_CDF_PROJECT", missing),
-             CDF_CLUSTER=cluster,
-             CDF_URL=base_url,
-             IDP_CLIENT_ID=os.environ.get("NEAT_CDF_CLIENT_ID"),
-             IDP_CLIENT_SECRET=os.environ.get("NEAT_CDF_CLIENT_SECRET"),
-             IDP_TOKEN_URL=os.environ.get("NEAT_CDF_TOKEN_URL"),
-             IDP_SCOPES=os.environ.get("NEAT_CDF_SCOPES"),
-             CDF_TIMEOUT=int(os.environ["NEAT_CDF_CLIENT_TIMEOUT"] if "NEAT_CDF_CLIENT_TIMEOUT" in os.environ else 60),
-             CDF_MAX_WORKERS=int(
-                 os.environ["NEAT_CDF_CLIENT_MAX_WORKERS"] if "NEAT_CDF_CLIENT_MAX_WORKERS" in os.environ else 3
-             ),
-         )
-
-         if workflow_downloader_filter_value := os.environ.get("NEAT_WORKFLOW_DOWNLOADER_FILTER", None):
-             workflow_downloader_filter = workflow_downloader_filter_value.split(",")
-         else:
-             workflow_downloader_filter = None
-
-         return cls(
-             cdf_auth_config=variables,
-             workflows_store_type=os.environ.get(  # type: ignore[arg-type]
-                 "NEAT_WORKFLOWS_STORE_TYPE", WorkflowsStoreType.FILE
-             ),
-             data_store_path=Path(os.environ.get("NEAT_DATA_PATH", "_app/data")),
-             cdf_default_dataset_id=int(os.environ.get("NEAT_CDF_DEFAULT_DATASET_ID", 6476640149881990)),
-             log_level=cast(
-                 Literal["ERROR", "WARNING", "INFO", "DEBUG"],
-                 os.environ.get("NEAT_LOG_LEVEL", "INFO"),
-             ),
-             workflow_downloader_filter=workflow_downloader_filter,
-             load_examples=bool(os.environ.get("NEAT_LOAD_EXAMPLES", True) in ["True", "true", "1"]),
-         )
-
-
- def copy_examples_to_directory(config: Config):
-     """
-     Copier over all the examples to the target_data_directory,
-     without overwriting
-
-     Args:
-         target_data_dir : The target directory
-         suffix : The suffix to add to the directory names
-
-     """
-
-     print(f"Copying examples into {config.data_store_path}")
-     _copy_examples(EXAMPLE_RULES, config.rules_store_path)
-     _copy_examples(EXAMPLE_GRAPHS, config.source_graph_path)
-     _copy_examples(EXAMPLE_WORKFLOWS, config.workflows_store_path)
-     config.staging_path.mkdir(exist_ok=True, parents=True)
-
-
- def create_data_dir_structure(config: Config) -> None:
-     """
-     Create the data directory structure in empty directory
-
-     Args:
-         target_data_dir : The target directory
-         suffix : The suffix to add to the directory names
-
-     """
-     for path in (
-         config.rules_store_path,
-         config.source_graph_path,
-         config.staging_path,
-         config.workflows_store_path,
-     ):
-         path.mkdir(exist_ok=True, parents=True)
-
-
- def _copy_examples(source_dir: Path, target_dir: Path):
-     for current in source_dir.rglob("*"):
-         if current.is_dir():
-             continue
-         relative = current.relative_to(source_dir)
-         if not (target := target_dir / relative).exists():
-             target.parent.mkdir(exist_ok=True, parents=True)
-             shutil.copy2(current, target)
-
-
- def configure_logging(level: str = "DEBUG", log_format: str = LOG_FORMAT):
-     """Configure logging based on config."""
-     logging.basicConfig(format=log_format, level=logging.getLevelName(level), datefmt=LOG_DATE_FORMAT)
+ GLOBAL_CONFIG = NeatConfig()
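The replacement module is much smaller: the legacy workflow and CDF settings are gone, and the file now only defines NeatConfig plus a module-level GLOBAL_CONFIG instance. A minimal usage sketch, assuming GLOBAL_CONFIG is imported directly from cognite.neat._config (whether it is re-exported elsewhere is not visible in this diff):

    # Sketch based on the new cognite/neat/_config.py shown above.
    from cognite.neat._config import GLOBAL_CONFIG

    # Pick a specific progress-bar backend instead of letting neat infer one;
    # validate_assignment=True means an invalid value raises a pydantic ValidationError.
    GLOBAL_CONFIG.progress_bar = "tqdm"

    # Only show iteration progress bars for runs with at least 10,000 rows.
    GLOBAL_CONFIG.use_iterate_bar_threshold = 10_000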
cognite/neat/_graph/extractors/_classic_cdf/_base.py CHANGED
@@ -1,6 +1,7 @@
  import json
  import re
  import sys
+ import warnings
  from abc import ABC, abstractmethod
  from collections.abc import Callable, Iterable, Sequence, Set
  from datetime import datetime, timezone
@@ -9,13 +10,16 @@ from typing import Any, Generic, TypeVar

  from cognite.client import CogniteClient
  from cognite.client.data_classes._base import WriteableCogniteResource
+ from cognite.client.exceptions import CogniteAPIError
  from pydantic import AnyHttpUrl, ValidationError
  from rdflib import RDF, XSD, Literal, Namespace, URIRef

  from cognite.neat._constants import DEFAULT_NAMESPACE
  from cognite.neat._graph.extractors._base import BaseExtractor
+ from cognite.neat._issues.warnings import CDFAuthWarning
  from cognite.neat._shared import Triple
  from cognite.neat._utils.auxiliary import string_to_ideal_type
+ from cognite.neat._utils.collection_ import iterate_progress_bar_if_above_config_threshold

  T_CogniteResource = TypeVar("T_CogniteResource", bound=WriteableCogniteResource)

@@ -98,17 +102,11 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):

      def extract(self) -> Iterable[Triple]:
          """Extracts an asset with the given asset_id."""
-         if self.total:
-             try:
-                 from rich.progress import track
-             except ModuleNotFoundError:
-                 to_iterate = self.items
-             else:
-                 to_iterate = track(
-                     self.items,
-                     total=self.limit or self.total,
-                     description=f"Extracting {type(self).__name__.removesuffix('Extractor')}",
-                 )
+
+         if self.total is not None and self.total > 0:
+             to_iterate = iterate_progress_bar_if_above_config_threshold(
+                 self.items, self.total, f"Extracting {type(self).__name__.removesuffix('Extractor')}"
+             )
          else:
              to_iterate = self.items
          for no, asset in enumerate(to_iterate):
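extract() now delegates the progress-bar decision to iterate_progress_bar_if_above_config_threshold, a new helper in cognite/neat/_utils/collection_.py (that file also changes in this release, but its diff is not shown in this excerpt). A rough sketch of what such a helper could look like, inferred only from the call sites in this diff and the NeatConfig fields above; the shipped implementation may differ:

    # Sketch only, not the shipped implementation.
    from collections.abc import Iterable
    from typing import TypeVar

    from cognite.neat._config import GLOBAL_CONFIG
    from cognite.neat._utils.collection_ import iterate_progress_bar

    T = TypeVar("T")

    def iterate_progress_bar_if_above_config_threshold(
        iterable: Iterable[T], total: float, description: str
    ) -> Iterable[T]:
        threshold = GLOBAL_CONFIG.use_iterate_bar_threshold
        if threshold is None or total < threshold:
            # Bars disabled, or the iteration is too small to be worth a bar.
            return iterable
        return iterate_progress_bar(iterable, total, description)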
@@ -221,7 +219,7 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
          camel_case: bool = True,
          as_write: bool = False,
      ):
-         total, items = cls._from_dataset(client, data_set_external_id)
+         total, items = cls._handle_no_access(lambda: cls._from_dataset(client, data_set_external_id))
          return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)

      @classmethod
@@ -244,7 +242,7 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
          camel_case: bool = True,
          as_write: bool = False,
      ):
-         total, items = cls._from_hierarchy(client, root_asset_external_id)
+         total, items = cls._handle_no_access(lambda: cls._from_hierarchy(client, root_asset_external_id))
          return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)

      @classmethod
@@ -273,3 +271,18 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
      @abstractmethod
      def _from_file(cls, file_path: str | Path) -> tuple[int | None, Iterable[T_CogniteResource]]:
          raise NotImplementedError
+
+     @classmethod
+     def _handle_no_access(
+         cls, action: Callable[[], tuple[int | None, Iterable[T_CogniteResource]]]
+     ) -> tuple[int | None, Iterable[T_CogniteResource]]:
+         try:
+             return action()
+         except CogniteAPIError as e:
+             if e.code == 403:
+                 warnings.warn(
+                     CDFAuthWarning(f"extract {cls.__name__.removesuffix('Extractor').casefold()}", str(e)), stacklevel=2
+                 )
+                 return 0, []
+             else:
+                 raise e
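The effect of _handle_no_access is that the from_dataset/from_hierarchy constructors no longer raise on a 403 but warn and return an empty extractor. An illustrative sketch of the new behaviour (the AssetsExtractor name and import path are assumptions for illustration; any ClassicCDFBaseExtractor subclass behaves the same way):

    # Illustrative only; assumes client credentials are already configured.
    from cognite.client import CogniteClient
    from cognite.neat._graph.extractors import AssetsExtractor

    client = CogniteClient()

    # If the token lacks read access to the data set, a CDFAuthWarning is emitted
    # instead of a CogniteAPIError (403), and the extractor simply yields nothing.
    extractor = AssetsExtractor.from_dataset(client, data_set_external_id="my_dataset")
    triples = list(extractor.extract())  # [] when access was denied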
cognite/neat/_graph/extractors/_classic_cdf/_classic.py CHANGED
@@ -226,4 +226,7 @@ class ClassicGraphExtractor(BaseExtractor):
      @staticmethod
      def _chunk(items: Sequence, description: str) -> Iterable:
          to_iterate: Iterable = chunker(items, chunk_size=1000)
-         return iterate_progress_bar(to_iterate, (len(items) // 1_000) + 1, description)
+         if items:
+             return iterate_progress_bar(to_iterate, (len(items) // 1_000) + 1, description)
+         else:
+             return to_iterate
cognite/neat/_graph/extractors/_classic_cdf/_sequences.py CHANGED
@@ -73,7 +73,7 @@ class SequencesExtractor(ClassicCDFBaseExtractor[NeatSequence]):
          as_write: bool = False,
          unpack_columns: bool = False,
      ):
-         total, items = cls._from_dataset(client, data_set_external_id)
+         total, items = cls._handle_no_access(lambda: cls._from_dataset(client, data_set_external_id))
          return cls(
              items,
              namespace,
@@ -101,7 +101,7 @@ class SequencesExtractor(ClassicCDFBaseExtractor[NeatSequence]):
          as_write: bool = False,
          unpack_columns: bool = False,
      ):
-         total, items = cls._from_hierarchy(client, root_asset_external_id)
+         total, items = cls._handle_no_access(lambda: cls._from_hierarchy(client, root_asset_external_id))
          return cls(
              items,
              namespace,
cognite/neat/_graph/loaders/_rdf2dms.py CHANGED
@@ -37,6 +37,7 @@ from cognite.neat._rules.models.entities._single_value import ViewEntity
  from cognite.neat._shared import InstanceType
  from cognite.neat._store import NeatGraphStore
  from cognite.neat._utils.auxiliary import create_sha256_hash
+ from cognite.neat._utils.collection_ import iterate_progress_bar_if_above_config_threshold
  from cognite.neat._utils.rdf_ import remove_namespace_from_uri
  from cognite.neat._utils.upload import UploadResult

@@ -157,7 +158,7 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
              view_ids.append(f"{view_id!r} (self)")

          tracker = self._tracker(type(self).__name__, view_ids, "views")
-         for view_id, (view, _) in view_and_count_by_id.items():
+         for view_id, (view, instance_count) in view_and_count_by_id.items():
              pydantic_cls, edge_by_type, issues = self._create_validation_classes(view)  # type: ignore[var-annotated]
              yield from issues
              tracker.issue(issues)
@@ -194,7 +195,11 @@ class DMSLoader(CDFLoader[dm.InstanceApply]):
              # this assumes no changes in the suffix of view and class
              reader = self.graph_store.read(view.external_id)

-             for identifier, properties in reader:
+             instance_iterable = iterate_progress_bar_if_above_config_threshold(
+                 reader, instance_count, f"Loading {track_id}"
+             )
+
+             for identifier, properties in instance_iterable:
                  if skip_properties:
                      properties = {k: v for k, v in properties.items() if k not in skip_properties}
                  try:
cognite/neat/_graph/transformers/_base.py CHANGED
@@ -8,7 +8,7 @@ from rdflib.query import ResultRow

  from cognite.neat._issues.warnings import NeatValueWarning
  from cognite.neat._shared import Triple
- from cognite.neat._utils.collection_ import iterate_progress_bar
+ from cognite.neat._utils.collection_ import iterate_progress_bar_if_above_config_threshold
  from cognite.neat._utils.graph_transformations_report import GraphTransformationResult

  To_Add_Triples: TypeAlias = list[Triple]
@@ -42,7 +42,6 @@ class BaseTransformerStandardised(ABC):
      description: str
      _use_only_once: bool = False
      _need_changes: ClassVar[frozenset[str]] = frozenset()
-     _use_iterate_bar_threshold: int = 500

      @abstractmethod
      def operation(self, query_result_row: ResultRow) -> RowTransformationOutput:
@@ -99,12 +98,9 @@ class BaseTransformerStandardised(ABC):
              return outcome

          result_iterable = graph.query(self._iterate_query())
-         if iteration_count > self._use_iterate_bar_threshold:
-             result_iterable = iterate_progress_bar(  # type: ignore[misc, assignment]
-                 result_iterable,
-                 total=iteration_count,
-                 description=self.description,
-             )
+         result_iterable = iterate_progress_bar_if_above_config_threshold(
+             result_iterable, iteration_count, self.description
+         )

          for row in result_iterable:
              row = cast(ResultRow, row)