cognite-neat 0.105.2__py3-none-any.whl → 0.107.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cognite-neat might be problematic. Click here for more details.
- cognite/neat/_config.py +6 -260
- cognite/neat/_graph/extractors/__init__.py +5 -1
- cognite/neat/_graph/extractors/_base.py +32 -0
- cognite/neat/_graph/extractors/_classic_cdf/_base.py +42 -16
- cognite/neat/_graph/extractors/_classic_cdf/_classic.py +78 -8
- cognite/neat/_graph/extractors/_classic_cdf/_relationships.py +2 -0
- cognite/neat/_graph/extractors/_classic_cdf/_sequences.py +10 -3
- cognite/neat/_graph/extractors/_dms.py +48 -14
- cognite/neat/_graph/extractors/_dms_graph.py +149 -0
- cognite/neat/_graph/extractors/_rdf_file.py +32 -5
- cognite/neat/_graph/loaders/_rdf2dms.py +119 -20
- cognite/neat/_graph/queries/_construct.py +1 -1
- cognite/neat/_graph/transformers/__init__.py +5 -0
- cognite/neat/_graph/transformers/_base.py +13 -9
- cognite/neat/_graph/transformers/_classic_cdf.py +141 -44
- cognite/neat/_graph/transformers/_rdfpath.py +4 -4
- cognite/neat/_graph/transformers/_value_type.py +54 -44
- cognite/neat/_issues/warnings/_external.py +1 -1
- cognite/neat/_rules/analysis/_base.py +1 -1
- cognite/neat/_rules/analysis/_information.py +14 -13
- cognite/neat/_rules/catalog/__init__.py +1 -0
- cognite/neat/_rules/catalog/classic_model.xlsx +0 -0
- cognite/neat/_rules/catalog/info-rules-imf.xlsx +0 -0
- cognite/neat/_rules/importers/_dms2rules.py +7 -5
- cognite/neat/_rules/importers/_rdf/_inference2rules.py +5 -3
- cognite/neat/_rules/models/_base_rules.py +0 -12
- cognite/neat/_rules/models/_types.py +5 -0
- cognite/neat/_rules/models/dms/_rules.py +50 -2
- cognite/neat/_rules/models/information/_rules.py +48 -5
- cognite/neat/_rules/models/information/_rules_input.py +1 -1
- cognite/neat/_rules/models/mapping/_classic2core.py +4 -5
- cognite/neat/_rules/models/mapping/_classic2core.yaml +70 -58
- cognite/neat/_rules/transformers/__init__.py +4 -0
- cognite/neat/_rules/transformers/_converters.py +209 -62
- cognite/neat/_rules/transformers/_mapping.py +3 -2
- cognite/neat/_session/_base.py +8 -13
- cognite/neat/_session/_inspect.py +6 -2
- cognite/neat/_session/_mapping.py +22 -13
- cognite/neat/_session/_prepare.py +9 -57
- cognite/neat/_session/_read.py +96 -29
- cognite/neat/_session/_set.py +9 -0
- cognite/neat/_session/_state.py +10 -1
- cognite/neat/_session/_to.py +51 -15
- cognite/neat/_session/exceptions.py +7 -3
- cognite/neat/_store/_graph_store.py +85 -39
- cognite/neat/_store/_rules_store.py +22 -0
- cognite/neat/_utils/auth.py +2 -0
- cognite/neat/_utils/collection_.py +32 -11
- cognite/neat/_version.py +1 -1
- {cognite_neat-0.105.2.dist-info → cognite_neat-0.107.0.dist-info}/METADATA +2 -8
- {cognite_neat-0.105.2.dist-info → cognite_neat-0.107.0.dist-info}/RECORD +54 -52
- {cognite_neat-0.105.2.dist-info → cognite_neat-0.107.0.dist-info}/WHEEL +1 -1
- {cognite_neat-0.105.2.dist-info → cognite_neat-0.107.0.dist-info}/LICENSE +0 -0
- {cognite_neat-0.105.2.dist-info → cognite_neat-0.107.0.dist-info}/entry_points.txt +0 -0
cognite/neat/_config.py
CHANGED
|
@@ -1,265 +1,11 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
import os
|
|
4
|
-
import shutil
|
|
5
|
-
import sys
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Any, Literal, cast
|
|
1
|
+
from typing import Literal
|
|
8
2
|
|
|
9
|
-
import yaml
|
|
10
|
-
from pydantic import BaseModel, Field, model_validator
|
|
11
|
-
from yaml import safe_load
|
|
3
|
+
from pydantic import BaseModel
|
|
12
4
|
|
|
13
|
-
from cognite.neat._constants import EXAMPLE_GRAPHS, EXAMPLE_RULES, EXAMPLE_WORKFLOWS
|
|
14
|
-
from cognite.neat._utils.auth import EnvironmentVariables
|
|
15
5
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
else:
|
|
20
|
-
from backports.strenum import StrEnum
|
|
21
|
-
from typing_extensions import Self
|
|
6
|
+
class NeatConfig(BaseModel, validate_assignment=True):
|
|
7
|
+
progress_bar: Literal["tqdm", "rich", "tqdm-notebook", "infer"] | None = "infer"
|
|
8
|
+
use_iterate_bar_threshold: int | None = 500
|
|
22
9
|
|
|
23
|
-
LOG_FORMAT = "%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s"
|
|
24
|
-
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
|
25
10
|
|
|
26
|
-
|
|
27
|
-
class RulesStoreType(StrEnum):
|
|
28
|
-
"""Rules Store type"""
|
|
29
|
-
|
|
30
|
-
CDF = "cdf"
|
|
31
|
-
FILE = "file"
|
|
32
|
-
URL = "url"
|
|
33
|
-
GOOGLE_SHEET = "google_sheet"
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class WorkflowsStoreType(StrEnum):
|
|
37
|
-
"""Workflows Store type"""
|
|
38
|
-
|
|
39
|
-
CDF = "cdf"
|
|
40
|
-
FILE = "file"
|
|
41
|
-
URL = "url"
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class Config(BaseModel, arbitrary_types_allowed=True):
|
|
45
|
-
workflows_store_type: WorkflowsStoreType = WorkflowsStoreType.FILE
|
|
46
|
-
data_store_path: Path = Field(default_factory=lambda: Path.cwd() / "data")
|
|
47
|
-
|
|
48
|
-
workflow_downloader_filter: list[str] | None = Field(
|
|
49
|
-
description="List of workflow names+tags to filter on when downloading workflows from CDF. "
|
|
50
|
-
"Example name:workflow_name=version,tag:tag_name",
|
|
51
|
-
default=None,
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
cdf_auth_config: EnvironmentVariables = Field(default_factory=EnvironmentVariables.default)
|
|
55
|
-
cdf_default_dataset_id: int = 0
|
|
56
|
-
load_examples: bool = True
|
|
57
|
-
|
|
58
|
-
log_level: Literal["ERROR", "WARNING", "INFO", "DEBUG"] = "INFO"
|
|
59
|
-
log_format: str = LOG_FORMAT
|
|
60
|
-
download_workflows_from_cdf: bool = Field(
|
|
61
|
-
default=False,
|
|
62
|
-
description="Downloads all workflows from CDF automatically and stores them locally",
|
|
63
|
-
)
|
|
64
|
-
stop_on_error: bool = False
|
|
65
|
-
|
|
66
|
-
@model_validator(mode="before")
|
|
67
|
-
def backwards_compatible(cls, data: Any):
|
|
68
|
-
if not isinstance(data, dict):
|
|
69
|
-
return data
|
|
70
|
-
if "cdf_client" in data:
|
|
71
|
-
cdf_client = data["cdf_client"]
|
|
72
|
-
if isinstance(cdf_client, dict):
|
|
73
|
-
if "base_url" in cdf_client:
|
|
74
|
-
base_url = cdf_client["base_url"]
|
|
75
|
-
cluster = base_url.removeprefix("https://").removesuffix(".cognitedata.com")
|
|
76
|
-
else:
|
|
77
|
-
base_url, cluster = "Missing", "Missing"
|
|
78
|
-
if "scopes" in cdf_client:
|
|
79
|
-
scopes = cdf_client["scopes"]
|
|
80
|
-
if isinstance(scopes, list):
|
|
81
|
-
scopes = ",".join(scopes)
|
|
82
|
-
else:
|
|
83
|
-
scopes = "Missing"
|
|
84
|
-
data["cdf_auth_config"] = EnvironmentVariables(
|
|
85
|
-
CDF_PROJECT=cdf_client.get("project", "Missing"),
|
|
86
|
-
CDF_CLUSTER=cluster,
|
|
87
|
-
CDF_URL=base_url,
|
|
88
|
-
IDP_CLIENT_ID=cdf_client.get("client_id", "Missing"),
|
|
89
|
-
IDP_CLIENT_SECRET=cdf_client.get("client_secret", "Missing"),
|
|
90
|
-
IDP_TOKEN_URL=cdf_client.get("token_url", "Missing"),
|
|
91
|
-
IDP_SCOPES=scopes,
|
|
92
|
-
CDF_TIMEOUT=int(cdf_client.get("timeout", 60)),
|
|
93
|
-
CDF_MAX_WORKERS=int(cdf_client.get("max_workers", 3)),
|
|
94
|
-
)
|
|
95
|
-
return data
|
|
96
|
-
|
|
97
|
-
def as_legacy_config(
|
|
98
|
-
self,
|
|
99
|
-
) -> dict[str, Any]:
|
|
100
|
-
config: dict[str, Any] = {}
|
|
101
|
-
|
|
102
|
-
config["workflows_store_type"] = self.workflows_store_type
|
|
103
|
-
config["data_store_path"] = str(self.data_store_path)
|
|
104
|
-
config["workflows_downloader_filter"] = self.workflow_downloader_filter
|
|
105
|
-
|
|
106
|
-
config["cdf_client"] = {}
|
|
107
|
-
if self.cdf_auth_config.CDF_PROJECT not in {"Missing", "NOT SET"}:
|
|
108
|
-
config["cdf_client"]["project"] = self.cdf_auth_config.CDF_PROJECT
|
|
109
|
-
if self.cdf_auth_config.CDF_CLUSTER not in {"Missing", "NOT SET"}:
|
|
110
|
-
config["cdf_client"]["cluster"] = self.cdf_auth_config.CDF_CLUSTER
|
|
111
|
-
if self.cdf_auth_config.CDF_URL:
|
|
112
|
-
config["cdf_client"]["base_url"] = self.cdf_auth_config.CDF_URL
|
|
113
|
-
if self.cdf_auth_config.IDP_CLIENT_ID:
|
|
114
|
-
config["cdf_client"]["client_id"] = self.cdf_auth_config.IDP_CLIENT_ID
|
|
115
|
-
if self.cdf_auth_config.IDP_CLIENT_SECRET:
|
|
116
|
-
config["cdf_client"]["client_secret"] = self.cdf_auth_config.IDP_CLIENT_SECRET
|
|
117
|
-
if self.cdf_auth_config.IDP_TOKEN_URL:
|
|
118
|
-
config["cdf_client"]["token_url"] = self.cdf_auth_config.IDP_TOKEN_URL
|
|
119
|
-
if self.cdf_auth_config.IDP_SCOPES:
|
|
120
|
-
config["cdf_client"]["scopes"] = self.cdf_auth_config.idp_scopes
|
|
121
|
-
if self.cdf_auth_config.CDF_TIMEOUT:
|
|
122
|
-
config["cdf_client"]["timeout"] = self.cdf_auth_config.CDF_TIMEOUT
|
|
123
|
-
if self.cdf_auth_config.CDF_MAX_WORKERS:
|
|
124
|
-
config["cdf_client"]["max_workers"] = self.cdf_auth_config.CDF_MAX_WORKERS
|
|
125
|
-
|
|
126
|
-
config["cdf_default_dataset_id"] = self.cdf_default_dataset_id
|
|
127
|
-
config["load_examples"] = self.load_examples
|
|
128
|
-
config["log_level"] = self.log_level
|
|
129
|
-
config["log_format"] = self.log_format
|
|
130
|
-
config["download_workflows_from_cdf"] = self.download_workflows_from_cdf
|
|
131
|
-
config["stop_on_error"] = self.stop_on_error
|
|
132
|
-
|
|
133
|
-
return config
|
|
134
|
-
|
|
135
|
-
@property
|
|
136
|
-
def _dir_suffix(self) -> str:
|
|
137
|
-
is_test_running = "pytest" in sys.modules
|
|
138
|
-
if is_test_running:
|
|
139
|
-
# Todo change the below to f"-{os.getpid()}" when all tests supports parallel execution.
|
|
140
|
-
return ""
|
|
141
|
-
return ""
|
|
142
|
-
|
|
143
|
-
@property
|
|
144
|
-
def rules_store_path(self) -> Path:
|
|
145
|
-
return self.data_store_path / f"rules{self._dir_suffix}"
|
|
146
|
-
|
|
147
|
-
@property
|
|
148
|
-
def workflows_store_path(self) -> Path:
|
|
149
|
-
return self.data_store_path / f"workflows{self._dir_suffix}"
|
|
150
|
-
|
|
151
|
-
@property
|
|
152
|
-
def source_graph_path(self) -> Path:
|
|
153
|
-
return self.data_store_path / f"source-graphs{self._dir_suffix}"
|
|
154
|
-
|
|
155
|
-
@property
|
|
156
|
-
def staging_path(self) -> Path:
|
|
157
|
-
return self.data_store_path / f"staging{self._dir_suffix}"
|
|
158
|
-
|
|
159
|
-
@classmethod
|
|
160
|
-
def from_yaml(cls, filepath: Path) -> Self:
|
|
161
|
-
return cls(**safe_load(filepath.read_text()))
|
|
162
|
-
|
|
163
|
-
def to_yaml(self, filepath: Path):
|
|
164
|
-
# Parse as json to avoid Path and Enum objects
|
|
165
|
-
dump = json.loads(self.model_dump_json())
|
|
166
|
-
|
|
167
|
-
with filepath.open("w") as f:
|
|
168
|
-
yaml.safe_dump(dump, f)
|
|
169
|
-
|
|
170
|
-
@classmethod
|
|
171
|
-
def from_env(cls) -> Self:
|
|
172
|
-
missing = "Missing"
|
|
173
|
-
# This is to be backwards compatible with the old config
|
|
174
|
-
|
|
175
|
-
base_url: str | None = None
|
|
176
|
-
if "NEAT_CDF_BASE_URL" in os.environ:
|
|
177
|
-
base_url = os.environ["NEAT_CDF_BASE_URL"]
|
|
178
|
-
if isinstance(base_url, str):
|
|
179
|
-
cluster = base_url.removeprefix("https://").removesuffix(".cognitedata.com")
|
|
180
|
-
else:
|
|
181
|
-
cluster = missing
|
|
182
|
-
variables = EnvironmentVariables(
|
|
183
|
-
CDF_PROJECT=os.environ.get("NEAT_CDF_PROJECT", missing),
|
|
184
|
-
CDF_CLUSTER=cluster,
|
|
185
|
-
CDF_URL=base_url,
|
|
186
|
-
IDP_CLIENT_ID=os.environ.get("NEAT_CDF_CLIENT_ID"),
|
|
187
|
-
IDP_CLIENT_SECRET=os.environ.get("NEAT_CDF_CLIENT_SECRET"),
|
|
188
|
-
IDP_TOKEN_URL=os.environ.get("NEAT_CDF_TOKEN_URL"),
|
|
189
|
-
IDP_SCOPES=os.environ.get("NEAT_CDF_SCOPES"),
|
|
190
|
-
CDF_TIMEOUT=int(os.environ["NEAT_CDF_CLIENT_TIMEOUT"] if "NEAT_CDF_CLIENT_TIMEOUT" in os.environ else 60),
|
|
191
|
-
CDF_MAX_WORKERS=int(
|
|
192
|
-
os.environ["NEAT_CDF_CLIENT_MAX_WORKERS"] if "NEAT_CDF_CLIENT_MAX_WORKERS" in os.environ else 3
|
|
193
|
-
),
|
|
194
|
-
)
|
|
195
|
-
|
|
196
|
-
if workflow_downloader_filter_value := os.environ.get("NEAT_WORKFLOW_DOWNLOADER_FILTER", None):
|
|
197
|
-
workflow_downloader_filter = workflow_downloader_filter_value.split(",")
|
|
198
|
-
else:
|
|
199
|
-
workflow_downloader_filter = None
|
|
200
|
-
|
|
201
|
-
return cls(
|
|
202
|
-
cdf_auth_config=variables,
|
|
203
|
-
workflows_store_type=os.environ.get( # type: ignore[arg-type]
|
|
204
|
-
"NEAT_WORKFLOWS_STORE_TYPE", WorkflowsStoreType.FILE
|
|
205
|
-
),
|
|
206
|
-
data_store_path=Path(os.environ.get("NEAT_DATA_PATH", "_app/data")),
|
|
207
|
-
cdf_default_dataset_id=int(os.environ.get("NEAT_CDF_DEFAULT_DATASET_ID", 6476640149881990)),
|
|
208
|
-
log_level=cast(
|
|
209
|
-
Literal["ERROR", "WARNING", "INFO", "DEBUG"],
|
|
210
|
-
os.environ.get("NEAT_LOG_LEVEL", "INFO"),
|
|
211
|
-
),
|
|
212
|
-
workflow_downloader_filter=workflow_downloader_filter,
|
|
213
|
-
load_examples=bool(os.environ.get("NEAT_LOAD_EXAMPLES", True) in ["True", "true", "1"]),
|
|
214
|
-
)
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
def copy_examples_to_directory(config: Config):
|
|
218
|
-
"""
|
|
219
|
-
Copier over all the examples to the target_data_directory,
|
|
220
|
-
without overwriting
|
|
221
|
-
|
|
222
|
-
Args:
|
|
223
|
-
target_data_dir : The target directory
|
|
224
|
-
suffix : The suffix to add to the directory names
|
|
225
|
-
|
|
226
|
-
"""
|
|
227
|
-
|
|
228
|
-
print(f"Copying examples into {config.data_store_path}")
|
|
229
|
-
_copy_examples(EXAMPLE_RULES, config.rules_store_path)
|
|
230
|
-
_copy_examples(EXAMPLE_GRAPHS, config.source_graph_path)
|
|
231
|
-
_copy_examples(EXAMPLE_WORKFLOWS, config.workflows_store_path)
|
|
232
|
-
config.staging_path.mkdir(exist_ok=True, parents=True)
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
def create_data_dir_structure(config: Config) -> None:
|
|
236
|
-
"""
|
|
237
|
-
Create the data directory structure in empty directory
|
|
238
|
-
|
|
239
|
-
Args:
|
|
240
|
-
target_data_dir : The target directory
|
|
241
|
-
suffix : The suffix to add to the directory names
|
|
242
|
-
|
|
243
|
-
"""
|
|
244
|
-
for path in (
|
|
245
|
-
config.rules_store_path,
|
|
246
|
-
config.source_graph_path,
|
|
247
|
-
config.staging_path,
|
|
248
|
-
config.workflows_store_path,
|
|
249
|
-
):
|
|
250
|
-
path.mkdir(exist_ok=True, parents=True)
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def _copy_examples(source_dir: Path, target_dir: Path):
|
|
254
|
-
for current in source_dir.rglob("*"):
|
|
255
|
-
if current.is_dir():
|
|
256
|
-
continue
|
|
257
|
-
relative = current.relative_to(source_dir)
|
|
258
|
-
if not (target := target_dir / relative).exists():
|
|
259
|
-
target.parent.mkdir(exist_ok=True, parents=True)
|
|
260
|
-
shutil.copy2(current, target)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
def configure_logging(level: str = "DEBUG", log_format: str = LOG_FORMAT):
|
|
264
|
-
"""Configure logging based on config."""
|
|
265
|
-
logging.basicConfig(format=log_format, level=logging.getLevelName(level), datefmt=LOG_DATE_FORMAT)
|
|
11
|
+
GLOBAL_CONFIG = NeatConfig()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from cognite.neat._session.engine._interface import Extractor as EngineExtractor
|
|
2
2
|
|
|
3
|
-
from ._base import BaseExtractor
|
|
3
|
+
from ._base import BaseExtractor, KnowledgeGraphExtractor
|
|
4
4
|
from ._classic_cdf._assets import AssetsExtractor
|
|
5
5
|
from ._classic_cdf._classic import ClassicGraphExtractor
|
|
6
6
|
from ._classic_cdf._data_sets import DataSetExtractor
|
|
@@ -12,6 +12,7 @@ from ._classic_cdf._sequences import SequencesExtractor
|
|
|
12
12
|
from ._classic_cdf._timeseries import TimeSeriesExtractor
|
|
13
13
|
from ._dexpi import DexpiExtractor
|
|
14
14
|
from ._dms import DMSExtractor
|
|
15
|
+
from ._dms_graph import DMSGraphExtractor
|
|
15
16
|
from ._iodd import IODDExtractor
|
|
16
17
|
from ._mock_graph_generator import MockGraphGenerator
|
|
17
18
|
from ._rdf_file import RdfFileExtractor
|
|
@@ -21,11 +22,13 @@ __all__ = [
|
|
|
21
22
|
"BaseExtractor",
|
|
22
23
|
"ClassicGraphExtractor",
|
|
23
24
|
"DMSExtractor",
|
|
25
|
+
"DMSGraphExtractor",
|
|
24
26
|
"DataSetExtractor",
|
|
25
27
|
"DexpiExtractor",
|
|
26
28
|
"EventsExtractor",
|
|
27
29
|
"FilesExtractor",
|
|
28
30
|
"IODDExtractor",
|
|
31
|
+
"KnowledgeGraphExtractor",
|
|
29
32
|
"LabelsExtractor",
|
|
30
33
|
"MockGraphGenerator",
|
|
31
34
|
"RdfFileExtractor",
|
|
@@ -51,6 +54,7 @@ TripleExtractors = (
|
|
|
51
54
|
| ClassicGraphExtractor
|
|
52
55
|
| DataSetExtractor
|
|
53
56
|
| EngineExtractor
|
|
57
|
+
| DMSGraphExtractor
|
|
54
58
|
)
|
|
55
59
|
|
|
56
60
|
|
|
@@ -1,9 +1,17 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
2
|
from collections.abc import Iterable
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
3
4
|
|
|
5
|
+
from rdflib import URIRef
|
|
6
|
+
|
|
7
|
+
from cognite.neat._constants import DEFAULT_NAMESPACE
|
|
8
|
+
from cognite.neat._rules.models import InformationRules
|
|
4
9
|
from cognite.neat._shared import Triple
|
|
5
10
|
from cognite.neat._utils.auxiliary import class_html_doc
|
|
6
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from cognite.neat._store._provenance import Agent as ProvenanceAgent
|
|
14
|
+
|
|
7
15
|
|
|
8
16
|
class BaseExtractor:
|
|
9
17
|
"""This is the base class for all extractors. It defines the interface that
|
|
@@ -24,3 +32,27 @@ class BaseExtractor:
|
|
|
24
32
|
@classmethod
|
|
25
33
|
def _repr_html_(cls) -> str:
|
|
26
34
|
return class_html_doc(cls)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class KnowledgeGraphExtractor(BaseExtractor):
|
|
38
|
+
"""A knowledge graph extractor extracts triples with a schema"""
|
|
39
|
+
|
|
40
|
+
@abstractmethod
|
|
41
|
+
def get_information_rules(self) -> InformationRules:
|
|
42
|
+
"""Returns the information rules that the extractor uses."""
|
|
43
|
+
raise NotImplementedError()
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def description(self) -> str:
|
|
47
|
+
return self.__doc__.strip().split("\n")[0] if self.__doc__ else "Missing"
|
|
48
|
+
|
|
49
|
+
@property
|
|
50
|
+
def source_uri(self) -> URIRef:
|
|
51
|
+
raise NotImplementedError
|
|
52
|
+
|
|
53
|
+
@property
|
|
54
|
+
def agent(self) -> "ProvenanceAgent":
|
|
55
|
+
"""Provenance agent for the importer."""
|
|
56
|
+
from cognite.neat._store._provenance import Agent as ProvenanceAgent
|
|
57
|
+
|
|
58
|
+
return ProvenanceAgent(id_=DEFAULT_NAMESPACE[f"agent/{type(self).__name__}"])
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import re
|
|
3
3
|
import sys
|
|
4
|
+
import warnings
|
|
4
5
|
from abc import ABC, abstractmethod
|
|
5
6
|
from collections.abc import Callable, Iterable, Sequence, Set
|
|
6
7
|
from datetime import datetime, timezone
|
|
@@ -9,13 +10,16 @@ from typing import Any, Generic, TypeVar
|
|
|
9
10
|
|
|
10
11
|
from cognite.client import CogniteClient
|
|
11
12
|
from cognite.client.data_classes._base import WriteableCogniteResource
|
|
13
|
+
from cognite.client.exceptions import CogniteAPIError
|
|
12
14
|
from pydantic import AnyHttpUrl, ValidationError
|
|
13
15
|
from rdflib import RDF, XSD, Literal, Namespace, URIRef
|
|
14
16
|
|
|
15
17
|
from cognite.neat._constants import DEFAULT_NAMESPACE
|
|
16
18
|
from cognite.neat._graph.extractors._base import BaseExtractor
|
|
19
|
+
from cognite.neat._issues.warnings import CDFAuthWarning
|
|
17
20
|
from cognite.neat._shared import Triple
|
|
18
21
|
from cognite.neat._utils.auxiliary import string_to_ideal_type
|
|
22
|
+
from cognite.neat._utils.collection_ import iterate_progress_bar_if_above_config_threshold
|
|
19
23
|
|
|
20
24
|
T_CogniteResource = TypeVar("T_CogniteResource", bound=WriteableCogniteResource)
|
|
21
25
|
|
|
@@ -85,6 +89,7 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
85
89
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
86
90
|
camel_case: bool = True,
|
|
87
91
|
as_write: bool = False,
|
|
92
|
+
prefix: str | None = None,
|
|
88
93
|
):
|
|
89
94
|
self.namespace = namespace or DEFAULT_NAMESPACE
|
|
90
95
|
self.items = items
|
|
@@ -95,20 +100,15 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
95
100
|
self.skip_metadata_values = skip_metadata_values
|
|
96
101
|
self.camel_case = camel_case
|
|
97
102
|
self.as_write = as_write
|
|
103
|
+
self.prefix = prefix
|
|
98
104
|
|
|
99
105
|
def extract(self) -> Iterable[Triple]:
|
|
100
106
|
"""Extracts an asset with the given asset_id."""
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
else:
|
|
107
|
-
to_iterate = track(
|
|
108
|
-
self.items,
|
|
109
|
-
total=self.limit or self.total,
|
|
110
|
-
description=f"Extracting {type(self).__name__.removesuffix('Extractor')}",
|
|
111
|
-
)
|
|
107
|
+
|
|
108
|
+
if self.total is not None and self.total > 0:
|
|
109
|
+
to_iterate = iterate_progress_bar_if_above_config_threshold(
|
|
110
|
+
self.items, self.total, f"Extracting {type(self).__name__.removesuffix('Extractor')}"
|
|
111
|
+
)
|
|
112
112
|
else:
|
|
113
113
|
to_iterate = self.items
|
|
114
114
|
for no, asset in enumerate(to_iterate):
|
|
@@ -176,6 +176,8 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
176
176
|
type_ = self._default_rdf_type
|
|
177
177
|
if self.to_type:
|
|
178
178
|
type_ = self.to_type(item) or type_
|
|
179
|
+
if self.prefix:
|
|
180
|
+
type_ = f"{self.prefix}{type_}"
|
|
179
181
|
return self._SPACE_PATTERN.sub("_", type_)
|
|
180
182
|
|
|
181
183
|
def _as_object(self, raw: Any, key: str) -> Literal | URIRef:
|
|
@@ -220,9 +222,12 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
220
222
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
221
223
|
camel_case: bool = True,
|
|
222
224
|
as_write: bool = False,
|
|
225
|
+
prefix: str | None = None,
|
|
223
226
|
):
|
|
224
|
-
total, items = cls._from_dataset(client, data_set_external_id)
|
|
225
|
-
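return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)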
|
|
227
|
+
total, items = cls._handle_no_access(lambda: cls._from_dataset(client, data_set_external_id))
|
|
228
|
+
return cls(
|
|
229
|
+
items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write, prefix
|
|
230
|
+
)
|
|
226
231
|
|
|
227
232
|
@classmethod
|
|
228
233
|
@abstractmethod
|
|
@@ -243,9 +248,12 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
243
248
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
244
249
|
camel_case: bool = True,
|
|
245
250
|
as_write: bool = False,
|
|
251
|
+
prefix: str | None = None,
|
|
246
252
|
):
|
|
247
|
-
total, items = cls._from_hierarchy(client, root_asset_external_id)
|
|
248
|
-
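return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)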
|
|
253
|
+
total, items = cls._handle_no_access(lambda: cls._from_hierarchy(client, root_asset_external_id))
|
|
254
|
+
return cls(
|
|
255
|
+
items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write, prefix
|
|
256
|
+
)
|
|
249
257
|
|
|
250
258
|
@classmethod
|
|
251
259
|
@abstractmethod
|
|
@@ -265,11 +273,29 @@ class ClassicCDFBaseExtractor(BaseExtractor, ABC, Generic[T_CogniteResource]):
|
|
|
265
273
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
266
274
|
camel_case: bool = True,
|
|
267
275
|
as_write: bool = False,
|
|
276
|
+
prefix: str | None = None,
|
|
268
277
|
):
|
|
269
278
|
total, items = cls._from_file(file_path)
|
|
270
|
-
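return cls(items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write)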
|
|
279
|
+
return cls(
|
|
280
|
+
items, namespace, to_type, total, limit, unpack_metadata, skip_metadata_values, camel_case, as_write, prefix
|
|
281
|
+
)
|
|
271
282
|
|
|
272
283
|
@classmethod
|
|
273
284
|
@abstractmethod
|
|
274
285
|
def _from_file(cls, file_path: str | Path) -> tuple[int | None, Iterable[T_CogniteResource]]:
|
|
275
286
|
raise NotImplementedError
|
|
287
|
+
|
|
288
|
+
@classmethod
|
|
289
|
+
def _handle_no_access(
|
|
290
|
+
cls, action: Callable[[], tuple[int | None, Iterable[T_CogniteResource]]]
|
|
291
|
+
) -> tuple[int | None, Iterable[T_CogniteResource]]:
|
|
292
|
+
try:
|
|
293
|
+
return action()
|
|
294
|
+
except CogniteAPIError as e:
|
|
295
|
+
if e.code == 403:
|
|
296
|
+
warnings.warn(
|
|
297
|
+
CDFAuthWarning(f"extract {cls.__name__.removesuffix('Extractor').casefold()}", str(e)), stacklevel=2
|
|
298
|
+
)
|
|
299
|
+
return 0, []
|
|
300
|
+
else:
|
|
301
|
+
raise e
|
|
@@ -1,18 +1,25 @@
|
|
|
1
1
|
import warnings
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
from collections.abc import Iterable, Sequence
|
|
4
|
-
from typing import ClassVar, NamedTuple
|
|
4
|
+
from typing import ClassVar, NamedTuple, cast
|
|
5
5
|
|
|
6
6
|
from cognite.client import CogniteClient
|
|
7
7
|
from cognite.client.exceptions import CogniteAPIError
|
|
8
|
-
from rdflib import Namespace
|
|
8
|
+
from rdflib import Namespace, URIRef
|
|
9
9
|
|
|
10
|
-
from cognite.neat._constants import CLASSIC_CDF_NAMESPACE
|
|
11
|
-
from cognite.neat._graph.extractors._base import BaseExtractor
|
|
10
|
+
from cognite.neat._constants import CLASSIC_CDF_NAMESPACE, DEFAULT_NAMESPACE, get_default_prefixes_and_namespaces
|
|
11
|
+
from cognite.neat._graph.extractors._base import KnowledgeGraphExtractor
|
|
12
|
+
from cognite.neat._issues.errors import NeatValueError
|
|
12
13
|
from cognite.neat._issues.warnings import CDFAuthWarning
|
|
14
|
+
from cognite.neat._rules._shared import ReadRules
|
|
15
|
+
from cognite.neat._rules.catalog import classic_model
|
|
16
|
+
from cognite.neat._rules.models import InformationInputRules, InformationRules
|
|
17
|
+
from cognite.neat._rules.models._rdfpath import Entity as RDFPathEntity
|
|
18
|
+
from cognite.neat._rules.models._rdfpath import RDFPath, SingleProperty
|
|
13
19
|
from cognite.neat._shared import Triple
|
|
14
20
|
from cognite.neat._utils.collection_ import chunker, iterate_progress_bar
|
|
15
21
|
from cognite.neat._utils.rdf_ import remove_namespace_from_uri
|
|
22
|
+
from cognite.neat._utils.text import to_snake
|
|
16
23
|
|
|
17
24
|
from ._assets import AssetsExtractor
|
|
18
25
|
from ._base import ClassicCDFBaseExtractor, InstanceIdPrefix
|
|
@@ -37,7 +44,7 @@ class _ClassicCoreType(NamedTuple):
|
|
|
37
44
|
api_name: str
|
|
38
45
|
|
|
39
46
|
|
|
40
|
-
class ClassicGraphExtractor(BaseExtractor):
|
|
47
|
+
class ClassicGraphExtractor(KnowledgeGraphExtractor):
|
|
41
48
|
"""This extractor extracts all classic CDF Resources.
|
|
42
49
|
|
|
43
50
|
The Classic Graph consists of the following core resource type.
|
|
@@ -93,6 +100,7 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
93
100
|
root_asset_external_id: str | None = None,
|
|
94
101
|
namespace: Namespace | None = None,
|
|
95
102
|
limit_per_type: int | None = None,
|
|
103
|
+
prefix: str | None = None,
|
|
96
104
|
):
|
|
97
105
|
self._client = client
|
|
98
106
|
if sum([bool(data_set_external_id), bool(root_asset_external_id)]) != 1:
|
|
@@ -101,8 +109,14 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
101
109
|
self._data_set_external_id = data_set_external_id
|
|
102
110
|
self._namespace = namespace or CLASSIC_CDF_NAMESPACE
|
|
103
111
|
self._extractor_args = dict(
|
|
104
|
-
namespace=self._namespace,
|
|
112
|
+
namespace=self._namespace,
|
|
113
|
+
unpack_metadata=False,
|
|
114
|
+
as_write=True,
|
|
115
|
+
camel_case=True,
|
|
116
|
+
limit=limit_per_type,
|
|
117
|
+
prefix=prefix,
|
|
105
118
|
)
|
|
119
|
+
self._prefix = prefix
|
|
106
120
|
self._limit_per_type = limit_per_type
|
|
107
121
|
|
|
108
122
|
self._source_external_ids_by_type: dict[InstanceIdPrefix, set[str]] = defaultdict(set)
|
|
@@ -144,6 +158,59 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
144
158
|
else:
|
|
145
159
|
self._extracted_data_sets = True
|
|
146
160
|
|
|
161
|
+
def get_information_rules(self) -> InformationRules:
|
|
162
|
+
# To avoid circular imports
|
|
163
|
+
from cognite.neat._rules.importers import ExcelImporter
|
|
164
|
+
|
|
165
|
+
unverified = cast(ReadRules[InformationInputRules], ExcelImporter(classic_model).to_rules())
|
|
166
|
+
if unverified.rules is None:
|
|
167
|
+
raise NeatValueError(f"Could not read the classic model rules from {classic_model}.")
|
|
168
|
+
|
|
169
|
+
verified = unverified.rules.as_verified_rules()
|
|
170
|
+
prefixes = get_default_prefixes_and_namespaces()
|
|
171
|
+
instance_prefix: str | None = next((k for k, v in prefixes.items() if v == self._namespace), None)
|
|
172
|
+
if instance_prefix is None:
|
|
173
|
+
# We need to add a new prefix
|
|
174
|
+
instance_prefix = f"prefix_{len(prefixes) + 1}"
|
|
175
|
+
prefixes[instance_prefix] = self._namespace
|
|
176
|
+
verified.prefixes = prefixes
|
|
177
|
+
|
|
178
|
+
is_snake_case = self._extractor_args["camel_case"] is False
|
|
179
|
+
for prop in verified.properties:
|
|
180
|
+
prop_id = prop.property_
|
|
181
|
+
if is_snake_case:
|
|
182
|
+
prop_id = to_snake(prop_id)
|
|
183
|
+
prop.instance_source = RDFPath(
|
|
184
|
+
traversal=SingleProperty(
|
|
185
|
+
class_=RDFPathEntity(prefix=instance_prefix, suffix=prop.class_.suffix),
|
|
186
|
+
property=RDFPathEntity(prefix=instance_prefix, suffix=prop_id),
|
|
187
|
+
)
|
|
188
|
+
)
|
|
189
|
+
return verified
|
|
190
|
+
|
|
191
|
+
@property
|
|
192
|
+
def description(self) -> str:
|
|
193
|
+
if self._data_set_external_id:
|
|
194
|
+
source = f"data set {self._data_set_external_id}."
|
|
195
|
+
elif self._root_asset_external_id:
|
|
196
|
+
source = f"root asset {self._root_asset_external_id}."
|
|
197
|
+
else:
|
|
198
|
+
source = "unknown source."
|
|
199
|
+
return f"Extracting clasic CDF Graph (Assets, TimeSeries, Sequences, Events, Files) from {source}."
|
|
200
|
+
|
|
201
|
+
@property
|
|
202
|
+
def source_uri(self) -> URIRef:
|
|
203
|
+
if self._data_set_external_id:
|
|
204
|
+
resource = "dataset"
|
|
205
|
+
external_id = self._data_set_external_id
|
|
206
|
+
elif self._root_asset_external_id:
|
|
207
|
+
resource = "asset"
|
|
208
|
+
external_id = self._root_asset_external_id
|
|
209
|
+
else:
|
|
210
|
+
resource = "unknown"
|
|
211
|
+
external_id = "unknown"
|
|
212
|
+
return DEFAULT_NAMESPACE[f"{self._client.config.project}/{resource}/{external_id}"]
|
|
213
|
+
|
|
147
214
|
def _extract_core_start_nodes(self):
|
|
148
215
|
for core_node in self._classic_node_types:
|
|
149
216
|
if self._data_set_external_id:
|
|
@@ -217,7 +284,7 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
217
284
|
self._source_external_ids_by_type[resource_type].add(remove_namespace_from_uri(triple[2]))
|
|
218
285
|
elif triple[1] == self._namespace.labels:
|
|
219
286
|
self._labels.add(remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.label))
|
|
220
|
-
elif triple[1] == self._namespace.
|
|
287
|
+
elif triple[1] == self._namespace.dataSetId:
|
|
221
288
|
self._data_set_ids.add(
|
|
222
289
|
int(remove_namespace_from_uri(triple[2]).removeprefix(InstanceIdPrefix.data_set))
|
|
223
290
|
)
|
|
@@ -226,4 +293,7 @@ class ClassicGraphExtractor(BaseExtractor):
|
|
|
226
293
|
@staticmethod
|
|
227
294
|
def _chunk(items: Sequence, description: str) -> Iterable:
|
|
228
295
|
to_iterate: Iterable = chunker(items, chunk_size=1000)
|
|
229
|
-
|
|
296
|
+
if items:
|
|
297
|
+
return iterate_progress_bar(to_iterate, (len(items) // 1_000) + 1, description)
|
|
298
|
+
else:
|
|
299
|
+
return to_iterate
|
|
@@ -28,6 +28,7 @@ class RelationshipsExtractor(ClassicCDFBaseExtractor[Relationship]):
|
|
|
28
28
|
skip_metadata_values: Set[str] | None = DEFAULT_SKIP_METADATA_VALUES,
|
|
29
29
|
camel_case: bool = True,
|
|
30
30
|
as_write: bool = False,
|
|
31
|
+
prefix: str | None = None,
|
|
31
32
|
):
|
|
32
33
|
super().__init__(
|
|
33
34
|
items,
|
|
@@ -39,6 +40,7 @@ class RelationshipsExtractor(ClassicCDFBaseExtractor[Relationship]):
|
|
|
39
40
|
skip_metadata_values=skip_metadata_values,
|
|
40
41
|
camel_case=camel_case,
|
|
41
42
|
as_write=as_write,
|
|
43
|
+
prefix=prefix,
|
|
42
44
|
)
|
|
43
45
|
# This is used by the ClassicExtractor to log the target nodes, such
|
|
44
46
|
# that it can extract them.
|