cognite-neat 0.87.6__py3-none-any.whl → 0.88.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cognite-neat might be problematic. Click here for more details.
- cognite/neat/_version.py +1 -1
- cognite/neat/app/api/data_classes/rest.py +0 -19
- cognite/neat/app/api/explorer.py +6 -4
- cognite/neat/app/api/routers/crud.py +11 -21
- cognite/neat/app/api/routers/workflows.py +24 -94
- cognite/neat/graph/stores/_base.py +5 -0
- cognite/neat/rules/importers/_inference2rules.py +31 -35
- cognite/neat/workflows/steps/data_contracts.py +17 -43
- cognite/neat/workflows/steps/lib/current/graph_extractor.py +28 -24
- cognite/neat/workflows/steps/lib/current/graph_loader.py +4 -21
- cognite/neat/workflows/steps/lib/current/graph_store.py +18 -134
- cognite/neat/workflows/steps_registry.py +5 -7
- {cognite_neat-0.87.6.dist-info → cognite_neat-0.88.0.dist-info}/METADATA +1 -1
- {cognite_neat-0.87.6.dist-info → cognite_neat-0.88.0.dist-info}/RECORD +17 -125
- cognite/neat/app/api/routers/core.py +0 -91
- cognite/neat/app/api/routers/data_exploration.py +0 -336
- cognite/neat/app/api/routers/rules.py +0 -203
- cognite/neat/legacy/__init__.py +0 -0
- cognite/neat/legacy/graph/__init__.py +0 -3
- cognite/neat/legacy/graph/examples/Knowledge-Graph-Nordic44-dirty.xml +0 -20182
- cognite/neat/legacy/graph/examples/Knowledge-Graph-Nordic44.xml +0 -20163
- cognite/neat/legacy/graph/examples/__init__.py +0 -10
- cognite/neat/legacy/graph/examples/skos-capturing-sheet-wind-topics.xlsx +0 -0
- cognite/neat/legacy/graph/exceptions.py +0 -90
- cognite/neat/legacy/graph/extractors/__init__.py +0 -6
- cognite/neat/legacy/graph/extractors/_base.py +0 -14
- cognite/neat/legacy/graph/extractors/_dexpi.py +0 -44
- cognite/neat/legacy/graph/extractors/_graph_capturing_sheet.py +0 -403
- cognite/neat/legacy/graph/extractors/_mock_graph_generator.py +0 -361
- cognite/neat/legacy/graph/loaders/__init__.py +0 -23
- cognite/neat/legacy/graph/loaders/_asset_loader.py +0 -511
- cognite/neat/legacy/graph/loaders/_base.py +0 -67
- cognite/neat/legacy/graph/loaders/_exceptions.py +0 -85
- cognite/neat/legacy/graph/loaders/core/__init__.py +0 -0
- cognite/neat/legacy/graph/loaders/core/labels.py +0 -58
- cognite/neat/legacy/graph/loaders/core/models.py +0 -136
- cognite/neat/legacy/graph/loaders/core/rdf_to_assets.py +0 -1046
- cognite/neat/legacy/graph/loaders/core/rdf_to_relationships.py +0 -559
- cognite/neat/legacy/graph/loaders/rdf_to_dms.py +0 -309
- cognite/neat/legacy/graph/loaders/validator.py +0 -87
- cognite/neat/legacy/graph/models.py +0 -6
- cognite/neat/legacy/graph/stores/__init__.py +0 -13
- cognite/neat/legacy/graph/stores/_base.py +0 -400
- cognite/neat/legacy/graph/stores/_graphdb_store.py +0 -52
- cognite/neat/legacy/graph/stores/_memory_store.py +0 -43
- cognite/neat/legacy/graph/stores/_oxigraph_store.py +0 -151
- cognite/neat/legacy/graph/stores/_oxrdflib.py +0 -247
- cognite/neat/legacy/graph/stores/_rdf_to_graph.py +0 -42
- cognite/neat/legacy/graph/transformations/__init__.py +0 -0
- cognite/neat/legacy/graph/transformations/entity_matcher.py +0 -101
- cognite/neat/legacy/graph/transformations/query_generator/__init__.py +0 -3
- cognite/neat/legacy/graph/transformations/query_generator/sparql.py +0 -575
- cognite/neat/legacy/graph/transformations/transformer.py +0 -322
- cognite/neat/legacy/rules/__init__.py +0 -0
- cognite/neat/legacy/rules/analysis.py +0 -231
- cognite/neat/legacy/rules/examples/Rules-Nordic44-to-graphql.xlsx +0 -0
- cognite/neat/legacy/rules/examples/Rules-Nordic44.xlsx +0 -0
- cognite/neat/legacy/rules/examples/__init__.py +0 -18
- cognite/neat/legacy/rules/examples/power-grid-containers.yaml +0 -124
- cognite/neat/legacy/rules/examples/power-grid-example.xlsx +0 -0
- cognite/neat/legacy/rules/examples/power-grid-model.yaml +0 -224
- cognite/neat/legacy/rules/examples/rules-template.xlsx +0 -0
- cognite/neat/legacy/rules/examples/sheet2cdf-transformation-rules.xlsx +0 -0
- cognite/neat/legacy/rules/examples/skos-rules.xlsx +0 -0
- cognite/neat/legacy/rules/examples/source-to-solution-mapping-rules.xlsx +0 -0
- cognite/neat/legacy/rules/examples/wind-energy.owl +0 -1511
- cognite/neat/legacy/rules/exceptions.py +0 -2972
- cognite/neat/legacy/rules/exporters/__init__.py +0 -20
- cognite/neat/legacy/rules/exporters/_base.py +0 -45
- cognite/neat/legacy/rules/exporters/_core/__init__.py +0 -5
- cognite/neat/legacy/rules/exporters/_core/rules2labels.py +0 -24
- cognite/neat/legacy/rules/exporters/_rules2dms.py +0 -885
- cognite/neat/legacy/rules/exporters/_rules2excel.py +0 -213
- cognite/neat/legacy/rules/exporters/_rules2graphql.py +0 -183
- cognite/neat/legacy/rules/exporters/_rules2ontology.py +0 -524
- cognite/neat/legacy/rules/exporters/_rules2pydantic_models.py +0 -748
- cognite/neat/legacy/rules/exporters/_rules2rules.py +0 -105
- cognite/neat/legacy/rules/exporters/_rules2triples.py +0 -38
- cognite/neat/legacy/rules/exporters/_validation.py +0 -146
- cognite/neat/legacy/rules/importers/__init__.py +0 -22
- cognite/neat/legacy/rules/importers/_base.py +0 -66
- cognite/neat/legacy/rules/importers/_dict2rules.py +0 -158
- cognite/neat/legacy/rules/importers/_dms2rules.py +0 -194
- cognite/neat/legacy/rules/importers/_graph2rules.py +0 -308
- cognite/neat/legacy/rules/importers/_json2rules.py +0 -39
- cognite/neat/legacy/rules/importers/_owl2rules/__init__.py +0 -3
- cognite/neat/legacy/rules/importers/_owl2rules/_owl2classes.py +0 -239
- cognite/neat/legacy/rules/importers/_owl2rules/_owl2metadata.py +0 -260
- cognite/neat/legacy/rules/importers/_owl2rules/_owl2properties.py +0 -217
- cognite/neat/legacy/rules/importers/_owl2rules/_owl2rules.py +0 -290
- cognite/neat/legacy/rules/importers/_spreadsheet2rules.py +0 -45
- cognite/neat/legacy/rules/importers/_xsd2rules.py +0 -20
- cognite/neat/legacy/rules/importers/_yaml2rules.py +0 -39
- cognite/neat/legacy/rules/models/__init__.py +0 -5
- cognite/neat/legacy/rules/models/_base.py +0 -151
- cognite/neat/legacy/rules/models/raw_rules.py +0 -316
- cognite/neat/legacy/rules/models/rdfpath.py +0 -237
- cognite/neat/legacy/rules/models/rules.py +0 -1289
- cognite/neat/legacy/rules/models/tables.py +0 -9
- cognite/neat/legacy/rules/models/value_types.py +0 -118
- cognite/neat/legacy/workflows/examples/Export_DMS/workflow.yaml +0 -89
- cognite/neat/legacy/workflows/examples/Export_Rules_to_Ontology/workflow.yaml +0 -152
- cognite/neat/legacy/workflows/examples/Extract_DEXPI_Graph_and_Export_Rules/workflow.yaml +0 -139
- cognite/neat/legacy/workflows/examples/Extract_RDF_Graph_and_Generate_Assets/workflow.yaml +0 -270
- cognite/neat/legacy/workflows/examples/Import_DMS/workflow.yaml +0 -65
- cognite/neat/legacy/workflows/examples/Ontology_to_Data_Model/workflow.yaml +0 -116
- cognite/neat/legacy/workflows/examples/Validate_Rules/workflow.yaml +0 -67
- cognite/neat/legacy/workflows/examples/Validate_Solution_Model/workflow.yaml +0 -64
- cognite/neat/legacy/workflows/examples/Visualize_Data_Model_Using_Mock_Graph/workflow.yaml +0 -95
- cognite/neat/legacy/workflows/examples/Visualize_Semantic_Data_Model/workflow.yaml +0 -111
- cognite/neat/workflows/examples/Extract_RDF_Graph_and_Generate_Assets/workflow.yaml +0 -270
- cognite/neat/workflows/migration/__init__.py +0 -0
- cognite/neat/workflows/migration/steps.py +0 -91
- cognite/neat/workflows/migration/wf_manifests.py +0 -33
- cognite/neat/workflows/steps/lib/legacy/__init__.py +0 -7
- cognite/neat/workflows/steps/lib/legacy/graph_contextualization.py +0 -82
- cognite/neat/workflows/steps/lib/legacy/graph_extractor.py +0 -746
- cognite/neat/workflows/steps/lib/legacy/graph_loader.py +0 -606
- cognite/neat/workflows/steps/lib/legacy/graph_store.py +0 -307
- cognite/neat/workflows/steps/lib/legacy/graph_transformer.py +0 -58
- cognite/neat/workflows/steps/lib/legacy/rules_exporter.py +0 -511
- cognite/neat/workflows/steps/lib/legacy/rules_importer.py +0 -612
- {cognite_neat-0.87.6.dist-info → cognite_neat-0.88.0.dist-info}/LICENSE +0 -0
- {cognite_neat-0.87.6.dist-info → cognite_neat-0.88.0.dist-info}/WHEEL +0 -0
- {cognite_neat-0.87.6.dist-info → cognite_neat-0.88.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,1046 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import sys
|
|
3
|
-
from collections.abc import Iterable, Mapping, Sequence
|
|
4
|
-
from dataclasses import dataclass, fields
|
|
5
|
-
from datetime import datetime
|
|
6
|
-
from typing import Any, Literal, TypeAlias, cast, overload
|
|
7
|
-
from warnings import warn
|
|
8
|
-
|
|
9
|
-
import numpy as np
|
|
10
|
-
import pandas as pd
|
|
11
|
-
from cognite.client import CogniteClient
|
|
12
|
-
from cognite.client.data_classes import Asset, AssetHierarchy, AssetList, AssetUpdate
|
|
13
|
-
from cognite.client.exceptions import CogniteDuplicatedError
|
|
14
|
-
from deepdiff import DeepDiff # type: ignore[import]
|
|
15
|
-
from rdflib import Graph
|
|
16
|
-
from rdflib.term import URIRef
|
|
17
|
-
|
|
18
|
-
from cognite.neat.legacy.graph.loaders.core.models import AssetTemplate
|
|
19
|
-
from cognite.neat.legacy.graph.stores import NeatGraphStoreBase
|
|
20
|
-
from cognite.neat.legacy.rules.models.rules import Property, Rules
|
|
21
|
-
from cognite.neat.utils.auxiliary import retry_decorator
|
|
22
|
-
from cognite.neat.utils.collection_ import chunker
|
|
23
|
-
from cognite.neat.utils.rdf_ import remove_namespace_from_uri
|
|
24
|
-
from cognite.neat.utils.time_ import datetime_utc_now
|
|
25
|
-
|
|
26
|
-
if sys.version_info >= (3, 11):
|
|
27
|
-
from datetime import UTC
|
|
28
|
-
from typing import Self
|
|
29
|
-
else:
|
|
30
|
-
from datetime import timezone
|
|
31
|
-
|
|
32
|
-
from typing_extensions import Self
|
|
33
|
-
|
|
34
|
-
UTC = timezone.utc
|
|
35
|
-
|
|
36
|
-
# DeepDiff paths ignored when comparing NEAT-generated assets against assets
# already in CDF: labels and the NEAT-managed lifecycle timestamps are expected
# to differ between runs and must not count as changes.
EXCLUDE_PATHS = [
    "root['labels']",
    "root['metadata']['create_time']",
    "root['metadata']['start_time']",
    "root['metadata']['update_time']",
    "root['metadata']['end_time']",
    "root['metadata']['resurrection_time']",  # need to account for assets that are brought back to life
]
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@dataclass
|
|
47
|
-
class NeatMetadataKeys:
|
|
48
|
-
"""Class holding mapping between NEAT metadata key names and their desired names in
|
|
49
|
-
in CDF Asset metadata
|
|
50
|
-
|
|
51
|
-
Args:
|
|
52
|
-
start_time: Start time key name
|
|
53
|
-
end_time: End time key name
|
|
54
|
-
update_time: Update time key name
|
|
55
|
-
resurrection_time: Resurrection time key name
|
|
56
|
-
identifier: Identifier key name
|
|
57
|
-
active: Active key name
|
|
58
|
-
type: Type key name
|
|
59
|
-
"""
|
|
60
|
-
|
|
61
|
-
start_time: str = "start_time"
|
|
62
|
-
end_time: str = "end_time"
|
|
63
|
-
update_time: str = "update_time"
|
|
64
|
-
resurrection_time: str = "resurrection_time"
|
|
65
|
-
identifier: str = "identifier"
|
|
66
|
-
active: str = "active"
|
|
67
|
-
type: str = "type"
|
|
68
|
-
|
|
69
|
-
@classmethod
|
|
70
|
-
def load(cls, data: dict) -> Self:
|
|
71
|
-
cls_field_names = {f.name for f in fields(cls)}
|
|
72
|
-
valid_keys = {}
|
|
73
|
-
for key, value in data.items():
|
|
74
|
-
if key in cls_field_names:
|
|
75
|
-
valid_keys[key] = value
|
|
76
|
-
else:
|
|
77
|
-
logging.warning(f"Invalid key set {key}")
|
|
78
|
-
|
|
79
|
-
return cls(**valid_keys)
|
|
80
|
-
|
|
81
|
-
def as_aliases(self) -> dict[str, str]:
|
|
82
|
-
return {str(field.default): getattr(self, field.name) for field in fields(self)}
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def _get_class_instance_ids(graph: Graph, class_uri: URIRef, limit: int = -1) -> list[URIRef]:
    """Get instance ids for a given class.

    Args:
        graph: Graph containing class instances
        class_uri: Class for which instances are to be found
        limit: Max number of instances to return, by default -1 meaning all instances

    Returns:
        List of class instance URIs
    """
    # Build the query with an f-string instead of patching a template via
    # str.replace, which silently breaks whenever the template and the
    # replacement tokens drift apart (e.g. the token appearing elsewhere).
    limit_clause = "" if limit == -1 else f"LIMIT {limit}"
    query_statement = f"SELECT DISTINCT ?subject WHERE {{ ?subject a <{class_uri}> .}} {limit_clause}"
    logging.debug(f"Query statement: {query_statement}")
    return [cast(tuple, res)[0] for res in list(graph.query(query_statement))]
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def _get_class_instance(graph: Graph, instance: URIRef) -> list[tuple]:
    """Get instance by means of tuples containing property-value pairs.

    Args:
        graph: Graph containing class instances
        instance: Instance URI

    Returns:
        List of (property, value) pairs for the given instance
    """
    # f-string instead of str.replace templating: replace("subject", ...) would
    # corrupt the query if the instance URI itself contained the token.
    query_statement = f"SELECT DISTINCT ?predicate ?object WHERE {{<{instance}> ?predicate ?object .}}"
    result = list(cast(tuple, graph.query(query_statement)))

    # Adds the instance id itself so downstream consumers keep the chain of custody
    result += [(URIRef("http://purl.org/dc/terms/identifier"), instance)]

    return result
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def _get_class_property_pairs(transformation_rules: Rules) -> dict[str, list[Property]]:
    """Define classes in terms of their properties.

    Args:
        transformation_rules: Instance of TransformationRules containing class and property definitions

    Returns:
        Dict keyed by class id, each value the list of that class's properties
        (insertion order of the rules is preserved).
    """
    classes: dict[str, list[Property]] = {}

    # Group property definitions by class id. Iterating .values() with
    # setdefault avoids re-indexing transformation_rules.properties by key
    # three times per iteration as the original loop did.
    for property_ in transformation_rules.properties.values():
        classes.setdefault(property_.class_id, []).append(property_)

    return classes
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def _define_asset_class_mapping(transformation_rules: Rules) -> dict[str, dict[str, list]]:
    """Define mapping from class to asset properties.

    Args:
        transformation_rules: Instance of TransformationRules containing class and property definitions

    Returns:
        Dict containing mapping from class to asset properties
        (external_id / name / description / parent_external_id / metadata).
    """
    class_property_pairs = _get_class_property_pairs(transformation_rules)

    asset_class_mapping: dict[str, dict[str, list]] = {}

    for class_, properties in class_property_pairs.items():
        class_mapping: dict[str, list] = {
            "external_id": [],
            "name": [],
            "description": [],
            "parent_external_id": [],
            "metadata": [],
        }
        asset_class_mapping[class_] = class_mapping

        for property_ in properties:
            # Only concrete Asset-targeted properties participate in the mapping.
            if "Asset" not in property_.cdf_resource_type or property_.property_name == "*":
                continue

            for target_field in property_.resource_type_property or []:
                is_known_field = target_field in class_mapping
                if is_known_field and property_.property_name not in class_mapping[target_field]:
                    class_mapping[target_field].append(property_.property_name)

            # Todo; Why Nikola? This adds for example name property to metadata? Isn't that
            # controlled by the resource_type_property? If you would like this behavior you
            # would set resource_type_property to ["metadata", "name"]?
            if property_.property_name not in class_mapping["metadata"]:
                class_mapping["metadata"].append(property_.property_name)

    return asset_class_mapping
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
def _remap_class_properties(class_instance: dict, asset_class_mapping: dict) -> tuple[dict, set, set]:
    """Remap original class-instance properties onto asset properties
    (e.g. external_id, name, description, metadata).

    Args:
        class_instance: Class instance properties/values from RDF, namespaces stripped
        asset_class_mapping: Property mapping from class to asset

    Returns:
        The (mutated) class instance, the set of asset fields with no matching
        property, and the set of metadata keys the instance lacks.
    """
    # Distinguish fields that map into Asset attributes from RDF properties
    # defined by the sheet; snapshot the keys before mutating the dict.
    available_properties = list(class_instance.keys())
    missing_properties: set = set()

    for property_group, ordered_properties in asset_class_mapping.items():
        if property_group == "metadata" or not ordered_properties:
            continue
        # First candidate present on the instance wins.
        match = next((candidate for candidate in ordered_properties if candidate in available_properties), None)
        if match is None:
            missing_properties.add(property_group)
        else:
            class_instance[property_group] = class_instance[match]

    missing_metadata = set(asset_class_mapping["metadata"]) - set(available_properties)

    return class_instance, missing_properties, missing_metadata
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def _class2asset_instance(
    class_: str,
    class_instance: dict,
    asset_class_mapping: dict,
    data_set_id: int,
    meta_keys: NeatMetadataKeys,
    orphanage_asset_external_id: str | None = None,
    external_id_prefix: str | None = None,
    fallback_property: str = NeatMetadataKeys.identifier,
    empty_name_default: str = "Missing Name",
    add_missing_metadata: bool = True,
) -> dict[str, Any]:
    """Converts class instance to asset instance dictionary.

    Args:
        class_: Class name which instance is being converted to asset instance
        class_instance: Dictionary containing class instance properties and values originating from RDF
            stripped from namespaces
        asset_class_mapping: Property mapping from class to asset
        data_set_id: data set id to which asset belongs
        meta_keys: Configured NEAT metadata key names (type, identifier, ...)
        orphanage_asset_external_id: Orphanage asset external id, by default None
        external_id_prefix: External id prefix to be added to any external id, by default None
        fallback_property: Property from class instance to be used as fallback in case of
            missing properties, by default "identifier"
        empty_name_default: Name substituted when the mapped name is an empty string
        add_missing_metadata: Whether to add missing metadata keys as empty strings

    Returns:
        Asset instance dictionary

    Raises:
        ValueError: When a required field is missing and the fallback property
            is itself absent from the instance (raised via the `_56` helper).
    """

    remapped_class_instance, missing_properties, missing_metadata = _remap_class_properties(
        class_instance, asset_class_mapping
    )

    # setting class instance type to class name
    remapped_class_instance[meta_keys.type] = class_
    # This will be a default case since we want to use original identifier as external_id
    # We are though dropping namespace from the original identifier (avoiding long-tail URIs)

    # The `_49` helper copies fallback_property into the target field; if that
    # raises (fallback missing), the `_56` helper logs and raises ValueError.
    if "external_id" in missing_properties or asset_class_mapping["external_id"] == []:
        try:
            __extracted_from___class2asset_instance_49(
                remapped_class_instance, fallback_property, "external_id", class_
            )
        except Exception:
            __extracted_from___class2asset_instance_56(fallback_property, class_, remapped_class_instance)
    # This should not be the use case however to still have name of the object we are using
    # fallback property here as well (typically identifier)
    if "name" in missing_properties:
        try:
            __extracted_from___class2asset_instance_49(remapped_class_instance, fallback_property, "name", class_)
        except Exception:
            __extracted_from___class2asset_instance_56(fallback_property, class_, remapped_class_instance)

    # If object is expected to have parent, but parent is not provided, it is added to orphanage
    # This is typically sign of objects not following proposed ontology/data model/schema
    if "parent_external_id" in missing_properties and orphanage_asset_external_id:
        remapped_class_instance["parent_external_id"] = orphanage_asset_external_id

    if "name" in remapped_class_instance and remapped_class_instance["name"] == "":
        remapped_class_instance["name"] = empty_name_default
    # To maintain shape across of all assets of specific type we are adding missing metadata
    # keys as empty strings, this was request by a customer
    # Generally this is bad practice, but more of a workaround of their bad data
    if missing_metadata and add_missing_metadata:
        msg = f"Adding missing metadata keys with values set to empty string for {class_}"
        msg += f" instance <{remapped_class_instance['identifier']}>. "
        logging.debug(msg)
        for key in missing_metadata:
            if key not in remapped_class_instance.keys():
                remapped_class_instance[key] = ""
                logging.debug(f"\tKey {key} added to <{remapped_class_instance['identifier']}> metadata!")

    # AssetTemplate (pydantic model) folds the loose keys into the Asset shape;
    # external_id_prefix/data_set_id are passed alongside the remapped fields.
    asset_instance = AssetTemplate(
        **remapped_class_instance, external_id_prefix=external_id_prefix, data_set_id=data_set_id
    )
    # Removing field external_id_prefix from asset instance dictionary as it is only
    # convenience field for external_id and parent_external_id update in AssetTemplate
    return asset_instance.model_dump(exclude={"external_id_prefix"})
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
# TODO Rename this here and in `__class2asset_instance`
def __extracted_from___class2asset_instance_49(remapped_class_instance, fallback_property, arg2, class_):
    """Fill the missing asset field *arg2* from *fallback_property* and log the substitution.

    Raises whatever the dict lookup raises (typically KeyError) when the
    fallback property is itself absent; the caller handles that case.
    """
    remapped_class_instance[arg2] = remapped_class_instance[fallback_property]
    # Report the field that was actually missing instead of hard-coding
    # "external_id" -- this helper is also used to fill the "name" field.
    msg = f"Missing {arg2} for {class_} instance <{remapped_class_instance['identifier']}>. "
    msg += f"Using value <{remapped_class_instance[fallback_property]}> provided "
    msg += f"by property <{fallback_property}>!"

    logging.debug(msg)
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
# TODO Rename this here and in `__class2asset_instance`
def __extracted_from___class2asset_instance_56(fallback_property, class_, remapped_class_instance):
    """Log and raise when the fallback property is unavailable for an instance."""
    msg = (
        f"Fallback property <{fallback_property}> not found for {class_} "
        f"instance <{remapped_class_instance['identifier']}>."
    )
    logging.error(msg)
    raise ValueError(msg)
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
def _list2dict(class_instance: list) -> dict[str, Any]:
    """Convert (property, value) tuples from RDF into a dictionary.

    Args:
        class_instance: Class instance properties and values originating from RDF as list of tuples

    Returns:
        Class instance properties and values as dictionary; repeated properties
        with distinct values are collected into lists.
    """
    result: dict[str, Any] = {}

    for predicate, raw_value in class_instance:
        key = remove_namespace_from_uri(predicate)

        # Remove namespace from URIRef values, otherwise convert Literal to string
        # ideally this should react upon property type provided in sheet
        # however Assets only support string values
        if isinstance(raw_value, URIRef):
            value = remove_namespace_from_uri(raw_value)
        else:
            value = str(raw_value)

        # De Morgan of the original condition: overwrite when the key is new
        # OR the value is already contained in the stored value; merge otherwise.
        if key not in result or value in result[key]:
            result[key] = value
        else:
            existing = result[key]
            result[key] = existing + [value] if isinstance(existing, list) else [existing, value]

    return result
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
def rdf2assets(
    graph_store: NeatGraphStoreBase,
    rules: Rules,
    data_set_id: int,
    stop_on_exception: bool = False,
    use_orphanage: bool = True,
    meta_keys: NeatMetadataKeys | None = None,
    asset_external_id_prefix: str | None = None,
) -> dict[str, dict[str, Any]]:
    """Creates assets from RDF graph.

    Args:
        graph_store: Graph containing RDF data
        rules: Instance of TransformationRules class containing transformation rules
        data_set_id: data set id to which assets belong
        stop_on_exception: Whether to stop upon exception.
        use_orphanage: Whether to use an orphanage for assets without parent_external_id
        meta_keys: The names of neat metadata keys to use.
        asset_external_id_prefix: Optional prefix prepended to all asset external ids

    Returns:
        Dictionary representations of assets by external id.

    Raises:
        ValueError: If the rules metadata carries no namespace.
    """
    meta_keys = NeatMetadataKeys() if meta_keys is None else meta_keys
    if rules.metadata.namespace is None:
        raise ValueError("Namespace must be provided in transformation rules!")
    namespace = rules.metadata.namespace

    # Orphanage external id embeds the data set id; prefix applies here too so
    # orphaned children reference it consistently.
    orphanage_asset_external_id = f"{asset_external_id_prefix or ''}orphanage-{data_set_id}"

    graph = graph_store.get_graph()
    # Step 1: Create rdf to asset property mapping
    logging.info("Generating rdf to asset property mapping")
    asset_class_mapping = _define_asset_class_mapping(rules)

    # Step 2: Get ids of instances of classes (used only for the count logged below)
    logging.info("Get ids of instances of classes")
    assets: dict[str, dict[str, Any]] = {}
    class_ids = {class_: _get_class_instance_ids(graph, namespace[class_]) for class_ in asset_class_mapping}
    # Step 3: Create Assets based on class instances
    logging.info("Create Assets based on class instances")
    meta_keys_aliases = meta_keys.as_aliases()
    for class_ in asset_class_mapping:
        # TODO: Rename class_id to instance_id
        class_ns = namespace[class_]
        logging.debug(f"Processing class <{class_ns}> . Number of instances: {len(class_ids[class_])}")
        progress_counter = 0
        # loading all instances into cache
        try:
            query = (
                f"SELECT ?instance ?prop ?value "
                f"WHERE {{ ?instance rdf:type <{class_ns}> . ?instance ?prop ?value . }} order by ?instance "
            )
            logging.info(query)
            response_df = graph_store.query_to_dataframe(query)
        except Exception as e:
            logging.error(f"Error while loading instances of class <{class_ns}> into cache. Reason: {e}")
            if stop_on_exception:
                raise e
            continue

        grouped_df = response_df.groupby("instance")

        for instance_id, group_df in grouped_df:
            try:
                # NOTE(review): the query binds ?prop, but the filter selects a
                # column named "property" -- presumably query_to_dataframe renames
                # columns; verify, otherwise this filter would drop the column.
                instance_property_values = group_df.filter(items=["property", "value"]).values.tolist()
                instance_property_values += [(URIRef("http://purl.org/dc/terms/identifier"), URIRef(str(instance_id)))]

                # this will strip namespace from property names and values
                class_instance = _list2dict(instance_property_values)

                # class instance is repaired and converted to asset dictionary
                asset = _class2asset_instance(
                    class_,
                    class_instance,
                    asset_class_mapping[class_],
                    data_set_id,
                    meta_keys,
                    orphanage_asset_external_id if use_orphanage else None,  # we need only base external id
                    asset_external_id_prefix or None,
                    fallback_property=meta_keys.identifier,
                )

                # adding labels and timestamps
                asset["labels"] = [asset["metadata"][meta_keys.type], "non-historic"]
                now = str(datetime.now(UTC))
                asset["metadata"][meta_keys.start_time] = now
                asset["metadata"][meta_keys.update_time] = now
                # Rewrite metadata keys to their configured aliases.
                asset["metadata"] = {meta_keys_aliases.get(k, k): v for k, v in asset["metadata"].items()}

                # log every 10000 assets
                if progress_counter % 10000 == 0:
                    logging.info(" Next 10000 Assets processed")

                assets[asset["external_id"]] = asset
                progress_counter += 1
            except Exception as ValidationError:
                # NOTE(review): "ValidationError" is only a local alias for the
                # caught exception -- any Exception lands here, not just pydantic's.
                logging.error(
                    f"Skipping class <{class_}> instance <{remove_namespace_from_uri(str(instance_id))}>, "
                    f"reason:\n{ValidationError}\n"
                )
                if stop_on_exception:
                    raise ValidationError

        logging.debug(f"Class <{class_}> processed")

    # Ensure the orphanage asset itself exists in the result set.
    if orphanage_asset_external_id not in assets:
        logging.warning(f"Orphanage with external id {orphanage_asset_external_id} not found in asset hierarchy!")
        logging.warning(f"Adding default orphanage with external id {orphanage_asset_external_id}")
        assets[orphanage_asset_external_id] = _create_orphanage(orphanage_asset_external_id, data_set_id, meta_keys)

    logging.info("Assets dictionary created")

    return assets
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
def rdf2asset_dictionary(
    graph_store: NeatGraphStoreBase,
    transformation_rules: Rules,
    stop_on_exception: bool = False,
    use_orphanage: bool = True,
) -> dict[str, dict[str, Any]]:
    """Deprecated alias for :func:`rdf2assets`."""
    warn("'rdf2asset_dictionary' is deprecated, please use 'rdf2assets' instead!", stacklevel=2)
    logging.warning("'rdf2asset_dictionary' is deprecated, please use 'rdf2assets' instead!")
    # NOTE(review): rdf2assets' signature is (graph_store, rules, data_set_id,
    # stop_on_exception, use_orphanage, ...). Here stop_on_exception is passed
    # positionally where data_set_id is expected, and use_orphanage where
    # stop_on_exception is -- looks like a bug, but this wrapper has no
    # data_set_id to forward; confirm against callers before fixing.
    return rdf2assets(graph_store, transformation_rules, stop_on_exception, use_orphanage)
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
def _create_orphanage(orphanage_external_id: str, dataset_id: int, meta_keys: NeatMetadataKeys) -> dict:
    """Build the dict representation of the orphanage asset, the parent for
    assets whose own parent does not exist."""
    timestamp = str(datetime_utc_now())
    metadata = {
        meta_keys.type: "Orphanage",
        "cdfResourceType": "Asset",
        meta_keys.identifier: "orphanage",
        meta_keys.active: "true",
        meta_keys.start_time: timestamp,
        meta_keys.update_time: timestamp,
    }
    return {
        "external_id": orphanage_external_id,
        "name": "Orphanage",
        "parent_external_id": None,
        "description": "Used to store all assets which parent does not exist",
        "metadata": metadata,
        "data_set_id": dataset_id,
        "labels": ["Orphanage", "non-historic"],
    }
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
def _asset2dict(asset: Asset) -> dict:
    """Return the dict representation of an asset.

    Args:
        asset : Instance of Asset class

    Returns:
        Asset as a dict containing only the fields neat compares/uploads.
    """
    fields = (
        "external_id",
        "name",
        "description",
        "parent_external_id",
        "data_set_id",
        "metadata",
    )
    return {field: getattr(asset, field) for field in fields}
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
def _flatten_labels(labels: list[dict[str, str]]) -> set[str]:
|
|
516
|
-
"""Flatten labels"""
|
|
517
|
-
result = set()
|
|
518
|
-
if labels is None:
|
|
519
|
-
return result
|
|
520
|
-
for label in labels:
|
|
521
|
-
if "externalId" in label:
|
|
522
|
-
result.add(label["externalId"])
|
|
523
|
-
elif "external_id" in label:
|
|
524
|
-
result.add(label["external_id"])
|
|
525
|
-
else:
|
|
526
|
-
logging.warning(f"Label {label} does not have externalId")
|
|
527
|
-
return result
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
def _is_historic(labels) -> bool:
|
|
531
|
-
"""Check if asset is historic"""
|
|
532
|
-
return "historic" in labels
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
def _categorize_cdf_assets(
    client: CogniteClient, data_set_id: int, partitions: int
) -> tuple[pd.DataFrame | None, dict[str, set]]:
    """Fetch all assets in a CDF data set and split their ids by historic state.

    Args:
        client : Instance of CogniteClient
        data_set_id : Id of data set to list assets from
        partitions : Number of partitions used when listing assets from CDF

    Returns:
        A tuple of (dataframe of the CDF assets restricted to the columns neat
        uses for diffing, or None when the data set has no assets) and a dict
        mapping "historic" / "non-historic" to the corresponding sets of asset
        external ids.
    """
    cdf_assets = client.assets.list(data_set_ids=data_set_id, limit=-1, partitions=partitions)

    # Strip labels that no longer exist in CDF so later pushes do not fail.
    cdf_assets = remove_non_existing_labels(client, cdf_assets)

    cdf_asset_df = AssetList(resources=cdf_assets).to_pandas()

    logging.info(f"Number of assets in CDF {len(cdf_asset_df)} that have been fetched")

    if cdf_asset_df.empty:
        return None, {"non-historic": set(), "historic": set()}
    if "labels" not in cdf_asset_df:
        # Add empty list for labels column.
        cdf_asset_df["labels"] = np.empty((len(cdf_asset_df), 0)).tolist()

    # Keep only the columns that are compared when diffing RDF vs CDF assets.
    cdf_columns = set(cdf_asset_df.columns)
    expected_columns = {"external_id", "labels", "parent_external_id", "data_set_id", "name", "description", "metadata"}

    cdf_asset_df = cdf_asset_df[list(expected_columns.intersection(cdf_columns))]
    # Normalize pandas NaN/NaT to None so values compare cleanly against dicts.
    cdf_asset_df = cdf_asset_df.where(pd.notnull(cdf_asset_df), None)
    # Labels arrive as lists of dicts; collapse each to a set of external ids.
    cdf_asset_df["labels"] = cdf_asset_df["labels"].apply(_flatten_labels).values  # type: ignore
    cdf_asset_df["is_historic"] = cdf_asset_df.labels.apply(_is_historic).values

    categorized_asset_ids = {
        "historic": set(cdf_asset_df[cdf_asset_df.is_historic].external_id.values),
        "non-historic": set(cdf_asset_df[~cdf_asset_df.is_historic].external_id.values),
    }

    # The helper column was only needed for the split above.
    cdf_asset_df.drop(["is_historic"], axis=1, inplace=True)
    msg = f"CDF assets categorized into {len(categorized_asset_ids['historic'])} historic"
    msg += f" and {len(categorized_asset_ids['non-historic'])} non-historic assets"
    logging.info(msg)

    return cdf_asset_df, categorized_asset_ids
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
def order_assets(assets: dict[str, dict]) -> list[Asset]:
    """Order assets in a way that parent assets are created before child assets

    Args:
        assets : Mapping of external id to asset (in dict form) to be ordered

    Returns:
        Ordered list of assets
    """
    # Use the SDK's AssetHierarchy to count subtree sizes; sorting descending
    # by subtree count puts assets higher in the tree first.
    hierarchy = AssetHierarchy([Asset(**asset) for asset in assets.values()], ignore_orphans=True)
    insert_dct = hierarchy.groupby_parent_xid()
    subtree_count = hierarchy.count_subtree(insert_dct)

    # Drop the (potentially large) AssetHierarchy before building dataframes.
    hierarchy = None

    asset_creation_order = pd.DataFrame.from_dict(subtree_count, orient="index", columns=["order"]).sort_values(
        by="order", ascending=False
    )
    asset_creation_order["external_id"] = asset_creation_order.index

    # NOTE: `hierarchy` is reused here for a dataframe of the same assets,
    # which is then merged with the creation-order ranking computed above.
    hierarchy = AssetList([Asset(**asset) for asset in assets.values()]).to_pandas()
    # Normalize pandas NaN/NaT to None so Asset(**row) gets proper nulls.
    hierarchy = hierarchy.where(pd.notnull(hierarchy), None)
    hierarchy = hierarchy.merge(asset_creation_order, left_on="external_id", right_on="external_id")
    hierarchy = hierarchy.sort_values(by="order", ascending=False)
    hierarchy.reset_index(drop=True, inplace=True)
    hierarchy.labels = hierarchy.labels.apply(_flatten_labels)
    hierarchy.drop(["order"], axis=1, inplace=True)

    return [Asset(**row.to_dict()) for _, row in hierarchy.iterrows()]
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
def _assets_to_create(rdf_assets: dict, asset_ids: set) -> list[Asset]:
    """Return list of assets to be created

    Args:
        rdf_assets : Dictionary containing assets derived from knowledge graph (RDF)
        asset_ids : Set of asset ids to be created

    Returns:
        Ordered list of assets to be created (parents before children); empty
        list when there is nothing to create.
    """
    start_time = datetime_utc_now()
    if not asset_ids:
        return []
    logging.info("Wrangling assets to be created into their final form")
    selected = {external_id: rdf_assets[external_id] for external_id in asset_ids}
    ordered_assets = order_assets(selected)

    logging.info(f"Wrangling completed in {(datetime_utc_now() - start_time).seconds} seconds")
    return ordered_assets
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
def _assets_to_update(
    rdf_assets: dict,
    cdf_assets: pd.DataFrame | None,
    asset_ids: set,
    meta_keys: NeatMetadataKeys,
    exclude_paths: list = EXCLUDE_PATHS,
) -> tuple[list[Asset], dict[str, dict]]:
    """Return list of assets to be updated

    Args:
        rdf_assets : Dictionary containing assets derived from knowledge graph (RDF)
        cdf_assets : Dataframe containing assets from CDF
        asset_ids : Candidate assets to be updated
        meta_keys : The neat meta data keys.
        exclude_paths : Paths not to be checked when diffing rdf and cdf assets, by default EXCLUDE_PATHS

    Returns:
        List of assets to be updated and detailed report of changes per asset
    """

    start_time = datetime_utc_now()
    assets = []
    report = {}
    if not asset_ids:
        return [], {}
    logging.info("Wrangling assets to be updated into their final form")
    if cdf_assets is None:
        cdf_asset_subset = {}
    else:
        # Index the CDF rows by external id for O(1) lookup in the loop below.
        cdf_asset_subset = {
            row["external_id"]: row
            for row in cdf_assets[cdf_assets["external_id"].isin(asset_ids)].to_dict(orient="records")
        }
    for external_id in asset_ids:
        cdf_asset = cdf_asset_subset[external_id]
        diffing_result = DeepDiff(cdf_asset, rdf_assets[external_id], exclude_paths=exclude_paths)

        # Only update when something besides the "active" flag changed.
        if diffing_result and f"root['metadata']['{meta_keys.active}']" not in diffing_result.affected_paths:
            asset = Asset(**rdf_assets[external_id])
            if asset.metadata is None:
                asset.metadata = {}
            try:
                # Carry the original start time over from the CDF asset.
                # BUGFIX: previously indexed cdf_asset[external_id][...], but
                # cdf_asset IS the per-asset row (keyed by column name), so that
                # always raised KeyError and start_time was reset on every update.
                asset.metadata[meta_keys.start_time] = cdf_asset["metadata"][meta_keys.start_time]
            except (KeyError, TypeError):
                # Missing key, or metadata is None on the CDF side: fall back to now.
                asset.metadata[meta_keys.start_time] = str(datetime.now(UTC))
            asset.metadata[meta_keys.update_time] = str(datetime.now(UTC))
            assets.append(asset)

            report[external_id] = dict(diffing_result)

    logging.info(f"Wrangling of {len(assets)} completed in {(datetime_utc_now() - start_time).seconds} seconds")
    return assets, report
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
def _assets_to_resurrect(
    rdf_assets: dict, cdf_assets: pd.DataFrame | None, asset_ids: set, meta_keys: NeatMetadataKeys
) -> list[Asset]:
    """Returns list of assets to be resurrected

    Args:
        rdf_assets : Dictionary containing assets derived from knowledge graph (RDF)
        cdf_assets : Dataframe containing assets from CDF
        asset_ids : Set of asset ids to be resurrected
        meta_keys : The neat meta data keys.

    Returns:
        List of assets to be resurrected, with update and resurrection time
        stamped and start time carried over from CDF when available.
    """
    start_time = datetime_utc_now()
    assets = []
    if not asset_ids:
        return []
    logging.info("Wrangling assets to be resurrected into their final form")
    if cdf_assets is None:
        cdf_asset_subset = {}
    else:
        # Index the CDF rows by external id for O(1) lookup in the loop below.
        cdf_asset_subset = {
            row["external_id"]: row
            for row in cdf_assets[cdf_assets["external_id"].isin(asset_ids)].to_dict(orient="records")
        }
    for external_id in asset_ids:
        cdf_asset = cdf_asset_subset[external_id]

        asset = Asset(**rdf_assets[external_id])
        if asset.metadata is None:
            asset.metadata = {}
        now = str(datetime.now(UTC))
        try:
            # Carry the original start time over from the CDF asset.
            # BUGFIX: previously indexed cdf_asset[external_id][...], but
            # cdf_asset IS the per-asset row (keyed by column name), so that
            # always raised KeyError and start_time was reset to "now".
            asset.metadata[meta_keys.start_time] = cdf_asset["metadata"][meta_keys.start_time]
        except (KeyError, TypeError):
            # Missing key, or metadata is None on the CDF side: fall back to now.
            asset.metadata[meta_keys.start_time] = now
        asset.metadata[meta_keys.update_time] = now
        asset.metadata[meta_keys.resurrection_time] = now
        assets.append(asset)

    logging.info(f"Wrangling of {len(assets)} completed in {(datetime_utc_now() - start_time).seconds} seconds")
    return assets
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
def _assets_to_decommission(
    cdf_assets: pd.DataFrame | None, asset_ids: set[str], meta_keys: NeatMetadataKeys
) -> list[Asset]:
    """Return the given CDF assets prepared for decommissioning.

    For each asset id: stamps the update and end times, removes any
    resurrection time, sets the active flag to "false", and swaps the
    "non-historic" label for "historic".
    """
    started_at = datetime_utc_now()

    decommissioned: list[Asset] = []
    if not asset_ids:
        return []
    logging.info("Wrangling assets to be decommissioned into their final form")
    if cdf_assets is None:
        rows_by_id: dict[str, dict] = {}
    else:
        selected = cdf_assets[cdf_assets["external_id"].isin(asset_ids)]
        rows_by_id = {row["external_id"]: row for row in selected.to_dict(orient="records")}

    for external_id in asset_ids:
        row = rows_by_id[external_id]

        timestamp = str(datetime.now(UTC))
        metadata = row["metadata"]
        metadata[meta_keys.update_time] = timestamp
        metadata.pop(meta_keys.resurrection_time, None)
        metadata[meta_keys.end_time] = timestamp
        metadata[meta_keys.active] = "false"
        try:
            # labels is a set here (flattened earlier); remove raises KeyError.
            row["labels"].remove("non-historic")
        except KeyError:
            logging.info(f"Asset {external_id} missed label 'non-historic'")
        row["labels"].add("historic")

        decommissioned.append(Asset(**row))

    logging.info(f"Wrangling completed in {(datetime_utc_now() - started_at).seconds} seconds")
    return decommissioned
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
@overload
def categorize_assets(
    client: CogniteClient,
    rdf_assets: dict,
    data_set_id: int,
    return_report: Literal[False] = False,
    partitions: int = 2,
    stop_on_exception: bool = False,
    meta_keys: NeatMetadataKeys | None = None,
) -> dict: ...


@overload
def categorize_assets(
    client: CogniteClient,
    rdf_assets: dict,
    data_set_id: int,
    return_report: Literal[True],
    partitions: int = 2,
    stop_on_exception: bool = False,
    meta_keys: NeatMetadataKeys | None = None,
) -> tuple[dict, dict]: ...


def categorize_assets(
    client: CogniteClient,
    rdf_assets: dict,
    data_set_id: int,
    return_report: bool = False,
    partitions: int = 2,
    stop_on_exception: bool = False,
    meta_keys: NeatMetadataKeys | None = None,
) -> tuple[dict, dict] | dict:
    """Categorize assets into those to be created, updated, resurrected and decommissioned

    Args:
        client : Instance of CogniteClient
        rdf_assets : Dictionary containing asset external_id - asset pairs
        data_set_id : Dataset id to which assets are to be/are stored
        return_report : Whether to report on the diffing results or not, by default False
        partitions : Number of partitions to use when fetching assets from CDF, by default 2
        stop_on_exception : Whether to stop on exception or not, by default False
        meta_keys : The metadata keys used by neat.

    Returns:
        Dictionary mapping category name to list of assets; when
        ``return_report`` is True, also a report of ids/changes per category.
    """
    if meta_keys is None:
        meta_keys = NeatMetadataKeys()

    # TODO: Cache categorized assets somewhere instead of creating them
    cdf_assets, cdf_ids_by_state = _categorize_cdf_assets(client, data_set_id, partitions)

    rdf_asset_ids = set(rdf_assets)
    historic = cdf_ids_by_state["historic"]
    non_historic = cdf_ids_by_state["non-historic"]

    # In RDF but not yet in CDF -> create.
    create_ids = rdf_asset_ids - (historic | non_historic)
    # In RDF and active in CDF -> candidate for update.
    update_ids = rdf_asset_ids & non_historic
    # Active in CDF but gone from RDF -> decommission.
    decommission_ids = non_historic - rdf_asset_ids
    # Historic in CDF but back in RDF -> resurrect.
    resurrect_ids = historic & rdf_asset_ids

    logging.info(f"Number of assets to create: { len(create_ids)}")
    logging.info(f"Number of assets to potentially update: { len(update_ids)}")
    logging.info(f"Number of assets to decommission: { len(decommission_ids)}")
    logging.info(f"Number of assets to resurrect: { len(resurrect_ids)}")

    assets_for_update, update_report = _assets_to_update(rdf_assets, cdf_assets, update_ids, meta_keys=meta_keys)

    report = {
        "create": create_ids,
        "resurrect": resurrect_ids,
        "decommission": decommission_ids,
        "update": update_report,
    }
    categorized_assets = {
        "create": _assets_to_create(rdf_assets, create_ids),
        "update": assets_for_update,
        "resurrect": _assets_to_resurrect(rdf_assets, cdf_assets, resurrect_ids, meta_keys),
        "decommission": _assets_to_decommission(cdf_assets, decommission_ids, meta_keys),
    }

    if return_report:
        return categorized_assets, report
    return categorized_assets
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
def _micro_batch_push(
    client: CogniteClient,
    assets: Sequence[Asset | AssetUpdate],
    batch_size: int = 1000,
    push_type: str = "update",
    message: str = "Updated",
    max_retries: int = 1,
    retry_delay: int = 5,
):
    """Push assets to CDF in micro-batches of ``batch_size``.

    Args:
        client : CogniteClient
            Instance of CogniteClient
        assets : list
            List of assets to be created or updated
        batch_size : int, optional
            Size of batch, by default 1000
        push_type : str, optional
            Type of push, either "update" or "create", by default "update"
        message : str, optional
            Message to logged, by default "Updated"
        max_retries : int, optional
            Number of retries per batch, by default 1
        retry_delay : int, optional
            Seconds to wait between retries, by default 5

    Raises:
        ValueError: If ``push_type`` is neither "update" nor "create".
    """
    total = len(assets)
    counter = 0
    if push_type not in ["update", "create"]:
        logging.info(f"push_type {push_type} not supported")
        raise ValueError(f"push_type {push_type} not supported")
    for batch in chunker(assets, batch_size):
        counter += len(batch)
        start_time = datetime_utc_now()

        # NOTE(review): the decorated function is re-created per batch,
        # presumably so each batch gets a fresh retry budget — confirm
        # against retry_decorator's semantics.
        @retry_decorator(max_retries=max_retries, retry_delay=retry_delay, component_name="microbatch-assets")
        def upsert_assets(batch):
            if push_type == "update":
                client.assets.update(batch)
            elif push_type == "create":
                client.assets.create_hierarchy(batch)

        try:
            upsert_assets(batch)
        except CogniteDuplicatedError:
            # this is handling of very rare case when some assets might be lost . Normally this should not happen.
            # Last attempt to recover
            client.assets.create_hierarchy(batch, upsert=True)  # type: ignore[arg-type]

        delta_time = (datetime_utc_now() - start_time).seconds

        msg = f"{message} {counter} of {total} assets, batch processing time: {delta_time:.2f} "
        msg += f"seconds ETC: {delta_time * (total - counter) / (60*batch_size) :.2f} minutes"
        logging.info(msg)
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
def upload_assets(
    client: CogniteClient,
    categorized_assets: Mapping[str, Sequence[Asset | AssetUpdate]],
    batch_size: int = 5000,
    max_retries: int = 1,
    retry_delay: int = 3,
):
    """Uploads categorized assets to CDF

    Args:
        client : CogniteClient
            Instance of CogniteClient
        categorized_assets : Dict[str, list]
            dictionary containing asset category - list of asset pairs
        batch_size : int, optional
            Size of batch, by default 5000
        max_retries : int, optional
            Number of retries per push, by default 1
        retry_delay : int, optional
            Seconds to wait between retries, by default 3

    !!! note "batch_size"
        If batch size is set to 0 or None, all assets will be pushed to CDF in one go.
    """
    # Truthy batch_size -> push category by category in micro-batches.
    if batch_size:
        logging.info(f"Uploading assets in batches of {batch_size}")
        if categorized_assets["create"]:
            _micro_batch_push(
                client,
                categorized_assets["create"],
                batch_size,
                push_type="create",
                message="Created",
                max_retries=max_retries,
                retry_delay=retry_delay,
            )

        if categorized_assets["update"]:
            _micro_batch_push(
                client,
                categorized_assets["update"],
                batch_size,
                message="Updated",
                max_retries=max_retries,
                retry_delay=retry_delay,
            )

        if categorized_assets["resurrect"]:
            _micro_batch_push(
                client,
                categorized_assets["resurrect"],
                batch_size,
                message="Resurrected",
                max_retries=max_retries,
                retry_delay=retry_delay,
            )

        if categorized_assets["decommission"]:
            _micro_batch_push(
                client,
                categorized_assets["decommission"],
                batch_size,
                message="Decommissioned",
                max_retries=max_retries,
                retry_delay=retry_delay,
            )

    else:
        logging.info("Batch size not set, pushing all assets to CDF in one go!")

        @retry_decorator(max_retries=max_retries, retry_delay=retry_delay, component_name="create-assets")
        def create_assets():
            if categorized_assets["create"]:
                try:
                    client.assets.create_hierarchy(categorized_assets["create"])
                except CogniteDuplicatedError:
                    # Some assets already exist; retry allowing upserts.
                    client.assets.create_hierarchy(categorized_assets["create"], upsert=True)

            if categorized_assets["update"]:
                client.assets.create_hierarchy(categorized_assets["update"], upsert=True, upsert_mode="replace")

            if categorized_assets["resurrect"]:
                client.assets.create_hierarchy(categorized_assets["resurrect"], upsert=True, upsert_mode="replace")

            if categorized_assets["decommission"]:
                client.assets.create_hierarchy(categorized_assets["decommission"], upsert=True, upsert_mode="replace")

        create_assets()
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
# An asset in either SDK-object form or plain-dict form; helpers below accept both.
AssetLike: TypeAlias = Asset | dict[str, Any]
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
@overload
def remove_non_existing_labels(client: CogniteClient, assets: Sequence[AssetLike]) -> Sequence[AssetLike]: ...


@overload
def remove_non_existing_labels(client: CogniteClient, assets: Mapping[str, AssetLike]) -> Mapping[str, AssetLike]: ...


def remove_non_existing_labels(
    client: CogniteClient, assets: Sequence[AssetLike] | Mapping[str, AssetLike]
) -> Sequence[AssetLike] | Mapping[str, AssetLike]:
    """Drop labels that do not exist in CDF from the given assets.

    Args:
        client : Instance of CogniteClient used to list the available labels.
        assets : Sequence of assets, or mapping of external id to asset.

    Returns:
        The assets with unknown labels removed, in the same container shape
        as the input. Note: individual assets are cleaned in place.

    Raises:
        ValueError: If ``assets`` is neither a Sequence nor a Mapping.
    """
    cdf_labels = client.labels.list(limit=-1)
    if not cdf_labels:
        # No labels defined in CDF, nothing to check.
        return assets

    available_labels = {label.external_id for label in cdf_labels}

    def clean_asset_labels(asset: Asset | dict[str, Any]) -> Asset | dict[str, Any]:
        # Mutates the asset in place and returns it.
        if isinstance(asset, Asset):
            asset.labels = [label for label in (asset.labels or []) if label.external_id in available_labels] or None
        elif isinstance(asset, dict) and "labels" in asset:
            asset["labels"] = [label for label in asset["labels"] if label in available_labels]
        return asset

    if isinstance(assets, Sequence):
        return [clean_asset_labels(a) for a in assets]

    elif isinstance(assets, Mapping):
        # BUGFIX: was `isinstance(assets, dict)`, which rejected non-dict
        # Mappings even though the overload signature advertises Mapping.
        return {external_id: clean_asset_labels(a) for external_id, a in assets.items()}

    raise ValueError(f"Invalid format for Assets={type(assets)}")
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
def unique_asset_labels(assets: Iterable[Asset | dict[str, Any]]) -> set[str]:
    """Collect the distinct label external ids across the given assets.

    Args:
        assets : Iterable of Asset objects or dicts with a non-empty "labels" entry.

    Returns:
        Set of label external ids.

    Raises:
        ValueError: For entries that are neither an Asset nor a dict with a
            truthy "labels" value (matching the original strict behavior).
    """
    collected: set[str] = set()
    for item in assets:
        if isinstance(item, Asset):
            collected.update(label.external_id for label in (item.labels or []) if label.external_id)
        elif isinstance(item, dict) and (item_labels := item.get("labels")):
            collected.update(item_labels)
        else:
            raise ValueError(f"Unsupported {type(item)}")
    return collected
|