cognite-toolkit 0.6.97__py3-none-any.whl → 0.7.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognite_toolkit/_cdf.py +16 -17
- cognite_toolkit/_cdf_tk/apps/__init__.py +2 -0
- cognite_toolkit/_cdf_tk/apps/_core_app.py +13 -5
- cognite_toolkit/_cdf_tk/apps/_data_app.py +1 -1
- cognite_toolkit/_cdf_tk/apps/_dev_app.py +86 -0
- cognite_toolkit/_cdf_tk/apps/_download_app.py +692 -24
- cognite_toolkit/_cdf_tk/apps/_dump_app.py +43 -101
- cognite_toolkit/_cdf_tk/apps/_landing_app.py +18 -4
- cognite_toolkit/_cdf_tk/apps/_migrate_app.py +249 -9
- cognite_toolkit/_cdf_tk/apps/_modules_app.py +0 -3
- cognite_toolkit/_cdf_tk/apps/_purge.py +15 -43
- cognite_toolkit/_cdf_tk/apps/_run.py +11 -0
- cognite_toolkit/_cdf_tk/apps/_upload_app.py +45 -6
- cognite_toolkit/_cdf_tk/builders/__init__.py +2 -2
- cognite_toolkit/_cdf_tk/builders/_base.py +28 -42
- cognite_toolkit/_cdf_tk/cdf_toml.py +20 -1
- cognite_toolkit/_cdf_tk/client/_toolkit_client.py +23 -3
- cognite_toolkit/_cdf_tk/client/api/extended_functions.py +6 -9
- cognite_toolkit/_cdf_tk/client/api/infield.py +93 -1
- cognite_toolkit/_cdf_tk/client/api/migration.py +175 -1
- cognite_toolkit/_cdf_tk/client/api/streams.py +84 -0
- cognite_toolkit/_cdf_tk/client/api/three_d.py +50 -0
- cognite_toolkit/_cdf_tk/client/data_classes/base.py +25 -1
- cognite_toolkit/_cdf_tk/client/data_classes/canvas.py +46 -3
- cognite_toolkit/_cdf_tk/client/data_classes/charts.py +3 -3
- cognite_toolkit/_cdf_tk/client/data_classes/charts_data.py +95 -213
- cognite_toolkit/_cdf_tk/client/data_classes/infield.py +32 -18
- cognite_toolkit/_cdf_tk/client/data_classes/migration.py +10 -2
- cognite_toolkit/_cdf_tk/client/data_classes/streams.py +90 -0
- cognite_toolkit/_cdf_tk/client/data_classes/three_d.py +47 -0
- cognite_toolkit/_cdf_tk/client/testing.py +18 -2
- cognite_toolkit/_cdf_tk/commands/__init__.py +6 -6
- cognite_toolkit/_cdf_tk/commands/_changes.py +3 -42
- cognite_toolkit/_cdf_tk/commands/_download.py +21 -11
- cognite_toolkit/_cdf_tk/commands/_migrate/__init__.py +0 -2
- cognite_toolkit/_cdf_tk/commands/_migrate/command.py +22 -20
- cognite_toolkit/_cdf_tk/commands/_migrate/conversion.py +133 -91
- cognite_toolkit/_cdf_tk/commands/_migrate/data_classes.py +73 -22
- cognite_toolkit/_cdf_tk/commands/_migrate/data_mapper.py +311 -43
- cognite_toolkit/_cdf_tk/commands/_migrate/default_mappings.py +5 -5
- cognite_toolkit/_cdf_tk/commands/_migrate/issues.py +33 -0
- cognite_toolkit/_cdf_tk/commands/_migrate/migration_io.py +157 -8
- cognite_toolkit/_cdf_tk/commands/_migrate/selectors.py +9 -4
- cognite_toolkit/_cdf_tk/commands/_purge.py +27 -28
- cognite_toolkit/_cdf_tk/commands/_questionary_style.py +16 -0
- cognite_toolkit/_cdf_tk/commands/_upload.py +109 -86
- cognite_toolkit/_cdf_tk/commands/about.py +221 -0
- cognite_toolkit/_cdf_tk/commands/auth.py +19 -12
- cognite_toolkit/_cdf_tk/commands/build_cmd.py +15 -61
- cognite_toolkit/_cdf_tk/commands/clean.py +63 -16
- cognite_toolkit/_cdf_tk/commands/deploy.py +20 -17
- cognite_toolkit/_cdf_tk/commands/dump_resource.py +6 -4
- cognite_toolkit/_cdf_tk/commands/init.py +225 -3
- cognite_toolkit/_cdf_tk/commands/modules.py +20 -44
- cognite_toolkit/_cdf_tk/commands/pull.py +6 -19
- cognite_toolkit/_cdf_tk/commands/resources.py +179 -0
- cognite_toolkit/_cdf_tk/constants.py +20 -1
- cognite_toolkit/_cdf_tk/cruds/__init__.py +19 -5
- cognite_toolkit/_cdf_tk/cruds/_base_cruds.py +14 -70
- cognite_toolkit/_cdf_tk/cruds/_data_cruds.py +8 -17
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/__init__.py +4 -1
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/agent.py +11 -9
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/auth.py +4 -14
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/classic.py +44 -43
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/configuration.py +4 -11
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/data_organization.py +4 -13
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/datamodel.py +205 -66
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/extraction_pipeline.py +5 -17
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/fieldops.py +116 -27
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/file.py +6 -27
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/function.py +9 -28
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/hosted_extractors.py +12 -30
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/industrial_tool.py +3 -7
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/location.py +3 -15
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/migration.py +4 -12
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/raw.py +4 -10
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/relationship.py +3 -8
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/robotics.py +15 -44
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/streams.py +94 -0
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/three_d_model.py +3 -7
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/timeseries.py +5 -15
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/transformation.py +39 -31
- cognite_toolkit/_cdf_tk/cruds/_resource_cruds/workflow.py +20 -40
- cognite_toolkit/_cdf_tk/cruds/_worker.py +24 -36
- cognite_toolkit/_cdf_tk/feature_flags.py +16 -36
- cognite_toolkit/_cdf_tk/plugins.py +2 -1
- cognite_toolkit/_cdf_tk/resource_classes/__init__.py +4 -0
- cognite_toolkit/_cdf_tk/resource_classes/capabilities.py +12 -0
- cognite_toolkit/_cdf_tk/resource_classes/functions.py +3 -1
- cognite_toolkit/_cdf_tk/resource_classes/infield_cdm_location_config.py +109 -0
- cognite_toolkit/_cdf_tk/resource_classes/migration.py +8 -17
- cognite_toolkit/_cdf_tk/resource_classes/streams.py +29 -0
- cognite_toolkit/_cdf_tk/storageio/__init__.py +9 -21
- cognite_toolkit/_cdf_tk/storageio/_annotations.py +19 -16
- cognite_toolkit/_cdf_tk/storageio/_applications.py +338 -26
- cognite_toolkit/_cdf_tk/storageio/_asset_centric.py +67 -104
- cognite_toolkit/_cdf_tk/storageio/_base.py +61 -29
- cognite_toolkit/_cdf_tk/storageio/_datapoints.py +276 -20
- cognite_toolkit/_cdf_tk/storageio/_file_content.py +436 -0
- cognite_toolkit/_cdf_tk/storageio/_instances.py +34 -2
- cognite_toolkit/_cdf_tk/storageio/_raw.py +26 -0
- cognite_toolkit/_cdf_tk/storageio/selectors/__init__.py +62 -4
- cognite_toolkit/_cdf_tk/storageio/selectors/_base.py +14 -2
- cognite_toolkit/_cdf_tk/storageio/selectors/_canvas.py +14 -0
- cognite_toolkit/_cdf_tk/storageio/selectors/_charts.py +14 -0
- cognite_toolkit/_cdf_tk/storageio/selectors/_datapoints.py +23 -3
- cognite_toolkit/_cdf_tk/storageio/selectors/_file_content.py +164 -0
- cognite_toolkit/_cdf_tk/tk_warnings/other.py +4 -0
- cognite_toolkit/_cdf_tk/tracker.py +2 -2
- cognite_toolkit/_cdf_tk/utils/dtype_conversion.py +9 -3
- cognite_toolkit/_cdf_tk/utils/fileio/__init__.py +2 -0
- cognite_toolkit/_cdf_tk/utils/fileio/_base.py +5 -1
- cognite_toolkit/_cdf_tk/utils/fileio/_readers.py +112 -20
- cognite_toolkit/_cdf_tk/utils/fileio/_writers.py +15 -15
- cognite_toolkit/_cdf_tk/utils/http_client/_client.py +284 -18
- cognite_toolkit/_cdf_tk/utils/http_client/_data_classes.py +50 -4
- cognite_toolkit/_cdf_tk/utils/http_client/_data_classes2.py +187 -0
- cognite_toolkit/_cdf_tk/utils/interactive_select.py +9 -14
- cognite_toolkit/_cdf_tk/utils/sql_parser.py +2 -3
- cognite_toolkit/_cdf_tk/utils/useful_types.py +6 -2
- cognite_toolkit/_cdf_tk/validation.py +79 -1
- cognite_toolkit/_repo_files/GitHub/.github/workflows/deploy.yaml +1 -1
- cognite_toolkit/_repo_files/GitHub/.github/workflows/dry-run.yaml +1 -1
- cognite_toolkit/_resources/cdf.toml +5 -4
- cognite_toolkit/_version.py +1 -1
- cognite_toolkit/config.dev.yaml +13 -0
- {cognite_toolkit-0.6.97.dist-info → cognite_toolkit-0.7.30.dist-info}/METADATA +24 -24
- {cognite_toolkit-0.6.97.dist-info → cognite_toolkit-0.7.30.dist-info}/RECORD +153 -143
- cognite_toolkit-0.7.30.dist-info/WHEEL +4 -0
- {cognite_toolkit-0.6.97.dist-info → cognite_toolkit-0.7.30.dist-info}/entry_points.txt +1 -0
- cognite_toolkit/_cdf_tk/commands/_migrate/canvas.py +0 -201
- cognite_toolkit/_cdf_tk/commands/dump_data.py +0 -489
- cognite_toolkit/_cdf_tk/commands/featureflag.py +0 -27
- cognite_toolkit/_cdf_tk/utils/table_writers.py +0 -434
- cognite_toolkit-0.6.97.dist-info/WHEEL +0 -4
- cognite_toolkit-0.6.97.dist-info/licenses/LICENSE +0 -18
cognite_toolkit/_cdf_tk/commands/_migrate/canvas.py (deleted)
@@ -1,201 +0,0 @@
-from uuid import uuid4
-
-from cognite.client.data_classes.capabilities import (
-    Capability,
-    DataModelInstancesAcl,
-    DataModelsAcl,
-    SpaceIDScope,
-)
-from cognite.client.exceptions import CogniteException
-
-from cognite_toolkit._cdf_tk.client import ToolkitClient
-from cognite_toolkit._cdf_tk.client.data_classes.canvas import (
-    CANVAS_INSTANCE_SPACE,
-    Canvas,
-    ContainerReferenceApply,
-    FdmInstanceContainerReferenceApply,
-)
-from cognite_toolkit._cdf_tk.client.data_classes.migration import InstanceSource
-from cognite_toolkit._cdf_tk.commands._base import ToolkitCommand
-from cognite_toolkit._cdf_tk.commands._migrate.data_model import (
-    INSTANCE_SOURCE_VIEW_ID,
-    MODEL_ID,
-    RESOURCE_VIEW_MAPPING_VIEW_ID,
-)
-from cognite_toolkit._cdf_tk.exceptions import AuthenticationError, ToolkitMigrationError
-from cognite_toolkit._cdf_tk.tk_warnings import HighSeverityWarning, LowSeverityWarning, MediumSeverityWarning
-from cognite_toolkit._cdf_tk.utils import humanize_collection
-from cognite_toolkit._cdf_tk.utils.interactive_select import InteractiveCanvasSelect
-
-
-class MigrationCanvasCommand(ToolkitCommand):
-    canvas_schema_space = Canvas.get_source().space
-    # Note sequences are not supported in Canvas, so we do not include them here.
-    asset_centric_resource_types = frozenset({"asset", "event", "file", "timeseries"})
-
-    def migrate_canvas(
-        self,
-        client: ToolkitClient,
-        external_ids: list[str] | None = None,
-        dry_run: bool = False,
-        verbose: bool = False,
-    ) -> None:
-        self.validate_access(
-            client,
-            instance_spaces=[CANVAS_INSTANCE_SPACE],
-            schema_spaces=[self.canvas_schema_space, INSTANCE_SOURCE_VIEW_ID.space],
-        )
-        self.validate_migration_model_available(client)
-        external_ids = external_ids or InteractiveCanvasSelect(client).select_external_ids()
-        if external_ids is None or not external_ids:
-            self.console("No canvases selected for migration.")
-            return
-        action = "Would migrate" if dry_run else "Migrating"
-        self.console(f"{action} {len(external_ids)} canvases.")
-        for external_id in external_ids:
-            self._migrate_single_canvas(client, external_id, dry_run=dry_run, verbose=verbose)
-
-    def _migrate_single_canvas(
-        self,
-        client: ToolkitClient,
-        external_id: str,
-        dry_run: bool = False,
-        verbose: bool = False,
-    ) -> None:
-        canvas = client.canvas.industrial.retrieve(external_id=external_id)
-        if canvas is None:
-            self.warn(MediumSeverityWarning(f"Canvas with external ID '{external_id}' not found. Skipping.. "))
-            return
-        update = canvas.as_write()
-        to_migrate = [
-            ref
-            for ref in update.container_references
-            if ref.container_reference_type in self.asset_centric_resource_types
-        ]
-        if not to_migrate:
-            self.warn(
-                LowSeverityWarning(
-                    f"Canvas with name '{canvas.canvas.name}' does not have any asset-centric references. Skipping.. "
-                )
-            )
-        if verbose:
-            self.console(f"Found canvas: {canvas.canvas.name}")
-        reference_ids = [ref.as_asset_centric_id() for ref in to_migrate]
-        instance_sources = client.migration.instance_source.retrieve(reference_ids)
-        source_by_reference_id = {source.as_asset_centric_id(): source for source in instance_sources}
-        missing = set(reference_ids) - set(source_by_reference_id.keys())
-        if missing:
-            self.warn(
-                HighSeverityWarning(
-                    f"Canvas '{canvas.canvas.name}' has references to resources that are not been migrated: {humanize_collection(missing)}. Skipping.. "
-                )
-            )
-            return
-        if dry_run:
-            self.console(
-                f"Canvas '{canvas.canvas.name}' is ready for migration all {len(instance_sources)} references asset-centric resources found."
-            )
-            return
-        if verbose:
-            self.console(
-                f"Migrating canvas '{canvas.canvas.name}' with {len(instance_sources)} references to asset-centric resources."
-            )
-        backup = canvas.as_write().create_backup()
-
-        update.container_references = [
-            ref
-            for ref in update.container_references
-            if ref.container_reference_type not in self.asset_centric_resource_types
-        ]
-        for ref in to_migrate:
-            source = source_by_reference_id[ref.as_asset_centric_id()]
-            fdm_ref = self.migrate_container_reference(ref, source, canvas.canvas.external_id)
-            update.fdm_instance_container_references.append(fdm_ref)
-
-        try:
-            client.canvas.industrial.create(backup)
-        except CogniteException as e:
-            raise ToolkitMigrationError(f"Failed to create backup for canvas '{canvas.canvas.name}': {e!s}. ") from e
-        try:
-            client.canvas.industrial.update(update)
-        except CogniteException as e:
-            raise ToolkitMigrationError(
-                f"Failed to migrate canvas '{canvas.canvas.name}': {e!s}. A backup was created with external ID '{backup.canvas.external_id}'."
-            ) from e
-        else:
-            self.console(
-                f'Canvas "{canvas.canvas.name}" migrated successfully with {len(to_migrate)} references to data model instances.'
-            )
-
-    @classmethod
-    def migrate_container_reference(
-        cls, reference: ContainerReferenceApply, source: InstanceSource, canvas_external_id: str
-    ) -> FdmInstanceContainerReferenceApply:
-        """Migrate a single container reference by replacing the asset-centric ID with the data model instance ID."""
-        consumer_view = source.consumer_view()
-        new_id = str(uuid4())
-        new_external_id = f"{canvas_external_id}_{new_id}"
-        return FdmInstanceContainerReferenceApply(
-            external_id=new_external_id,
-            id_=new_id,
-            container_reference_type="fdmInstance",
-            instance_space=source.space,
-            instance_external_id=source.external_id,
-            view_space=consumer_view.space,
-            view_external_id=consumer_view.external_id,
-            view_version=consumer_view.version,
-            label=reference.label,
-            properties_=reference.properties_,
-            x=reference.x,
-            y=reference.y,
-            width=reference.width,
-            height=reference.height,
-            max_width=reference.max_width,
-            max_height=reference.max_height,
-        )
-
-    @staticmethod
-    def validate_access(
-        client: ToolkitClient,
-        instance_spaces: list[str] | None = None,
-        schema_spaces: list[str] | None = None,
-    ) -> None:
-        required_capabilities: list[Capability] = []
-        if instance_spaces is not None:
-            required_capabilities.append(
-                DataModelInstancesAcl(
-                    actions=[
-                        DataModelInstancesAcl.Action.Read,
-                        DataModelInstancesAcl.Action.Write,
-                        DataModelInstancesAcl.Action.Write_Properties,
-                    ],
-                    scope=SpaceIDScope(instance_spaces),
-                )
-            )
-        if schema_spaces is not None:
-            required_capabilities.append(
-                DataModelsAcl(actions=[DataModelsAcl.Action.Read], scope=SpaceIDScope(schema_spaces)),
-            )
-        if missing := client.iam.verify_capabilities(required_capabilities):
-            raise AuthenticationError(f"Missing required capabilities: {humanize_collection(missing)}.", missing)
-
-    @staticmethod
-    def validate_migration_model_available(client: ToolkitClient) -> None:
-        models = client.data_modeling.data_models.retrieve([MODEL_ID], inline_views=False)
-        if not models:
-            raise ToolkitMigrationError(
-                f"The migration data model {MODEL_ID!r} does not exist. "
-                "Please run the `cdf migrate prepare` command to deploy the migration data model."
-            )
-        elif len(models) > 1:
-            raise ToolkitMigrationError(
-                f"Multiple migration models {MODEL_ID!r}. "
-                "Please delete the duplicate models before proceeding with the migration."
-            )
-        model = models[0]
-        missing_views = {INSTANCE_SOURCE_VIEW_ID, RESOURCE_VIEW_MAPPING_VIEW_ID} - set(model.views or [])
-        if missing_views:
-            raise ToolkitMigrationError(
-                f"Invalid migration model. Missing views {humanize_collection(missing_views)}. "
-                f"Please run the `cdf migrate prepare` command to deploy the migration data model."
-            )
cognite_toolkit/_cdf_tk/commands/dump_data.py (deleted)
@@ -1,489 +0,0 @@
-import time
-from abc import ABC, abstractmethod
-from collections.abc import Callable, Iterable, Iterator
-from functools import lru_cache
-from itertools import groupby
-from pathlib import Path
-from typing import Any, ClassVar, Generic, Literal
-
-from cognite.client.data_classes import (
-    Asset,
-    AssetFilter,
-    DataSetList,
-    Event,
-    EventFilter,
-    FileMetadata,
-    FileMetadataFilter,
-    LabelDefinitionList,
-    TimeSeries,
-    TimeSeriesFilter,
-)
-from cognite.client.data_classes._base import T_CogniteResource
-from rich.console import Console
-from rich.progress import track
-
-from cognite_toolkit._cdf_tk.client import ToolkitClient
-from cognite_toolkit._cdf_tk.commands._base import ToolkitCommand
-from cognite_toolkit._cdf_tk.cruds import (
-    AssetCRUD,
-    DataSetsCRUD,
-    EventCRUD,
-    FileMetadataCRUD,
-    LabelCRUD,
-    ResourceCRUD,
-    TimeSeriesCRUD,
-)
-from cognite_toolkit._cdf_tk.exceptions import (
-    ToolkitFileExistsError,
-    ToolkitIsADirectoryError,
-    ToolkitValueError,
-)
-from cognite_toolkit._cdf_tk.utils import humanize_collection
-from cognite_toolkit._cdf_tk.utils.cdf import metadata_key_counts
-from cognite_toolkit._cdf_tk.utils.file import safe_rmtree
-from cognite_toolkit._cdf_tk.utils.producer_worker import ProducerWorkerExecutor
-from cognite_toolkit._cdf_tk.utils.table_writers import (
-    FileFormat,
-    Schema,
-    SchemaColumn,
-    SchemaColumnList,
-    TableFileWriter,
-)
-
-
-class DataFinder:
-    supported_formats: ClassVar[frozenset[FileFormat]] = frozenset()
-    # This is the standard maximum items that can be returns by most CDF endpoints.
-    chunk_size: ClassVar[int] = 1000
-
-    def validate_format(self, format_: str) -> Literal[FileFormat]:
-        if format_ in self.supported_formats:
-            return format_  # type: ignore[return-value]
-        raise ToolkitValueError(
-            f"Unsupported format {format_}. Supported formats are {humanize_collection(self.supported_formats)}."
-        )
-
-    @abstractmethod
-    def create_iterators(
-        self, format_: FileFormat, limit: int | None
-    ) -> Iterator[tuple[Schema, int, Iterable, Callable]]:
-        """Create an iterator for the specified format."""
-        raise NotImplementedError("This method should be implemented in subclasses.")
-
-
-class AssetCentricFinder(DataFinder, ABC, Generic[T_CogniteResource]):
-    def __init__(self, client: ToolkitClient, hierarchies: list[str], data_sets: list[str]):
-        self.client = client
-        self.hierarchies = hierarchies
-        self.data_sets = data_sets
-        self.loader = self._create_loader(client)
-        self._hierarchy_set = set(self.hierarchies)
-        self._data_set_set = set(self.data_sets)
-        self._used_labels: set[str] = set()
-        self._used_data_sets: set[str] = set()
-
-    @abstractmethod
-    def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
-        """Create the appropriate loader for the finder."""
-        raise NotImplementedError()
-
-    @lru_cache
-    def aggregate_count(self, hierarchies: tuple[str, ...], data_sets: tuple[str, ...]) -> int:
-        return self._aggregate_count(list(hierarchies), list(data_sets))
-
-    @abstractmethod
-    def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
-        raise NotImplementedError()
-
-    @abstractmethod
-    def _get_resource_columns(self) -> SchemaColumnList:
-        """Get the columns for the schema."""
-        raise NotImplementedError()
-
-    @abstractmethod
-    def create_resource_iterator(self, limit: int | None) -> Iterable:
-        raise NotImplementedError()
-
-    @abstractmethod
-    def _resource_processor(self, items: Iterable[T_CogniteResource]) -> list[tuple[str, list[dict[str, Any]]]]:
-        """Process the resources and return them in a format suitable for writing."""
-        raise NotImplementedError()
-
-    def _to_write(self, items: Iterable[T_CogniteResource]) -> list[dict[str, Any]]:
-        write_items: list[dict[str, Any]] = []
-        for item in items:
-            dumped = self.loader.dump_resource(item)
-            if "metadata" in dumped:
-                metadata = dumped.pop("metadata")
-                for key, value in metadata.items():
-                    dumped[f"metadata.{key}"] = value
-            if isinstance(dumped.get("labels"), list):
-                dumped["labels"] = [label["externalId"] for label in dumped["labels"]]
-                self._used_labels.update(dumped["labels"])
-            if "dataSetExternalId" in dumped:
-                self._used_data_sets.add(dumped["dataSetExternalId"])
-            write_items.append(dumped)
-        return write_items
-
-    def create_iterators(
-        self, format_: FileFormat, limit: int | None
-    ) -> Iterator[tuple[Schema, int, Iterable, Callable]]:
-        total = self.aggregate_count(tuple(self.hierarchies), tuple(self.data_sets))
-        columns = self._get_resource_columns()
-
-        iteration_count = total // self.chunk_size + (1 if total % self.chunk_size > 0 else 0)
-        if iteration_count == 0:
-            return
-
-        yield (
-            Schema(
-                display_name=self.loader.display_name,
-                format_=format_,
-                columns=columns,
-                folder_name=self.loader.folder_name,
-                kind=self.loader.kind,
-            ),
-            iteration_count,
-            self.create_resource_iterator(limit),
-            self._resource_processor,
-        )
-        if self._used_data_sets:
-            yield self._data_sets()
-        if self._used_labels:
-            yield self._labels()
-
-    def _data_sets(self) -> tuple[Schema, int, Iterable, Callable]:
-        data_sets = self.client.data_sets.retrieve_multiple(
-            external_ids=list(self._used_data_sets), ignore_unknown_ids=True
-        )
-        loader = DataSetsCRUD.create_loader(self.client)
-
-        def process_data_sets(items: DataSetList) -> list[tuple[str, list[dict[str, Any]]]]:
-            # All data sets are written to a single group, thus the empty string as the group key.
-            # (Group keys are for example used in CSV files to create separate files for each
-            # data set an asset belongs to.)
-            return [("", [loader.dump_resource(item) for item in items])]
-
-        return (
-            # YAML format does not need columns.
-            Schema(
-                display_name=loader.display_name,
-                format_="yaml",
-                columns=SchemaColumnList(),
-                folder_name=loader.folder_name,
-                kind=loader.kind,
-            ),
-            1,
-            [data_sets],
-            process_data_sets,
-        )
-
-    def _labels(self) -> tuple[Schema, int, Iterable, Callable]:
-        labels = self.client.labels.retrieve(external_id=list(self._used_labels))
-        loader = LabelCRUD.create_loader(self.client)
-
-        def process_labels(items: LabelDefinitionList) -> list[tuple[str, list[dict[str, Any]]]]:
-            # All labels are written to a single group, thus the empty string as the group key.
-            # (Group keys are for example used in CSV files to create separate files for each
-            # label an asset belongs to.)
-            return [("", [loader.dump_resource(item) for item in items])]
-
-        return (
-            # YAML format does not need columns.
-            Schema(
-                display_name=loader.display_name,
-                format_="yaml",
-                columns=SchemaColumnList(),
-                folder_name=loader.folder_name,
-                kind=loader.kind,
-            ),
-            1,
-            [labels],
-            process_labels,
-        )
-
-
-class AssetFinder(AssetCentricFinder[Asset]):
-    supported_formats = frozenset({"csv", "parquet", "yaml"})
-
-    def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
-        return AssetCRUD.create_loader(client)
-
-    def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
-        return self.client.assets.aggregate_count(
-            filter=AssetFilter(
-                data_set_ids=[{"externalId": item} for item in data_sets] or None,
-                asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
-            )
-        )
-
-    def create_resource_iterator(self, limit: int | None) -> Iterator:
-        return self.client.assets(
-            chunk_size=self.chunk_size,
-            asset_subtree_external_ids=self.hierarchies or None,
-            data_set_external_ids=self.data_sets or None,
-            limit=limit,
-        )
-
-    def _resource_processor(self, assets: Iterable[Asset]) -> list[tuple[str, list[dict[str, Any]]]]:
-        grouped_assets: list[tuple[str, list[dict[str, object]]]] = []
-        for group, asset_group in groupby(
-            sorted([(self._group(asset), asset) for asset in assets], key=lambda x: x[0]), key=lambda x: x[0]
-        ):
-            grouped_assets.append((group, self._to_write([asset for _, asset in asset_group])))
-        return grouped_assets
-
-    def _group(self, item: Asset) -> str:
-        if self.hierarchies and self.data_sets:
-            asset_external_id = self.client.lookup.assets.external_id(item.root_id or 0)
-            data_set_external_id = self.client.lookup.data_sets.external_id(item.data_set_id or 0)
-            if asset_external_id and data_set_external_id:
-                return f"{asset_external_id}.{data_set_external_id}"
-            elif asset_external_id:
-                return asset_external_id
-            elif data_set_external_id:
-                return data_set_external_id
-            return ""
-        elif self.hierarchies:
-            return self.client.lookup.assets.external_id(item.root_id or 0) or ""
-        elif self.data_sets:
-            return self.client.lookup.data_sets.external_id(item.data_set_id or 0) or ""
-        return ""
-
-    def _get_resource_columns(self) -> SchemaColumnList:
-        columns = SchemaColumnList(
-            [
-                SchemaColumn(name="externalId", type="string"),
-                SchemaColumn(name="name", type="string"),
-                SchemaColumn(name="parentExternalId", type="string"),
-                SchemaColumn(name="description", type="string"),
-                SchemaColumn(name="dataSetExternalId", type="string"),
-                SchemaColumn(name="source", type="string"),
-                SchemaColumn(name="labels", type="string", is_array=True),
-                SchemaColumn(name="geoLocation", type="json"),
-            ]
-        )
-        data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
-        root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
-        metadata_keys = metadata_key_counts(self.client, "assets", data_set_ids or None, root_ids or None)
-        sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
-        columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
-        return columns
-
-
-class FileMetadataFinder(AssetCentricFinder[FileMetadata]):
-    supported_formats = frozenset({"csv", "parquet"})
-
-    def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
-        return FileMetadataCRUD.create_loader(client)
-
-    def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
-        result = self.client.files.aggregate(
-            filter=FileMetadataFilter(
-                data_set_ids=[{"externalId": item} for item in data_sets] or None,
-                asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
-            )
-        )
-        return result[0].count if result else 0
-
-    def _get_resource_columns(self) -> SchemaColumnList:
-        columns = SchemaColumnList(
-            [
-                SchemaColumn(name="externalId", type="string"),
-                SchemaColumn(name="name", type="string"),
-                SchemaColumn(name="directory", type="string"),
-                SchemaColumn(name="source", type="string"),
-                SchemaColumn(name="mimeType", type="string"),
-                SchemaColumn(name="assetExternalIds", type="string", is_array=True),
-                SchemaColumn(name="dataSetExternalId", type="string"),
-                SchemaColumn(name="sourceCreatedTime", type="integer"),
-                SchemaColumn(name="sourceModifiedTime", type="integer"),
-                SchemaColumn(name="securityCategories", type="string", is_array=True),
-                SchemaColumn(name="labels", type="string", is_array=True),
-                SchemaColumn(name="geoLocation", type="json"),
-            ]
-        )
-        data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
-        root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
-        metadata_keys = metadata_key_counts(self.client, "files", data_set_ids or None, root_ids or None)
-        sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
-        columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
-        return columns
-
-    def create_resource_iterator(self, limit: int | None) -> Iterable:
-        return self.client.files(
-            chunk_size=self.chunk_size,
-            asset_subtree_external_ids=self.hierarchies or None,
-            data_set_external_ids=self.data_sets or None,
-            limit=limit,
-        )
-
-    def _resource_processor(self, items: Iterable[FileMetadata]) -> list[tuple[str, list[dict[str, Any]]]]:
-        return [("", self._to_write(items))]
-
-
-class TimeSeriesFinder(AssetCentricFinder[TimeSeries]):
-    supported_formats = frozenset({"csv", "parquet", "yaml"})
-
-    def _create_loader(self, client: ToolkitClient) -> TimeSeriesCRUD:
-        return TimeSeriesCRUD.create_loader(client)
-
-    def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
-        return self.client.time_series.aggregate_count(
-            filter=TimeSeriesFilter(
-                data_set_ids=[{"externalId": item} for item in data_sets] or None,
-                asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
-            )
-        )
-
-    def create_resource_iterator(self, limit: int | None) -> Iterator:
-        return self.client.time_series(
-            chunk_size=self.chunk_size,
-            asset_subtree_external_ids=self.hierarchies or None,
-            data_set_external_ids=self.data_sets or None,
-            limit=limit,
-        )
-
-    def _resource_processor(self, time_series: Iterable[TimeSeries]) -> list[tuple[str, list[dict[str, Any]]]]:
-        return [("", self._to_write(time_series))]
-
-    def _get_resource_columns(self) -> SchemaColumnList:
-        columns = SchemaColumnList(
-            [
-                SchemaColumn(name="externalId", type="string"),
-                SchemaColumn(name="name", type="string"),
-                SchemaColumn(name="isString", type="boolean"),
-                SchemaColumn(name="unit", type="string"),
-                SchemaColumn(name="unitExternalId", type="string"),
-                SchemaColumn(name="assetExternalId", type="string"),
-                SchemaColumn(name="isStep", type="boolean"),
-                SchemaColumn(name="description", type="string"),
-                SchemaColumn(name="dataSetExternalId", type="string"),
-                SchemaColumn(name="securityCategories", type="string", is_array=True),
-            ]
-        )
-        data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
-        root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
-        metadata_keys = metadata_key_counts(self.client, "timeseries", data_set_ids or None, root_ids or None)
-        sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
-        columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
-        return columns
-
-
-class EventFinder(AssetCentricFinder[Event]):
-    supported_formats = frozenset({"csv", "parquet"})
-
-    def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
-        return EventCRUD.create_loader(client)
-
-    def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
-        return self.client.events.aggregate_count(
-            filter=EventFilter(
-                data_set_ids=[{"externalId": item} for item in data_sets] or None,
-                asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
-            )
-        )
-
-    def _get_resource_columns(self) -> SchemaColumnList:
-        columns = SchemaColumnList(
-            [
-                SchemaColumn(name="externalId", type="string"),
-                SchemaColumn(name="dataSetExternalId", type="string"),
-                SchemaColumn(name="startTime", type="integer"),
-                SchemaColumn(name="endTime", type="integer"),
-                SchemaColumn(name="type", type="string"),
-                SchemaColumn(name="subtype", type="string"),
-                SchemaColumn(name="description", type="string"),
-                SchemaColumn(name="assetExternalIds", type="string", is_array=True),
-                SchemaColumn(name="source", type="string"),
-            ]
-        )
-        data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
-        root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
-        metadata_keys = metadata_key_counts(self.client, "events", data_set_ids or None, root_ids or None)
-        sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
-        columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
-        return columns
-
-    def create_resource_iterator(self, limit: int | None) -> Iterable:
-        return self.client.events(
-            chunk_size=self.chunk_size,
-            asset_subtree_external_ids=self.hierarchies or None,
-            data_set_external_ids=self.data_sets or None,
-            limit=limit,
-        )
-
-    def _resource_processor(self, items: Iterable[Event]) -> list[tuple[str, list[dict[str, Any]]]]:
-        return [("", self._to_write(items))]
-
-
-class DumpDataCommand(ToolkitCommand):
-    def dump_table(
-        self,
-        finder: DataFinder,
-        output_dir: Path,
-        clean: bool,
-        limit: int | None = None,
-        format_: str = "csv",
-        verbose: bool = False,
-        parallel_threshold: int = 10,
-        max_queue_size: int = 10,
-    ) -> None:
-        """Dumps data from CDF to a file
-
-        Args:
-            finder (DataFinder): The finder object to use for fetching data.
-            output_dir (Path): The directory to write the output files to.
-            clean (bool): Whether to clean the output directory before writing files.
-            limit (int | None, optional): The maximum number of rows to write. Defaults to None.
-            format_ (Literal["yaml", "csv", "parquet"], optional): The format of the output file. Defaults to "csv".
-            verbose (bool, optional): Whether to print detailed progress information. Defaults to False.
-            parallel_threshold (int, optional): The iteration threshold for parallel processing. Defaults to 10.
-            max_queue_size (int, optional): If using parallel processing, the maximum size of the queue. Defaults to 10.
-
-        """
-        valid_format = finder.validate_format(format_)
-        self.validate_directory(output_dir, clean)
-
-        console = Console()
-        # The ignore is used as MyPy does not understand that is_supported_format
-        # above guarantees that the format is valid.
-        for schema, iteration_count, resource_iterator, resource_processor in finder.create_iterators(
-            valid_format, limit
-        ):
-            writer_cls = TableFileWriter.get_write_cls(schema.format_)
-            row_counts = 0
-            t0 = time.perf_counter()
-            with writer_cls(schema, output_dir) as writer:
-                if iteration_count > parallel_threshold:
-                    executor = ProducerWorkerExecutor(
-                        download_iterable=resource_iterator,
-                        process=resource_processor,
-                        write=writer.write_rows,
-                        iteration_count=iteration_count,
-                        max_queue_size=max_queue_size,
-                        download_description=f"Downloading {schema.display_name}",
-                        process_description=f"Processing {schema.display_name}",
-                        write_description=f"Writing {schema.display_name} to file",
-                    )
-                    executor.run()
-                    executor.raise_on_error()
-                    row_counts = executor.total_items
-                else:
-                    for resources in track(
-                        resource_iterator, total=iteration_count, description=f"Dumping {schema.display_name}"
-                    ):
-                        row_counts += len(resources)
-                        processed = resource_processor(resources)
-                        writer.write_rows(processed)
-            elapsed = time.perf_counter() - t0
-            console.print(f"Dumped {row_counts:,} rows to {output_dir} in {elapsed:,.2f} seconds.")
-
-    @staticmethod
-    def validate_directory(output_dir: Path, clean: bool) -> None:
-        if output_dir.exists() and clean:
-            safe_rmtree(output_dir)
-        elif output_dir.exists():
-            raise ToolkitFileExistsError(f"Output directory {output_dir!s} already exists. Use --clean to remove it.")
-        elif output_dir.suffix:
-            raise ToolkitIsADirectoryError(f"Output directory {output_dir!s} is not a directory.")