cognite-toolkit 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -95,13 +95,13 @@ class UploadApp(typer.Typer):
  typer.echo("No selection made for deploying resources. Exiting.")
  raise typer.Exit(code=1)

- client = EnvironmentVariables.create_from_environment().get_client()
- cmd.run(
- lambda: cmd.upload(
- input_dir=input_dir,
- dry_run=dry_run,
- verbose=verbose,
- deploy_resources=deploy_resources,
- client=client,
- )
+ client = EnvironmentVariables.create_from_environment().get_client()
+ cmd.run(
+ lambda: cmd.upload(
+ input_dir=input_dir,
+ dry_run=dry_run,
+ verbose=verbose,
+ deploy_resources=deploy_resources,
+ client=client,
  )
+ )
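Both sides of this hunk carry the same statements; the rendered diff drops leading whitespace, and the shrinking byte count for `_upload_app.py` in the RECORD section further down (4267 → 4227) suggests the block was only re-indented. As a minimal, self-contained sketch of the deferred-call pattern used here, with stand-in names rather than the Toolkit's own `cmd.run`/`cmd.upload`:

```python
# Sketch only: the runner receives a zero-argument callable wrapping the actual
# upload, so one place owns error handling. Names and indentation are assumptions;
# they are not Toolkit APIs.
from collections.abc import Callable


def run(action: Callable[[], None]) -> None:
    # Stand-in for the command runner: a single place to catch and report errors.
    try:
        action()
    except Exception as error:
        print(f"Command failed: {error}")


def upload(input_dir: str, dry_run: bool, verbose: bool, deploy_resources: bool) -> None:
    print(f"Uploading from {input_dir} (dry_run={dry_run}, verbose={verbose}, deploy={deploy_resources})")


run(lambda: upload(input_dir="staging", dry_run=True, verbose=False, deploy_resources=False))
```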
@@ -11,7 +11,6 @@ from .build_cmd import BuildCommand
  from .clean import CleanCommand
  from .collect import CollectCommand
  from .deploy import DeployCommand
- from .dump_data import DumpDataCommand
  from .dump_resource import DumpResourceCommand
  from .featureflag import FeatureFlagCommand
  from .init import InitCommand
@@ -27,7 +26,6 @@ __all__ = [
  "CollectCommand",
  "DeployCommand",
  "DownloadCommand",
- "DumpDataCommand",
  "DumpResourceCommand",
  "FeatureFlagCommand",
  "InitCommand",
@@ -19,7 +19,7 @@ from cognite_toolkit._cdf_tk.exceptions import ToolkitMissingDependencyError, To
  from cognite_toolkit._cdf_tk.utils._auxiliary import get_concrete_subclasses
  from cognite_toolkit._cdf_tk.utils.collection import humanize_collection
  from cognite_toolkit._cdf_tk.utils.file import sanitize_filename
- from cognite_toolkit._cdf_tk.utils.table_writers import DataType
+ from cognite_toolkit._cdf_tk.utils.useful_types import DataType

  from ._base import T_IO, CellValue, Chunk, FileIO, SchemaColumn
  from ._compression import Compression, Uncompressed
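The `DataType` alias now comes from `utils.useful_types`; the RECORD section below shows its previous home, `utils/table_writers.py`, removed and `fileio/_writers.py` one byte smaller, which matches this one-line import change. Downstream code would import it from the new location, assuming the alias keeps the same literal values as the removed `table_writers.DataType` shown near the end of this diff:

```python
# New import location for DataType (the old module is removed in this release).
# Assumes the alias in useful_types accepts the same literals as the removed
# table_writers.DataType ("string", "integer", ..., "timestamp", "epoch").
from cognite_toolkit._cdf_tk.utils.useful_types import DataType

column_type: DataType = "timestamp"
print(column_type)
```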
@@ -12,7 +12,7 @@ jobs:
  environment: dev
  name: Deploy
  container:
- image: cognite/toolkit:0.7.0
+ image: cognite/toolkit:0.7.1
  env:
  CDF_CLUSTER: ${{ vars.CDF_CLUSTER }}
  CDF_PROJECT: ${{ vars.CDF_PROJECT }}
@@ -10,7 +10,7 @@ jobs:
  environment: dev
  name: Deploy Dry Run
  container:
- image: cognite/toolkit:0.7.0
+ image: cognite/toolkit:0.7.1
  env:
  CDF_CLUSTER: ${{ vars.CDF_CLUSTER }}
  CDF_PROJECT: ${{ vars.CDF_PROJECT }}
@@ -4,7 +4,7 @@ default_env = "<DEFAULT_ENV_PLACEHOLDER>"
  [modules]
  # This is the version of the modules. It should not be changed manually.
  # It will be updated by the 'cdf modules upgrade' command.
- version = "0.7.0"
+ version = "0.7.1"


  [plugins]
@@ -1 +1 @@
- __version__ = "0.7.0"
+ __version__ = "0.7.1"
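With both `_version.py` and the wheel metadata bumped, the installed version can be confirmed either from the module or from the distribution metadata. A short check, requiring cognite-toolkit 0.7.1 in the environment:

```python
# Two equivalent ways to read the installed Toolkit version.
from importlib.metadata import version

from cognite_toolkit._version import __version__

print(__version__)                 # "0.7.1"
print(version("cognite_toolkit"))  # "0.7.1"
```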
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: cognite_toolkit
- Version: 0.7.0
+ Version: 0.7.1
  Summary: Official Cognite Data Fusion tool for project templates and configuration deployment
  Project-URL: Homepage, https://docs.cognite.com/cdf/deploy/cdf_toolkit/
  Project-URL: Changelog, https://github.com/cognitedata/toolkit/releases
@@ -1,6 +1,6 @@
  cognite_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  cognite_toolkit/_cdf.py,sha256=PzDig6dgbDX5VL88AeijQuTeYb2SS_yvenw9gr4fnxY,5794
- cognite_toolkit/_version.py,sha256=RaANGbRu5e-vehwXI1-Qe2ggPPfs1TQaZj072JdbLk4,22
+ cognite_toolkit/_version.py,sha256=2KJZDSMOG7KS82AxYOrZ4ZihYxX0wjfUjDsIZh3L024,22
  cognite_toolkit/_cdf_tk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  cognite_toolkit/_cdf_tk/cdf_toml.py,sha256=VSWV9h44HusWIaKpWgjrOMrc3hDoPTTXBXlp6-NOrIM,9079
  cognite_toolkit/_cdf_tk/constants.py,sha256=3UpFZ60xXdqgPqqpqCITQuAvjnVExH_IlbASxoelvu8,7236
@@ -25,7 +25,7 @@ cognite_toolkit/_cdf_tk/apps/_profile_app.py,sha256=vSRJW54bEvIul8_4rOqyOYA7ztXx
  cognite_toolkit/_cdf_tk/apps/_purge.py,sha256=KYI1wFy7yHFEM1qJnTYc4_8E2FVGu4QhPsWsxop1sZA,14242
  cognite_toolkit/_cdf_tk/apps/_repo_app.py,sha256=jOf_s7oUWJqnRyz89JFiSzT2l8GlyQ7wqidHUQavGo0,1455
  cognite_toolkit/_cdf_tk/apps/_run.py,sha256=eXua4n0hW4qRMkzaxR0PiZh-JFLf8gnWw1_5O-0-vm0,8987
- cognite_toolkit/_cdf_tk/apps/_upload_app.py,sha256=BgJrcm_KikLLdr2ZUPG9CdL2hrLr7T0gR6cxih5kll0,4267
+ cognite_toolkit/_cdf_tk/apps/_upload_app.py,sha256=1nF0-7oCAXLlmTGyUOKTmxkZqvA0Xo6U6lqk-SqKmCc,4227
  cognite_toolkit/_cdf_tk/builders/__init__.py,sha256=Y-AJ4VrcUCRquGNEgDCiwmWW3iGWnJl2DrL17gsUIBg,1172
  cognite_toolkit/_cdf_tk/builders/_base.py,sha256=N32Y17hfepp45rMW_o4qeUY9nsysmtcxpX4GkF-tsio,7829
  cognite_toolkit/_cdf_tk/builders/_datamodels.py,sha256=hN3fWQAktrWdaGAItZ0tHpBXqJDu0JfH6t7pO7EIl2Q,3541
@@ -100,7 +100,7 @@ cognite_toolkit/_cdf_tk/client/data_classes/streams.py,sha256=DHSDrBax81fUzneIik
  cognite_toolkit/_cdf_tk/client/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  cognite_toolkit/_cdf_tk/client/utils/_concurrency.py,sha256=3GtQbKDaosyKHEt-KzxKK9Yie4TvZPdoou2vUk6dUa8,2298
  cognite_toolkit/_cdf_tk/client/utils/_http_client.py,sha256=oXNKrIaizG4WiSAhL_kSCHAuL4aaaEhCU4pOJGxh6Xs,483
- cognite_toolkit/_cdf_tk/commands/__init__.py,sha256=OJYtHiERtUBXm3cjUTyPVaYIMVQpu9HJv1QNGPL-AIQ,1418
+ cognite_toolkit/_cdf_tk/commands/__init__.py,sha256=gHA3yWI3UacMD79ZpCyh8MjA1fzuEg5pxZGts2VsXLs,1356
  cognite_toolkit/_cdf_tk/commands/_base.py,sha256=1gl8Y-yqfedRMfdbwM3iPTIUIZriX1UvC1deLsJSJwM,2667
  cognite_toolkit/_cdf_tk/commands/_changes.py,sha256=fvw2C5N2BVf-7MUpiB1FkDVCJ0xIy4lfDyFgpWaLPeo,24651
  cognite_toolkit/_cdf_tk/commands/_cli_commands.py,sha256=TK6U_rm6VZT_V941kTyHMoulWgJzbDC8YIIQDPJ5x3w,1011
@@ -116,7 +116,6 @@ cognite_toolkit/_cdf_tk/commands/build_cmd.py,sha256=6m-lK0vccje1gaQ_fd68UvA4Cbh
  cognite_toolkit/_cdf_tk/commands/clean.py,sha256=KDcUn1MEpvk_K7WqQPBiZcIlGV61JVG6D0DcYUXj7BM,16567
  cognite_toolkit/_cdf_tk/commands/collect.py,sha256=zBMKhhvjOpuASMnwP0eeHRI02tANcvFEZgv0CQO1ECc,627
  cognite_toolkit/_cdf_tk/commands/deploy.py,sha256=PO9r9iK1UEoDdoATS4hgjCP11DLGc2xSaY0g14nyATY,23519
- cognite_toolkit/_cdf_tk/commands/dump_data.py,sha256=8l4M2kqV4DjiV5js5s7EbFVNxV0Np4ld8ogw19vaJp0,21804
  cognite_toolkit/_cdf_tk/commands/dump_resource.py,sha256=ylAFST3GgkWT1Qa-JIzmQXbrQgNCB1UrptrBf3WsyvY,39658
  cognite_toolkit/_cdf_tk/commands/featureflag.py,sha256=lgLMwuNIwFjvvKn1sNMunkq4VTwdNqXtrZfdGFTrNcI,968
  cognite_toolkit/_cdf_tk/commands/init.py,sha256=pcxFhZheXm3FPU1pkeh10M0WXPg7EcLFUgJlrE817tE,9257
@@ -282,7 +281,6 @@ cognite_toolkit/_cdf_tk/utils/progress_tracker.py,sha256=LGpC22iSTTlo6FWi38kqBu_
  cognite_toolkit/_cdf_tk/utils/repository.py,sha256=voQLZ6NiNvdAFxqeWHbvzDLsLHl6spjQBihiLyCsGW8,4104
  cognite_toolkit/_cdf_tk/utils/sentry_utils.py,sha256=Q3ekrR0bWMtlPVQrfUSsETlkLIaDUZ2u-RdNFFr9-dg,564
  cognite_toolkit/_cdf_tk/utils/sql_parser.py,sha256=jernu2amPQ54cQZ4vFZm1gEhFZfGcjU-yLQQG_RFo_M,6458
- cognite_toolkit/_cdf_tk/utils/table_writers.py,sha256=6BS_CMsIY5WE2O9u693Q4b0b-0E3-nlTuQ7NXk9OzX4,17870
  cognite_toolkit/_cdf_tk/utils/text.py,sha256=1-LQMo633_hEhNhishQo7Buj-7np5Pe4qKk0TQofMzE,3906
  cognite_toolkit/_cdf_tk/utils/thread_safe_dict.py,sha256=NbRHcZvWpF9xHP5OkOMGFpxrPNbi0Q3Eea6PUNbGlt4,3426
  cognite_toolkit/_cdf_tk/utils/useful_types.py,sha256=oK88W6G_aK3hebORSQKZjWrq7jG-pO2lkLWSWYMlngM,1872
@@ -291,7 +289,7 @@ cognite_toolkit/_cdf_tk/utils/fileio/__init__.py,sha256=0rJsL3jClj_smxh_Omqchf0K
  cognite_toolkit/_cdf_tk/utils/fileio/_base.py,sha256=eC6mRIwSD4LjyFa83BoBnhO0t3l-ctQMW295LIyxXLk,827
  cognite_toolkit/_cdf_tk/utils/fileio/_compression.py,sha256=8BAPgg5OKc3vkEEkqOvYsuyh12iXVNuEmC0omWwyJNQ,2355
  cognite_toolkit/_cdf_tk/utils/fileio/_readers.py,sha256=i9TTqG2aml0B2Z6ZFKe7Z-bOAOa-wHz3fEemJGvIQww,15813
- cognite_toolkit/_cdf_tk/utils/fileio/_writers.py,sha256=4buAPp73Qfc0hw_LMyFI3g2DhdM4hbrasXuwMCiAcCQ,17732
+ cognite_toolkit/_cdf_tk/utils/fileio/_writers.py,sha256=mc23m0kJgl57FUDvwLmS7yR3xVZWQguPJa_63-qQ_L0,17731
  cognite_toolkit/_cdf_tk/utils/http_client/__init__.py,sha256=G8b7Bg4yIet5R4Igh3dS2SntWzE6I0iTGBeNlNsSxkQ,857
  cognite_toolkit/_cdf_tk/utils/http_client/_client.py,sha256=NTRfloXkCiS_rl5Vl1D_hsyTTowMKWDsiIR4oGwTADI,11208
  cognite_toolkit/_cdf_tk/utils/http_client/_data_classes.py,sha256=gNEJLb-tCoRh-OQA0BcJpESWl416ctC_6xKhWdwI4BU,13920
@@ -302,13 +300,13 @@ cognite_toolkit/_repo_files/.gitignore,sha256=ip9kf9tcC5OguF4YF4JFEApnKYw0nG0vPi
  cognite_toolkit/_repo_files/AzureDevOps/.devops/README.md,sha256=OLA0D7yCX2tACpzvkA0IfkgQ4_swSd-OlJ1tYcTBpsA,240
  cognite_toolkit/_repo_files/AzureDevOps/.devops/deploy-pipeline.yml,sha256=brULcs8joAeBC_w_aoWjDDUHs3JheLMIR9ajPUK96nc,693
  cognite_toolkit/_repo_files/AzureDevOps/.devops/dry-run-pipeline.yml,sha256=OBFDhFWK1mlT4Dc6mDUE2Es834l8sAlYG50-5RxRtHk,723
- cognite_toolkit/_repo_files/GitHub/.github/workflows/deploy.yaml,sha256=S8MFGAgtuHTYilFDxn907tu9kRL65RQO1XbFwzIWTJk,666
- cognite_toolkit/_repo_files/GitHub/.github/workflows/dry-run.yaml,sha256=obiNW_GgW2PeViZmYU0pUBh1KZvixdzjPmwEzBKQl4k,2429
- cognite_toolkit/_resources/cdf.toml,sha256=2plDWdhchZQwul76okB71n0RbjV5bBKKo72gwDWP7Eo,474
+ cognite_toolkit/_repo_files/GitHub/.github/workflows/deploy.yaml,sha256=PpKK3jflxLwNjCldzTWYHhg2hQ0Omi1BonQrnjWmgXo,666
+ cognite_toolkit/_repo_files/GitHub/.github/workflows/dry-run.yaml,sha256=ASOi_Om6-yPgKXo8NQE4xSju0f7c0BCGPVjFlNKTyeg,2429
+ cognite_toolkit/_resources/cdf.toml,sha256=A4-O_130gdFgF96eY8qDobuPNcGvhb2LPE6Rbs8PnkI,474
  cognite_toolkit/demo/__init__.py,sha256=-m1JoUiwRhNCL18eJ6t7fZOL7RPfowhCuqhYFtLgrss,72
  cognite_toolkit/demo/_base.py,sha256=6xKBUQpXZXGQ3fJ5f7nj7oT0s2n7OTAGIa17ZlKHZ5U,8052
- cognite_toolkit-0.7.0.dist-info/METADATA,sha256=w2PxW2DrLeWISkNeWn3vNiTIZdu6k_NzAIOFACAOVQE,4500
- cognite_toolkit-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- cognite_toolkit-0.7.0.dist-info/entry_points.txt,sha256=JlR7MH1_UMogC3QOyN4-1l36VbrCX9xUdQoHGkuJ6-4,83
- cognite_toolkit-0.7.0.dist-info/licenses/LICENSE,sha256=CW0DRcx5tL-pCxLEN7ts2S9g2sLRAsWgHVEX4SN9_Mc,752
- cognite_toolkit-0.7.0.dist-info/RECORD,,
+ cognite_toolkit-0.7.1.dist-info/METADATA,sha256=n8jC9M1BlwW_IosE5ybERRNMUPxfyKTxvN4QDL7PltM,4500
+ cognite_toolkit-0.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ cognite_toolkit-0.7.1.dist-info/entry_points.txt,sha256=JlR7MH1_UMogC3QOyN4-1l36VbrCX9xUdQoHGkuJ6-4,83
+ cognite_toolkit-0.7.1.dist-info/licenses/LICENSE,sha256=CW0DRcx5tL-pCxLEN7ts2S9g2sLRAsWgHVEX4SN9_Mc,752
+ cognite_toolkit-0.7.1.dist-info/RECORD,,
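The RECORD changes amount to new hashes for the touched files, the renamed 0.7.1 dist-info entries, and two deletions: `commands/dump_data.py` and `utils/table_writers.py` (both shown in full below). A small check against an installed wheel, using only the standard library:

```python
# Confirm that the two removed modules are absent from an installed 0.7.1 wheel.
# Paths are taken from the RECORD diff above.
from importlib.metadata import files

installed = {str(path) for path in files("cognite_toolkit") or []}
for removed in (
    "cognite_toolkit/_cdf_tk/commands/dump_data.py",
    "cognite_toolkit/_cdf_tk/utils/table_writers.py",
):
    print(removed, "-> still present" if removed in installed else "-> removed")
```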
@@ -1,489 +0,0 @@
1
- import time
2
- from abc import ABC, abstractmethod
3
- from collections.abc import Callable, Iterable, Iterator
4
- from functools import lru_cache
5
- from itertools import groupby
6
- from pathlib import Path
7
- from typing import Any, ClassVar, Generic, Literal
8
-
9
- from cognite.client.data_classes import (
10
- Asset,
11
- AssetFilter,
12
- DataSetList,
13
- Event,
14
- EventFilter,
15
- FileMetadata,
16
- FileMetadataFilter,
17
- LabelDefinitionList,
18
- TimeSeries,
19
- TimeSeriesFilter,
20
- )
21
- from cognite.client.data_classes._base import T_CogniteResource
22
- from rich.console import Console
23
- from rich.progress import track
24
-
25
- from cognite_toolkit._cdf_tk.client import ToolkitClient
26
- from cognite_toolkit._cdf_tk.commands._base import ToolkitCommand
27
- from cognite_toolkit._cdf_tk.cruds import (
28
- AssetCRUD,
29
- DataSetsCRUD,
30
- EventCRUD,
31
- FileMetadataCRUD,
32
- LabelCRUD,
33
- ResourceCRUD,
34
- TimeSeriesCRUD,
35
- )
36
- from cognite_toolkit._cdf_tk.exceptions import (
37
- ToolkitFileExistsError,
38
- ToolkitIsADirectoryError,
39
- ToolkitValueError,
40
- )
41
- from cognite_toolkit._cdf_tk.utils import humanize_collection
42
- from cognite_toolkit._cdf_tk.utils.cdf import metadata_key_counts
43
- from cognite_toolkit._cdf_tk.utils.file import safe_rmtree
44
- from cognite_toolkit._cdf_tk.utils.producer_worker import ProducerWorkerExecutor
45
- from cognite_toolkit._cdf_tk.utils.table_writers import (
46
- FileFormat,
47
- Schema,
48
- SchemaColumn,
49
- SchemaColumnList,
50
- TableFileWriter,
51
- )
52
-
53
-
54
- class DataFinder:
55
- supported_formats: ClassVar[frozenset[FileFormat]] = frozenset()
56
- # This is the standard maximum items that can be returns by most CDF endpoints.
57
- chunk_size: ClassVar[int] = 1000
58
-
59
- def validate_format(self, format_: str) -> Literal[FileFormat]:
60
- if format_ in self.supported_formats:
61
- return format_ # type: ignore[return-value]
62
- raise ToolkitValueError(
63
- f"Unsupported format {format_}. Supported formats are {humanize_collection(self.supported_formats)}."
64
- )
65
-
66
- @abstractmethod
67
- def create_iterators(
68
- self, format_: FileFormat, limit: int | None
69
- ) -> Iterator[tuple[Schema, int, Iterable, Callable]]:
70
- """Create an iterator for the specified format."""
71
- raise NotImplementedError("This method should be implemented in subclasses.")
72
-
73
-
74
- class AssetCentricFinder(DataFinder, ABC, Generic[T_CogniteResource]):
75
- def __init__(self, client: ToolkitClient, hierarchies: list[str], data_sets: list[str]):
76
- self.client = client
77
- self.hierarchies = hierarchies
78
- self.data_sets = data_sets
79
- self.loader = self._create_loader(client)
80
- self._hierarchy_set = set(self.hierarchies)
81
- self._data_set_set = set(self.data_sets)
82
- self._used_labels: set[str] = set()
83
- self._used_data_sets: set[str] = set()
84
-
85
- @abstractmethod
86
- def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
87
- """Create the appropriate loader for the finder."""
88
- raise NotImplementedError()
89
-
90
- @lru_cache
91
- def aggregate_count(self, hierarchies: tuple[str, ...], data_sets: tuple[str, ...]) -> int:
92
- return self._aggregate_count(list(hierarchies), list(data_sets))
93
-
94
- @abstractmethod
95
- def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
96
- raise NotImplementedError()
97
-
98
- @abstractmethod
99
- def _get_resource_columns(self) -> SchemaColumnList:
100
- """Get the columns for the schema."""
101
- raise NotImplementedError()
102
-
103
- @abstractmethod
104
- def create_resource_iterator(self, limit: int | None) -> Iterable:
105
- raise NotImplementedError()
106
-
107
- @abstractmethod
108
- def _resource_processor(self, items: Iterable[T_CogniteResource]) -> list[tuple[str, list[dict[str, Any]]]]:
109
- """Process the resources and return them in a format suitable for writing."""
110
- raise NotImplementedError()
111
-
112
- def _to_write(self, items: Iterable[T_CogniteResource]) -> list[dict[str, Any]]:
113
- write_items: list[dict[str, Any]] = []
114
- for item in items:
115
- dumped = self.loader.dump_resource(item)
116
- if "metadata" in dumped:
117
- metadata = dumped.pop("metadata")
118
- for key, value in metadata.items():
119
- dumped[f"metadata.{key}"] = value
120
- if isinstance(dumped.get("labels"), list):
121
- dumped["labels"] = [label["externalId"] for label in dumped["labels"]]
122
- self._used_labels.update(dumped["labels"])
123
- if "dataSetExternalId" in dumped:
124
- self._used_data_sets.add(dumped["dataSetExternalId"])
125
- write_items.append(dumped)
126
- return write_items
127
-
128
- def create_iterators(
129
- self, format_: FileFormat, limit: int | None
130
- ) -> Iterator[tuple[Schema, int, Iterable, Callable]]:
131
- total = self.aggregate_count(tuple(self.hierarchies), tuple(self.data_sets))
132
- columns = self._get_resource_columns()
133
-
134
- iteration_count = total // self.chunk_size + (1 if total % self.chunk_size > 0 else 0)
135
- if iteration_count == 0:
136
- return
137
-
138
- yield (
139
- Schema(
140
- display_name=self.loader.display_name,
141
- format_=format_,
142
- columns=columns,
143
- folder_name=self.loader.folder_name,
144
- kind=self.loader.kind,
145
- ),
146
- iteration_count,
147
- self.create_resource_iterator(limit),
148
- self._resource_processor,
149
- )
150
- if self._used_data_sets:
151
- yield self._data_sets()
152
- if self._used_labels:
153
- yield self._labels()
154
-
155
- def _data_sets(self) -> tuple[Schema, int, Iterable, Callable]:
156
- data_sets = self.client.data_sets.retrieve_multiple(
157
- external_ids=list(self._used_data_sets), ignore_unknown_ids=True
158
- )
159
- loader = DataSetsCRUD.create_loader(self.client)
160
-
161
- def process_data_sets(items: DataSetList) -> list[tuple[str, list[dict[str, Any]]]]:
162
- # All data sets are written to a single group, thus the empty string as the group key.
163
- # (Group keys are for example used in CSV files to create separate files for each
164
- # data set an asset belongs to.)
165
- return [("", [loader.dump_resource(item) for item in items])]
166
-
167
- return (
168
- # YAML format does not need columns.
169
- Schema(
170
- display_name=loader.display_name,
171
- format_="yaml",
172
- columns=SchemaColumnList(),
173
- folder_name=loader.folder_name,
174
- kind=loader.kind,
175
- ),
176
- 1,
177
- [data_sets],
178
- process_data_sets,
179
- )
180
-
181
- def _labels(self) -> tuple[Schema, int, Iterable, Callable]:
182
- labels = self.client.labels.retrieve(external_id=list(self._used_labels))
183
- loader = LabelCRUD.create_loader(self.client)
184
-
185
- def process_labels(items: LabelDefinitionList) -> list[tuple[str, list[dict[str, Any]]]]:
186
- # All labels are written to a single group, thus the empty string as the group key.
187
- # (Group keys are for example used in CSV files to create separate files for each
188
- # label an asset belongs to.)
189
- return [("", [loader.dump_resource(item) for item in items])]
190
-
191
- return (
192
- # YAML format does not need columns.
193
- Schema(
194
- display_name=loader.display_name,
195
- format_="yaml",
196
- columns=SchemaColumnList(),
197
- folder_name=loader.folder_name,
198
- kind=loader.kind,
199
- ),
200
- 1,
201
- [labels],
202
- process_labels,
203
- )
204
-
205
-
206
- class AssetFinder(AssetCentricFinder[Asset]):
207
- supported_formats = frozenset({"csv", "parquet", "yaml"})
208
-
209
- def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
210
- return AssetCRUD.create_loader(client)
211
-
212
- def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
213
- return self.client.assets.aggregate_count(
214
- filter=AssetFilter(
215
- data_set_ids=[{"externalId": item} for item in data_sets] or None,
216
- asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
217
- )
218
- )
219
-
220
- def create_resource_iterator(self, limit: int | None) -> Iterator:
221
- return self.client.assets(
222
- chunk_size=self.chunk_size,
223
- asset_subtree_external_ids=self.hierarchies or None,
224
- data_set_external_ids=self.data_sets or None,
225
- limit=limit,
226
- )
227
-
228
- def _resource_processor(self, assets: Iterable[Asset]) -> list[tuple[str, list[dict[str, Any]]]]:
229
- grouped_assets: list[tuple[str, list[dict[str, object]]]] = []
230
- for group, asset_group in groupby(
231
- sorted([(self._group(asset), asset) for asset in assets], key=lambda x: x[0]), key=lambda x: x[0]
232
- ):
233
- grouped_assets.append((group, self._to_write([asset for _, asset in asset_group])))
234
- return grouped_assets
235
-
236
- def _group(self, item: Asset) -> str:
237
- if self.hierarchies and self.data_sets:
238
- asset_external_id = self.client.lookup.assets.external_id(item.root_id or 0)
239
- data_set_external_id = self.client.lookup.data_sets.external_id(item.data_set_id or 0)
240
- if asset_external_id and data_set_external_id:
241
- return f"{asset_external_id}.{data_set_external_id}"
242
- elif asset_external_id:
243
- return asset_external_id
244
- elif data_set_external_id:
245
- return data_set_external_id
246
- return ""
247
- elif self.hierarchies:
248
- return self.client.lookup.assets.external_id(item.root_id or 0) or ""
249
- elif self.data_sets:
250
- return self.client.lookup.data_sets.external_id(item.data_set_id or 0) or ""
251
- return ""
252
-
253
- def _get_resource_columns(self) -> SchemaColumnList:
254
- columns = SchemaColumnList(
255
- [
256
- SchemaColumn(name="externalId", type="string"),
257
- SchemaColumn(name="name", type="string"),
258
- SchemaColumn(name="parentExternalId", type="string"),
259
- SchemaColumn(name="description", type="string"),
260
- SchemaColumn(name="dataSetExternalId", type="string"),
261
- SchemaColumn(name="source", type="string"),
262
- SchemaColumn(name="labels", type="string", is_array=True),
263
- SchemaColumn(name="geoLocation", type="json"),
264
- ]
265
- )
266
- data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
267
- root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
268
- metadata_keys = metadata_key_counts(self.client, "assets", data_set_ids or None, root_ids or None)
269
- sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
270
- columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
271
- return columns
272
-
273
-
274
- class FileMetadataFinder(AssetCentricFinder[FileMetadata]):
275
- supported_formats = frozenset({"csv", "parquet"})
276
-
277
- def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
278
- return FileMetadataCRUD.create_loader(client)
279
-
280
- def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
281
- result = self.client.files.aggregate(
282
- filter=FileMetadataFilter(
283
- data_set_ids=[{"externalId": item} for item in data_sets] or None,
284
- asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
285
- )
286
- )
287
- return result[0].count if result else 0
288
-
289
- def _get_resource_columns(self) -> SchemaColumnList:
290
- columns = SchemaColumnList(
291
- [
292
- SchemaColumn(name="externalId", type="string"),
293
- SchemaColumn(name="name", type="string"),
294
- SchemaColumn(name="directory", type="string"),
295
- SchemaColumn(name="source", type="string"),
296
- SchemaColumn(name="mimeType", type="string"),
297
- SchemaColumn(name="assetExternalIds", type="string", is_array=True),
298
- SchemaColumn(name="dataSetExternalId", type="string"),
299
- SchemaColumn(name="sourceCreatedTime", type="integer"),
300
- SchemaColumn(name="sourceModifiedTime", type="integer"),
301
- SchemaColumn(name="securityCategories", type="string", is_array=True),
302
- SchemaColumn(name="labels", type="string", is_array=True),
303
- SchemaColumn(name="geoLocation", type="json"),
304
- ]
305
- )
306
- data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
307
- root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
308
- metadata_keys = metadata_key_counts(self.client, "files", data_set_ids or None, root_ids or None)
309
- sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
310
- columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
311
- return columns
312
-
313
- def create_resource_iterator(self, limit: int | None) -> Iterable:
314
- return self.client.files(
315
- chunk_size=self.chunk_size,
316
- asset_subtree_external_ids=self.hierarchies or None,
317
- data_set_external_ids=self.data_sets or None,
318
- limit=limit,
319
- )
320
-
321
- def _resource_processor(self, items: Iterable[FileMetadata]) -> list[tuple[str, list[dict[str, Any]]]]:
322
- return [("", self._to_write(items))]
323
-
324
-
325
- class TimeSeriesFinder(AssetCentricFinder[TimeSeries]):
326
- supported_formats = frozenset({"csv", "parquet", "yaml"})
327
-
328
- def _create_loader(self, client: ToolkitClient) -> TimeSeriesCRUD:
329
- return TimeSeriesCRUD.create_loader(client)
330
-
331
- def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
332
- return self.client.time_series.aggregate_count(
333
- filter=TimeSeriesFilter(
334
- data_set_ids=[{"externalId": item} for item in data_sets] or None,
335
- asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
336
- )
337
- )
338
-
339
- def create_resource_iterator(self, limit: int | None) -> Iterator:
340
- return self.client.time_series(
341
- chunk_size=self.chunk_size,
342
- asset_subtree_external_ids=self.hierarchies or None,
343
- data_set_external_ids=self.data_sets or None,
344
- limit=limit,
345
- )
346
-
347
- def _resource_processor(self, time_series: Iterable[TimeSeries]) -> list[tuple[str, list[dict[str, Any]]]]:
348
- return [("", self._to_write(time_series))]
349
-
350
- def _get_resource_columns(self) -> SchemaColumnList:
351
- columns = SchemaColumnList(
352
- [
353
- SchemaColumn(name="externalId", type="string"),
354
- SchemaColumn(name="name", type="string"),
355
- SchemaColumn(name="isString", type="boolean"),
356
- SchemaColumn(name="unit", type="string"),
357
- SchemaColumn(name="unitExternalId", type="string"),
358
- SchemaColumn(name="assetExternalId", type="string"),
359
- SchemaColumn(name="isStep", type="boolean"),
360
- SchemaColumn(name="description", type="string"),
361
- SchemaColumn(name="dataSetExternalId", type="string"),
362
- SchemaColumn(name="securityCategories", type="string", is_array=True),
363
- ]
364
- )
365
- data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
366
- root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
367
- metadata_keys = metadata_key_counts(self.client, "timeseries", data_set_ids or None, root_ids or None)
368
- sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
369
- columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
370
- return columns
371
-
372
-
373
- class EventFinder(AssetCentricFinder[Event]):
374
- supported_formats = frozenset({"csv", "parquet"})
375
-
376
- def _create_loader(self, client: ToolkitClient) -> ResourceCRUD:
377
- return EventCRUD.create_loader(client)
378
-
379
- def _aggregate_count(self, hierarchies: list[str], data_sets: list[str]) -> int:
380
- return self.client.events.aggregate_count(
381
- filter=EventFilter(
382
- data_set_ids=[{"externalId": item} for item in data_sets] or None,
383
- asset_subtree_ids=[{"externalId": item} for item in hierarchies] or None,
384
- )
385
- )
386
-
387
- def _get_resource_columns(self) -> SchemaColumnList:
388
- columns = SchemaColumnList(
389
- [
390
- SchemaColumn(name="externalId", type="string"),
391
- SchemaColumn(name="dataSetExternalId", type="string"),
392
- SchemaColumn(name="startTime", type="integer"),
393
- SchemaColumn(name="endTime", type="integer"),
394
- SchemaColumn(name="type", type="string"),
395
- SchemaColumn(name="subtype", type="string"),
396
- SchemaColumn(name="description", type="string"),
397
- SchemaColumn(name="assetExternalIds", type="string", is_array=True),
398
- SchemaColumn(name="source", type="string"),
399
- ]
400
- )
401
- data_set_ids = self.client.lookup.data_sets.id(self.data_sets) if self.data_sets else []
402
- root_ids = self.client.lookup.assets.id(self.hierarchies) if self.hierarchies else []
403
- metadata_keys = metadata_key_counts(self.client, "events", data_set_ids or None, root_ids or None)
404
- sorted_keys = sorted([key for key, count in metadata_keys if count > 0])
405
- columns.extend([SchemaColumn(name=f"metadata.{key}", type="string") for key in sorted_keys])
406
- return columns
407
-
408
- def create_resource_iterator(self, limit: int | None) -> Iterable:
409
- return self.client.events(
410
- chunk_size=self.chunk_size,
411
- asset_subtree_external_ids=self.hierarchies or None,
412
- data_set_external_ids=self.data_sets or None,
413
- limit=limit,
414
- )
415
-
416
- def _resource_processor(self, items: Iterable[Event]) -> list[tuple[str, list[dict[str, Any]]]]:
417
- return [("", self._to_write(items))]
418
-
419
-
420
- class DumpDataCommand(ToolkitCommand):
421
- def dump_table(
422
- self,
423
- finder: DataFinder,
424
- output_dir: Path,
425
- clean: bool,
426
- limit: int | None = None,
427
- format_: str = "csv",
428
- verbose: bool = False,
429
- parallel_threshold: int = 10,
430
- max_queue_size: int = 10,
431
- ) -> None:
432
- """Dumps data from CDF to a file
433
-
434
- Args:
435
- finder (DataFinder): The finder object to use for fetching data.
436
- output_dir (Path): The directory to write the output files to.
437
- clean (bool): Whether to clean the output directory before writing files.
438
- limit (int | None, optional): The maximum number of rows to write. Defaults to None.
439
- format_ (Literal["yaml", "csv", "parquet"], optional): The format of the output file. Defaults to "csv".
440
- verbose (bool, optional): Whether to print detailed progress information. Defaults to False.
441
- parallel_threshold (int, optional): The iteration threshold for parallel processing. Defaults to 10.
442
- max_queue_size (int, optional): If using parallel processing, the maximum size of the queue. Defaults to 10.
443
-
444
- """
445
- valid_format = finder.validate_format(format_)
446
- self.validate_directory(output_dir, clean)
447
-
448
- console = Console()
449
- # The ignore is used as MyPy does not understand that is_supported_format
450
- # above guarantees that the format is valid.
451
- for schema, iteration_count, resource_iterator, resource_processor in finder.create_iterators(
452
- valid_format, limit
453
- ):
454
- writer_cls = TableFileWriter.get_write_cls(schema.format_)
455
- row_counts = 0
456
- t0 = time.perf_counter()
457
- with writer_cls(schema, output_dir) as writer:
458
- if iteration_count > parallel_threshold:
459
- executor = ProducerWorkerExecutor(
460
- download_iterable=resource_iterator,
461
- process=resource_processor,
462
- write=writer.write_rows,
463
- iteration_count=iteration_count,
464
- max_queue_size=max_queue_size,
465
- download_description=f"Downloading {schema.display_name}",
466
- process_description=f"Processing {schema.display_name}",
467
- write_description=f"Writing {schema.display_name} to file",
468
- )
469
- executor.run()
470
- executor.raise_on_error()
471
- row_counts = executor.total_items
472
- else:
473
- for resources in track(
474
- resource_iterator, total=iteration_count, description=f"Dumping {schema.display_name}"
475
- ):
476
- row_counts += len(resources)
477
- processed = resource_processor(resources)
478
- writer.write_rows(processed)
479
- elapsed = time.perf_counter() - t0
480
- console.print(f"Dumped {row_counts:,} rows to {output_dir} in {elapsed:,.2f} seconds.")
481
-
482
- @staticmethod
483
- def validate_directory(output_dir: Path, clean: bool) -> None:
484
- if output_dir.exists() and clean:
485
- safe_rmtree(output_dir)
486
- elif output_dir.exists():
487
- raise ToolkitFileExistsError(f"Output directory {output_dir!s} already exists. Use --clean to remove it.")
488
- elif output_dir.suffix:
489
- raise ToolkitIsADirectoryError(f"Output directory {output_dir!s} is not a directory.")
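The removed `dump_data.py` grouped fetched resources before writing, so that, for example, assets from different root hierarchies or data sets land in separate files. A self-contained sketch of that grouping step, in the spirit of `AssetFinder._resource_processor` (the helper name and sample rows are illustrative, not Toolkit APIs):

```python
# Sort rows by a group key, then groupby, so the table writer can route each
# (group, rows) pair to its own output file.
from itertools import groupby
from typing import Any, Callable


def group_rows(
    rows: list[dict[str, Any]], key_of: Callable[[dict[str, Any]], str]
) -> list[tuple[str, list[dict[str, Any]]]]:
    keyed = sorted(((key_of(row), row) for row in rows), key=lambda pair: pair[0])
    return [(key, [row for _, row in grp]) for key, grp in groupby(keyed, key=lambda pair: pair[0])]


sample = [
    {"externalId": "pump_1", "rootExternalId": "plant_a"},
    {"externalId": "pump_2", "rootExternalId": "plant_b"},
    {"externalId": "pump_3", "rootExternalId": "plant_a"},
]
print(group_rows(sample, key_of=lambda row: row["rootExternalId"]))
# [('plant_a', [pump_1, pump_3]), ('plant_b', [pump_2])]
```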
@@ -1,434 +0,0 @@
1
- import csv
2
- import importlib.util
3
- import json
4
- import sys
5
- from abc import abstractmethod
6
- from collections.abc import Collection, Iterator, Mapping, Sequence
7
- from dataclasses import dataclass
8
- from datetime import date, datetime, timezone
9
- from functools import lru_cache
10
- from io import TextIOWrapper
11
- from pathlib import Path
12
- from types import MappingProxyType
13
- from typing import IO, TYPE_CHECKING, Any, ClassVar, Generic, Literal, SupportsIndex, TypeAlias, TypeVar, overload
14
-
15
- from cognite.client.data_classes.data_modeling import data_types as dt
16
- from cognite.client.data_classes.data_modeling.views import MappedProperty, ViewProperty
17
-
18
- from cognite_toolkit._cdf_tk.exceptions import ToolkitMissingDependencyError, ToolkitTypeError, ToolkitValueError
19
- from cognite_toolkit._cdf_tk.utils import humanize_collection, sanitize_filename
20
- from cognite_toolkit._cdf_tk.utils.file import yaml_safe_dump
21
-
22
- from .useful_types import JsonVal
23
-
24
- if sys.version_info >= (3, 11):
25
- from typing import Self
26
- else:
27
- from typing_extensions import Self
28
-
29
- if TYPE_CHECKING:
30
- import pyarrow as pa
31
- import pyarrow.parquet as pq
32
-
33
- FileFormat: TypeAlias = Literal["csv", "parquet", "yaml"]
34
- DataType: TypeAlias = Literal["string", "integer", "float", "boolean", "json", "date", "timestamp", "epoch"]
35
- PrimaryCellValue: TypeAlias = datetime | date | str | int | float | bool | JsonVal | None
36
- CellValue: TypeAlias = PrimaryCellValue | list[PrimaryCellValue]
37
- Rows: TypeAlias = list[dict[str, CellValue]]
38
-
39
-
40
- @dataclass(frozen=True)
41
- class SchemaColumn:
42
- name: str
43
- type: DataType
44
- is_array: bool = False
45
-
46
- def __post_init__(self) -> None:
47
- if self.type == "json" and self.is_array:
48
- raise ValueError("JSON columns cannot be arrays. Use 'is_array=False' for JSON columns.")
49
-
50
-
51
- class SchemaColumnList(list, Sequence[SchemaColumn]):
52
- # Implemented to get correct type hints
53
- def __init__(self, collection: Collection[SchemaColumn] | None = None) -> None:
54
- super().__init__(collection or [])
55
-
56
- def __iter__(self) -> Iterator[SchemaColumn]:
57
- return super().__iter__()
58
-
59
- @overload
60
- def __getitem__(self, index: SupportsIndex) -> SchemaColumn: ...
61
-
62
- @overload
63
- def __getitem__(self, index: slice) -> Self: ...
64
-
65
- def __getitem__(self, index: SupportsIndex | slice, /) -> SchemaColumn | Self:
66
- if isinstance(index, slice):
67
- return type(self)(super().__getitem__(index))
68
- return super().__getitem__(index)
69
-
70
- @classmethod
71
- def create_from_view_properties(cls, properties: Mapping[str, ViewProperty], support_edges: bool = False) -> Self:
72
- """Create a SchemaColumnList from a mapping of ViewProperty objects.
73
-
74
- Args:
75
- properties (Mapping[str, ViewProperty]): A mapping of property names to ViewProperty objects.
76
- support_edges (bool): Whether the the view supports edges. If True, the schema will include
77
- startNode and endNode columns.
78
-
79
- Returns:
80
- SchemaColumnList: A list of SchemaColumn objects representing the properties.
81
- """
82
- columns = [
83
- SchemaColumn("space", "string", is_array=False),
84
- SchemaColumn("externalId", "string", is_array=False),
85
- SchemaColumn("instanceType", "string"),
86
- SchemaColumn("existingVersion", "integer", is_array=False),
87
- SchemaColumn("type", "json", is_array=False),
88
- ]
89
- if support_edges:
90
- columns.append(SchemaColumn("startNode", "json", is_array=False))
91
- columns.append(SchemaColumn("endNode", "json", is_array=False))
92
- for name, prop in properties.items():
93
- if not isinstance(prop, MappedProperty):
94
- # We skip all properties that does not reside in a container.
95
- continue
96
- schema_type = cls._dms_to_schema_type(prop.type)
97
- is_array = (
98
- isinstance(prop.type, dt.ListablePropertyType)
99
- and prop.type.is_list
100
- and schema_type != "json" # JSON is not an array type
101
- )
102
- columns.append(SchemaColumn(name=f"properties.{name}", type=schema_type, is_array=is_array))
103
- return cls(columns)
104
-
105
- @classmethod
106
- def _dms_to_schema_type(cls, model_type: dt.PropertyType) -> DataType:
107
- if isinstance(model_type, dt.Text | dt.Enum | dt.CDFExternalIdReference):
108
- return "string"
109
- elif isinstance(model_type, dt.Boolean):
110
- return "boolean"
111
- elif isinstance(model_type, dt.Json | dt.DirectRelation):
112
- return "json"
113
- elif isinstance(model_type, dt.Int32 | dt.Int64):
114
- return "integer"
115
- elif isinstance(model_type, dt.Float32 | dt.Float64):
116
- return "float"
117
- elif isinstance(model_type, dt.Timestamp):
118
- return "timestamp"
119
- elif isinstance(model_type, dt.Date):
120
- return "date"
121
- else:
122
- raise ToolkitTypeError(
123
- f"Failed convertion from data modeling type to Table Schema. Unknown type: {type(model_type)!r}."
124
- )
125
-
126
-
127
- @dataclass
128
- class Schema:
129
- display_name: str
130
- folder_name: str
131
- kind: str
132
- format_: FileFormat
133
- columns: SchemaColumnList
134
-
135
-
136
- T_IO = TypeVar("T_IO", bound=IO)
137
-
138
-
139
- class TableFileWriter(Generic[T_IO]):
140
- encoding = "utf-8"
141
- newline = "\n"
142
- format: ClassVar[FileFormat]
143
-
144
- def __init__(self, schema: Schema, output_dir: Path, max_file_size_bytes: int = 128 * 1024 * 1024) -> None:
145
- self.max_file_size_bytes = max_file_size_bytes
146
- self.schema = schema
147
- self.output_dir = output_dir
148
- self._file_count = 1
149
- self._writer_by_filepath: dict[Path, T_IO] = {}
150
-
151
- def write_rows(self, rows_group_list: list[tuple[str, Rows]]) -> None:
152
- """Write rows to a file."""
153
- for group, group_rows in rows_group_list:
154
- if not group_rows:
155
- continue
156
- writer = self._get_writer(group)
157
- self._write_rows(writer, group_rows)
158
-
159
- @abstractmethod
160
- def _write_rows(self, writer: T_IO, rows: Rows) -> None:
161
- raise NotImplementedError()
162
-
163
- @abstractmethod
164
- def _create_writer(self, filepath: Path) -> T_IO:
165
- """Create a writer for the given file path."""
166
- raise NotImplementedError("This method should be implemented in subclasses.")
167
-
168
- @abstractmethod
169
- def _is_above_file_size_limit(self, filepath: Path, writer: T_IO) -> bool:
170
- """Check if the file size is above the limit."""
171
- raise NotImplementedError("This method should be implemented in subclasses.")
172
-
173
- def __enter__(self) -> "TableFileWriter":
174
- self._file_count = 1
175
- return self
176
-
177
- def __exit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: Any | None) -> None:
178
- for writer in self._writer_by_filepath.values():
179
- writer.close()
180
- self._writer_by_filepath.clear()
181
- return None
182
-
183
- def _get_writer(self, group: str) -> T_IO:
184
- clean_name = f"{sanitize_filename(group)}-" if group else ""
185
- file_path = (
186
- self.output_dir
187
- / self.schema.folder_name
188
- / f"{clean_name}part-{self._file_count:04}.{self.schema.kind}.{self.format}"
189
- )
190
- file_path.parent.mkdir(parents=True, exist_ok=True)
191
- if file_path not in self._writer_by_filepath:
192
- self._writer_by_filepath[file_path] = self._create_writer(file_path)
193
- elif self._is_above_file_size_limit(file_path, self._writer_by_filepath[file_path]):
194
- self._writer_by_filepath[file_path].close()
195
- del self._writer_by_filepath[file_path]
196
- self._file_count += 1
197
- return self._get_writer(group)
198
- return self._writer_by_filepath[file_path]
199
-
200
- @classmethod
201
- def get_write_cls(cls, format_: FileFormat) -> "type[TableFileWriter]":
202
- """Get the writer class for the given format."""
203
- write_cls = _TABLEWRITER_CLASS_BY_FORMAT.get(format_)
204
- if write_cls is None:
205
- raise ToolkitValueError(
206
- f"Unsupported format {format_}. Supported formats are {humanize_collection(_TABLEWRITER_CLASS_BY_FORMAT.keys())}."
207
- )
208
- return write_cls
209
-
210
-
211
- class ParquetWriter(TableFileWriter["pq.ParquetWriter"]):
212
- """Parquet writer for CDF Toolkit.
213
-
214
- Caveat: This mutates the rows to convert JSON, timestamp, and date columns to appropriate formats.
215
- This is necessary because pyarrow does not support JSON, timestamp, and date types directly in the way we need.
216
- We avoid making a copy of each row for performance reasons, but this means that the rows passed to this writer
217
- will be modified in place.
218
- """
219
-
220
- format = "parquet"
221
-
222
- def __init__(self, schema: Schema, output_dir: Path, max_file_size_bytes: int = 128 * 1024 * 1024) -> None:
223
- super().__init__(schema, output_dir, max_file_size_bytes)
224
- self._check_pyarrow_dependency()
225
-
226
- def _create_writer(self, filepath: Path) -> "pq.ParquetWriter":
227
- import pyarrow.parquet as pq
228
-
229
- schema = self._create_schema()
230
- return pq.ParquetWriter(filepath, schema)
231
-
232
- def _write_rows(self, writer: "pq.ParquetWriter", rows: Rows) -> None:
233
- import pyarrow as pa
234
-
235
- if json_columns := self._json_columns():
236
- for row in rows:
237
- json_values = set(row.keys()) & json_columns
238
- for col in json_values:
239
- row[col] = json.dumps(row[col])
240
- if timestamp_columns := self._timestamp_columns():
241
- for row in rows:
242
- for col in set(row.keys()) & timestamp_columns:
243
- cell_value = row[col]
244
- if isinstance(cell_value, list):
245
- # MyPy does not understand that a list of PrimaryCellValue is valid here
246
- # It expects a union of PrimaryCellValue and list[PrimaryCellValue].
247
- row[col] = [self._to_datetime(value) for value in cell_value] # type: ignore[assignment]
248
- else:
249
- row[col] = self._to_datetime(cell_value)
250
- if date_columns := self._date_columns():
251
- for row in rows:
252
- for col in set(row.keys()) & date_columns:
253
- cell_value = row[col]
254
- if isinstance(cell_value, list):
255
- # MyPy does not understand that a list of PrimaryCellValue is valid here.
256
- # It expects a union of PrimaryCellValue and list[PrimaryCellValue].
257
- row[col] = [self._to_date(value) for value in cell_value] # type: ignore[assignment]
258
- else:
259
- row[col] = self._to_date(cell_value)
260
-
261
- table = pa.Table.from_pylist(rows, schema=self._create_schema())
262
- writer.write_table(table)
263
-
264
- def _is_above_file_size_limit(self, filepath: Path, writer: "pq.ParquetWriter") -> bool:
265
- return filepath.exists() and filepath.stat().st_size > self.max_file_size_bytes
266
-
267
- @lru_cache(maxsize=1)
268
- def _json_columns(self) -> set[str]:
269
- """Check if the writer supports JSON format."""
270
- return {col.name for col in self.schema.columns if col.type == "json"}
271
-
272
- @lru_cache(maxsize=1)
273
- def _timestamp_columns(self) -> set[str]:
274
- """Check if the writer supports timestamp format."""
275
- return {col.name for col in self.schema.columns if col.type == "timestamp"}
276
-
277
- @lru_cache(maxsize=1)
278
- def _date_columns(self) -> set[str]:
279
- return {col.name for col in self.schema.columns if col.type == "date"}
280
-
281
- @classmethod
282
- def _to_datetime(cls, value: CellValue) -> CellValue:
283
- if isinstance(value, datetime) or value is None:
284
- output = value
285
- elif isinstance(value, date):
286
- output = datetime.combine(value, datetime.min.time())
287
- elif isinstance(value, int | float):
288
- # Assuming the value is a timestamp in milliseconds
289
- output = datetime.fromtimestamp(value / 1000.0)
290
- elif isinstance(value, str):
291
- output = cls._convert_data_modelling_timestamp(value)
292
- else:
293
- raise ToolkitTypeError(
294
- f"Unsupported value type for datetime conversion: {type(value)}. Expected datetime, date, int, float, or str."
295
- )
296
- if output is not None and output.tzinfo is None:
297
- # Ensure the datetime is in UTC
298
- output = output.replace(tzinfo=timezone.utc)
299
- elif output is not None and output.tzinfo is not None:
300
- # Convert to UTC if it has a timezone
301
- output = output.astimezone(timezone.utc)
302
- return output
303
-
304
- @classmethod
305
- def _to_date(cls, value: CellValue) -> CellValue:
306
- if isinstance(value, date) or value is None:
307
- return value
308
- elif isinstance(value, datetime):
309
- return value.date()
310
- elif isinstance(value, int | float):
311
- # Assuming the value is a timestamp in milliseconds
312
- return date.fromtimestamp(value / 1000.0)
313
- elif isinstance(value, str):
314
- return cls._convert_data_modelling_timestamp(value).date()
315
- else:
316
- raise ToolkitTypeError(
317
- f"Unsupported value type for date conversion: {type(value)}. Expected date, datetime, int, float, or str."
318
- )
319
-
320
- @classmethod
321
- def _convert_data_modelling_timestamp(cls, timestamp: str) -> datetime:
322
- """Convert a timestamp string from the data modeling format to a datetime object."""
323
- try:
324
- return datetime.fromisoformat(timestamp)
325
- except ValueError:
326
- # Typically hits if the timestamp has truncated milliseconds,
327
- # For example, "2021-01-01T00:00:00.17+00:00".
328
- # In Python 3.10, the strptime requires exact formats so we need both formats below.
329
- # In Python 3.11-13, if the timestamp matches on the second it will match on the first,
330
- # so when we set lower bound to 3.11 the loop will not be needed.
331
- for format_ in ["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"]:
332
- try:
333
- return datetime.strptime(timestamp, format_)
334
- except ValueError:
335
- continue
336
- raise ValueError(
337
- f"Invalid timestamp format: {timestamp}. Expected ISO 8601 format with optional milliseconds and timezone."
338
- )
339
-
340
- @lru_cache(maxsize=1)
341
- def _create_schema(self) -> "pa.Schema":
342
- """Create a pyarrow schema from the schema definition."""
343
- self._check_pyarrow_dependency()
344
- import pyarrow as pa
345
-
346
- fields: list[pa.Field] = []
347
- for prop in self.schema.columns:
348
- pa_type = self._as_pa_type(prop.type, prop.is_array)
349
- fields.append(pa.field(prop.name, pa_type, nullable=True))
350
- return pa.schema(fields)
351
-
352
- @staticmethod
353
- def _check_pyarrow_dependency() -> None:
354
- if importlib.util.find_spec("pyarrow") is None:
355
- raise ToolkitMissingDependencyError(
356
- "Writing to parquet requires pyarrow. Install with 'pip install \"cognite-toolkit[table]\"'"
357
- )
358
-
359
- @staticmethod
360
- def _as_pa_type(type_: DataType, is_array: bool) -> "pa.DataType":
361
- """Convert a data type to a pyarrow type."""
362
- import pyarrow as pa
363
-
364
- if type_ == "string":
365
- pa_type = pa.string()
366
- elif type_ == "integer":
367
- pa_type = pa.int64()
368
- elif type_ == "float":
369
- pa_type = pa.float64()
370
- elif type_ == "boolean":
371
- pa_type = pa.bool_()
372
- elif type_ == "date":
373
- pa_type = pa.date32()
374
- elif type_ == "time":
375
- pa_type = pa.time64("ms")
376
- elif type_ == "json":
377
- pa_type = pa.string()
378
- elif type_ == "timestamp":
379
- pa_type = pa.timestamp("ms", tz="UTC")
380
- else:
381
- raise ToolkitValueError(f"Unsupported data type {type_}.")
382
-
383
- if is_array:
384
- pa_type = pa.list_(pa_type)
385
- return pa_type
386
-
387
-
388
- class CSVWriter(TableFileWriter[TextIOWrapper]):
389
- format = "csv"
390
-
391
- def _create_writer(self, filepath: Path) -> TextIOWrapper:
392
- stream = filepath.open("a", encoding=self.encoding, newline=self.newline)
393
- writer = self._create_dict_writer(stream)
394
- if filepath.stat().st_size == 0:
395
- writer.writeheader()
396
- return stream
397
-
398
- def _is_above_file_size_limit(self, filepath: Path, writer: TextIOWrapper) -> bool:
399
- current_position = writer.tell()
400
- writer.seek(0, 2)
401
- if writer.tell() > self.max_file_size_bytes:
402
- return True
403
- writer.seek(current_position)
404
- return False
405
-
406
- def _write_rows(self, writer: TextIOWrapper, rows: Rows) -> None:
407
- dict_writer = self._create_dict_writer(writer)
408
- dict_writer.writerows(rows)
409
-
410
- def _create_dict_writer(self, writer: TextIOWrapper) -> csv.DictWriter:
411
- return csv.DictWriter(writer, fieldnames=[col.name for col in self.schema.columns], extrasaction="ignore")
412
-
413
-
414
- class YAMLWriter(TableFileWriter[TextIOWrapper]):
415
- format = "yaml"
416
-
417
- def _create_writer(self, filepath: Path) -> TextIOWrapper:
418
- return filepath.open("a", encoding=self.encoding, newline=self.newline)
419
-
420
- def _is_above_file_size_limit(self, filepath: Path, writer: TextIOWrapper) -> bool:
421
- current_position = writer.tell()
422
- writer.seek(0, 2)
423
- if writer.tell() > self.max_file_size_bytes:
424
- return True
425
- writer.seek(current_position)
426
- return False
427
-
428
- def _write_rows(self, writer: TextIOWrapper, rows: Rows) -> None:
429
- writer.write(yaml_safe_dump(rows))
430
-
431
-
432
- _TABLEWRITER_CLASS_BY_FORMAT: MappingProxyType[str, type[TableFileWriter]] = MappingProxyType(
433
- {w.format: w for w in TableFileWriter.__subclasses__()} # type: ignore[type-abstract]
434
- )
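The removed `table_writers.py` also carried the tolerant timestamp parsing used when preparing data-modeling values for Parquet. A standalone sketch of the same fallback strategy as `ParquetWriter._convert_data_modelling_timestamp` and `_to_datetime` (`fromisoformat` first, then explicit `strptime` formats for truncated milliseconds on Python 3.10, then UTC normalization):

```python
from datetime import datetime, timezone


def parse_dm_timestamp(value: str) -> datetime:
    """Parse an ISO 8601 timestamp, tolerating truncated milliseconds (sketch of the removed helper)."""
    try:
        parsed = datetime.fromisoformat(value)
    except ValueError:
        parsed = None
        # Python 3.10's fromisoformat rejects short fractions such as ".17".
        for fmt in ("%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"):
            try:
                parsed = datetime.strptime(value, fmt)
                break
            except ValueError:
                continue
        if parsed is None:
            raise ValueError(f"Invalid timestamp format: {value}")
    # Normalize to UTC, mirroring the removed _to_datetime.
    return parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed.astimezone(timezone.utc)


print(parse_dm_timestamp("2021-01-01T00:00:00.17+00:00"))
```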