cognite-toolkit 0.6.97__py3-none-any.whl → 0.7.39__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (198)
  1. cognite_toolkit/_cdf.py +21 -23
  2. cognite_toolkit/_cdf_tk/apps/__init__.py +4 -0
  3. cognite_toolkit/_cdf_tk/apps/_core_app.py +19 -5
  4. cognite_toolkit/_cdf_tk/apps/_data_app.py +1 -1
  5. cognite_toolkit/_cdf_tk/apps/_dev_app.py +86 -0
  6. cognite_toolkit/_cdf_tk/apps/_download_app.py +693 -25
  7. cognite_toolkit/_cdf_tk/apps/_dump_app.py +44 -102
  8. cognite_toolkit/_cdf_tk/apps/_import_app.py +41 -0
  9. cognite_toolkit/_cdf_tk/apps/_landing_app.py +18 -4
  10. cognite_toolkit/_cdf_tk/apps/_migrate_app.py +424 -9
  11. cognite_toolkit/_cdf_tk/apps/_modules_app.py +0 -3
  12. cognite_toolkit/_cdf_tk/apps/_purge.py +15 -43
  13. cognite_toolkit/_cdf_tk/apps/_run.py +11 -0
  14. cognite_toolkit/_cdf_tk/apps/_upload_app.py +45 -6
  15. cognite_toolkit/_cdf_tk/builders/__init__.py +2 -2
  16. cognite_toolkit/_cdf_tk/builders/_base.py +28 -42
  17. cognite_toolkit/_cdf_tk/builders/_raw.py +1 -1
  18. cognite_toolkit/_cdf_tk/cdf_toml.py +20 -1
  19. cognite_toolkit/_cdf_tk/client/_toolkit_client.py +32 -12
  20. cognite_toolkit/_cdf_tk/client/api/infield.py +114 -17
  21. cognite_toolkit/_cdf_tk/client/api/{canvas.py → legacy/canvas.py} +15 -7
  22. cognite_toolkit/_cdf_tk/client/api/{charts.py → legacy/charts.py} +1 -1
  23. cognite_toolkit/_cdf_tk/client/api/{extended_data_modeling.py → legacy/extended_data_modeling.py} +1 -1
  24. cognite_toolkit/_cdf_tk/client/api/{extended_files.py → legacy/extended_files.py} +2 -2
  25. cognite_toolkit/_cdf_tk/client/api/{extended_functions.py → legacy/extended_functions.py} +15 -18
  26. cognite_toolkit/_cdf_tk/client/api/{extended_raw.py → legacy/extended_raw.py} +1 -1
  27. cognite_toolkit/_cdf_tk/client/api/{extended_timeseries.py → legacy/extended_timeseries.py} +5 -2
  28. cognite_toolkit/_cdf_tk/client/api/{location_filters.py → legacy/location_filters.py} +1 -1
  29. cognite_toolkit/_cdf_tk/client/api/legacy/robotics/__init__.py +8 -0
  30. cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/capabilities.py +1 -1
  31. cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/data_postprocessing.py +1 -1
  32. cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/frames.py +1 -1
  33. cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/locations.py +1 -1
  34. cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/maps.py +1 -1
  35. cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/robots.py +2 -2
  36. cognite_toolkit/_cdf_tk/client/api/{search_config.py → legacy/search_config.py} +5 -1
  37. cognite_toolkit/_cdf_tk/client/api/migration.py +177 -4
  38. cognite_toolkit/_cdf_tk/client/api/project.py +9 -8
  39. cognite_toolkit/_cdf_tk/client/api/search.py +2 -2
  40. cognite_toolkit/_cdf_tk/client/api/streams.py +88 -0
  41. cognite_toolkit/_cdf_tk/client/api/three_d.py +384 -0
  42. cognite_toolkit/_cdf_tk/client/data_classes/api_classes.py +13 -0
  43. cognite_toolkit/_cdf_tk/client/data_classes/base.py +37 -33
  44. cognite_toolkit/_cdf_tk/client/data_classes/charts_data.py +95 -213
  45. cognite_toolkit/_cdf_tk/client/data_classes/infield.py +32 -18
  46. cognite_toolkit/_cdf_tk/client/data_classes/instance_api.py +18 -13
  47. cognite_toolkit/_cdf_tk/client/data_classes/legacy/__init__.py +0 -0
  48. cognite_toolkit/_cdf_tk/client/data_classes/{canvas.py → legacy/canvas.py} +47 -4
  49. cognite_toolkit/_cdf_tk/client/data_classes/{charts.py → legacy/charts.py} +3 -3
  50. cognite_toolkit/_cdf_tk/client/data_classes/{migration.py → legacy/migration.py} +10 -2
  51. cognite_toolkit/_cdf_tk/client/data_classes/streams.py +90 -0
  52. cognite_toolkit/_cdf_tk/client/data_classes/three_d.py +112 -0
  53. cognite_toolkit/_cdf_tk/client/testing.py +42 -18
  54. cognite_toolkit/_cdf_tk/commands/__init__.py +7 -6
  55. cognite_toolkit/_cdf_tk/commands/_changes.py +3 -42
  56. cognite_toolkit/_cdf_tk/commands/_download.py +21 -11
  57. cognite_toolkit/_cdf_tk/commands/_migrate/__init__.py +0 -2
  58. cognite_toolkit/_cdf_tk/commands/_migrate/command.py +22 -20
  59. cognite_toolkit/_cdf_tk/commands/_migrate/conversion.py +140 -92
  60. cognite_toolkit/_cdf_tk/commands/_migrate/creators.py +1 -1
  61. cognite_toolkit/_cdf_tk/commands/_migrate/data_classes.py +108 -26
  62. cognite_toolkit/_cdf_tk/commands/_migrate/data_mapper.py +448 -45
  63. cognite_toolkit/_cdf_tk/commands/_migrate/data_model.py +1 -0
  64. cognite_toolkit/_cdf_tk/commands/_migrate/default_mappings.py +6 -6
  65. cognite_toolkit/_cdf_tk/commands/_migrate/issues.py +52 -1
  66. cognite_toolkit/_cdf_tk/commands/_migrate/migration_io.py +377 -11
  67. cognite_toolkit/_cdf_tk/commands/_migrate/selectors.py +9 -4
  68. cognite_toolkit/_cdf_tk/commands/_profile.py +1 -1
  69. cognite_toolkit/_cdf_tk/commands/_purge.py +36 -39
  70. cognite_toolkit/_cdf_tk/commands/_questionary_style.py +16 -0
  71. cognite_toolkit/_cdf_tk/commands/_upload.py +109 -86
  72. cognite_toolkit/_cdf_tk/commands/about.py +221 -0
  73. cognite_toolkit/_cdf_tk/commands/auth.py +19 -12
  74. cognite_toolkit/_cdf_tk/commands/build_cmd.py +16 -62
  75. cognite_toolkit/_cdf_tk/commands/build_v2/__init__.py +0 -0
  76. cognite_toolkit/_cdf_tk/commands/build_v2/build_cmd.py +241 -0
  77. cognite_toolkit/_cdf_tk/commands/build_v2/build_input.py +85 -0
  78. cognite_toolkit/_cdf_tk/commands/build_v2/build_issues.py +27 -0
  79. cognite_toolkit/_cdf_tk/commands/clean.py +63 -16
  80. cognite_toolkit/_cdf_tk/commands/deploy.py +20 -17
  81. cognite_toolkit/_cdf_tk/commands/dump_resource.py +10 -8
  82. cognite_toolkit/_cdf_tk/commands/init.py +225 -3
  83. cognite_toolkit/_cdf_tk/commands/modules.py +20 -44
  84. cognite_toolkit/_cdf_tk/commands/pull.py +6 -19
  85. cognite_toolkit/_cdf_tk/commands/resources.py +179 -0
  86. cognite_toolkit/_cdf_tk/commands/run.py +1 -1
  87. cognite_toolkit/_cdf_tk/constants.py +20 -1
  88. cognite_toolkit/_cdf_tk/cruds/__init__.py +19 -5
  89. cognite_toolkit/_cdf_tk/cruds/_base_cruds.py +14 -70
  90. cognite_toolkit/_cdf_tk/cruds/_data_cruds.py +10 -19
  91. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/__init__.py +4 -1
  92. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/agent.py +11 -9
  93. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/auth.py +5 -15
  94. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/classic.py +45 -44
  95. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/configuration.py +5 -12
  96. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/data_organization.py +4 -13
  97. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/datamodel.py +206 -67
  98. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/extraction_pipeline.py +6 -18
  99. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/fieldops.py +126 -35
  100. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/file.py +7 -28
  101. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/function.py +23 -30
  102. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/hosted_extractors.py +12 -30
  103. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/industrial_tool.py +4 -8
  104. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/location.py +4 -16
  105. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/migration.py +5 -13
  106. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/raw.py +5 -11
  107. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/relationship.py +3 -8
  108. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/robotics.py +16 -45
  109. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/streams.py +94 -0
  110. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/three_d_model.py +3 -7
  111. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/timeseries.py +5 -15
  112. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/transformation.py +75 -32
  113. cognite_toolkit/_cdf_tk/cruds/_resource_cruds/workflow.py +20 -40
  114. cognite_toolkit/_cdf_tk/cruds/_worker.py +24 -36
  115. cognite_toolkit/_cdf_tk/data_classes/_module_toml.py +1 -0
  116. cognite_toolkit/_cdf_tk/feature_flags.py +16 -36
  117. cognite_toolkit/_cdf_tk/plugins.py +2 -1
  118. cognite_toolkit/_cdf_tk/resource_classes/__init__.py +4 -0
  119. cognite_toolkit/_cdf_tk/resource_classes/capabilities.py +12 -0
  120. cognite_toolkit/_cdf_tk/resource_classes/functions.py +3 -1
  121. cognite_toolkit/_cdf_tk/resource_classes/infield_cdm_location_config.py +109 -0
  122. cognite_toolkit/_cdf_tk/resource_classes/migration.py +8 -17
  123. cognite_toolkit/_cdf_tk/resource_classes/search_config.py +1 -1
  124. cognite_toolkit/_cdf_tk/resource_classes/streams.py +29 -0
  125. cognite_toolkit/_cdf_tk/resource_classes/workflow_version.py +164 -5
  126. cognite_toolkit/_cdf_tk/storageio/__init__.py +9 -21
  127. cognite_toolkit/_cdf_tk/storageio/_annotations.py +19 -16
  128. cognite_toolkit/_cdf_tk/storageio/_applications.py +340 -28
  129. cognite_toolkit/_cdf_tk/storageio/_asset_centric.py +67 -104
  130. cognite_toolkit/_cdf_tk/storageio/_base.py +61 -29
  131. cognite_toolkit/_cdf_tk/storageio/_datapoints.py +276 -20
  132. cognite_toolkit/_cdf_tk/storageio/_file_content.py +435 -0
  133. cognite_toolkit/_cdf_tk/storageio/_instances.py +35 -3
  134. cognite_toolkit/_cdf_tk/storageio/_raw.py +26 -0
  135. cognite_toolkit/_cdf_tk/storageio/selectors/__init__.py +71 -4
  136. cognite_toolkit/_cdf_tk/storageio/selectors/_base.py +14 -2
  137. cognite_toolkit/_cdf_tk/storageio/selectors/_canvas.py +14 -0
  138. cognite_toolkit/_cdf_tk/storageio/selectors/_charts.py +14 -0
  139. cognite_toolkit/_cdf_tk/storageio/selectors/_datapoints.py +23 -3
  140. cognite_toolkit/_cdf_tk/storageio/selectors/_file_content.py +164 -0
  141. cognite_toolkit/_cdf_tk/storageio/selectors/_three_d.py +34 -0
  142. cognite_toolkit/_cdf_tk/tk_warnings/other.py +4 -0
  143. cognite_toolkit/_cdf_tk/tracker.py +2 -2
  144. cognite_toolkit/_cdf_tk/utils/cdf.py +1 -1
  145. cognite_toolkit/_cdf_tk/utils/dtype_conversion.py +9 -3
  146. cognite_toolkit/_cdf_tk/utils/fileio/__init__.py +2 -0
  147. cognite_toolkit/_cdf_tk/utils/fileio/_base.py +5 -1
  148. cognite_toolkit/_cdf_tk/utils/fileio/_readers.py +112 -20
  149. cognite_toolkit/_cdf_tk/utils/fileio/_writers.py +15 -15
  150. cognite_toolkit/_cdf_tk/utils/http_client/__init__.py +28 -0
  151. cognite_toolkit/_cdf_tk/utils/http_client/_client.py +285 -18
  152. cognite_toolkit/_cdf_tk/utils/http_client/_data_classes.py +56 -4
  153. cognite_toolkit/_cdf_tk/utils/http_client/_data_classes2.py +247 -0
  154. cognite_toolkit/_cdf_tk/utils/http_client/_tracker.py +5 -2
  155. cognite_toolkit/_cdf_tk/utils/interactive_select.py +60 -18
  156. cognite_toolkit/_cdf_tk/utils/sql_parser.py +2 -3
  157. cognite_toolkit/_cdf_tk/utils/useful_types.py +6 -2
  158. cognite_toolkit/_cdf_tk/validation.py +83 -1
  159. cognite_toolkit/_repo_files/GitHub/.github/workflows/deploy.yaml +1 -1
  160. cognite_toolkit/_repo_files/GitHub/.github/workflows/dry-run.yaml +1 -1
  161. cognite_toolkit/_resources/cdf.toml +5 -4
  162. cognite_toolkit/_version.py +1 -1
  163. cognite_toolkit/config.dev.yaml +13 -0
  164. {cognite_toolkit-0.6.97.dist-info → cognite_toolkit-0.7.39.dist-info}/METADATA +24 -24
  165. cognite_toolkit-0.7.39.dist-info/RECORD +322 -0
  166. cognite_toolkit-0.7.39.dist-info/WHEEL +4 -0
  167. {cognite_toolkit-0.6.97.dist-info → cognite_toolkit-0.7.39.dist-info}/entry_points.txt +1 -0
  168. cognite_toolkit/_cdf_tk/client/api/robotics/__init__.py +0 -3
  169. cognite_toolkit/_cdf_tk/commands/_migrate/canvas.py +0 -201
  170. cognite_toolkit/_cdf_tk/commands/dump_data.py +0 -489
  171. cognite_toolkit/_cdf_tk/commands/featureflag.py +0 -27
  172. cognite_toolkit/_cdf_tk/prototypes/import_app.py +0 -41
  173. cognite_toolkit/_cdf_tk/utils/table_writers.py +0 -434
  174. cognite_toolkit-0.6.97.dist-info/RECORD +0 -306
  175. cognite_toolkit-0.6.97.dist-info/WHEEL +0 -4
  176. cognite_toolkit-0.6.97.dist-info/licenses/LICENSE +0 -18
  177. /cognite_toolkit/_cdf_tk/{prototypes/commands → client/api/legacy}/__init__.py +0 -0
  178. /cognite_toolkit/_cdf_tk/client/api/{dml.py → legacy/dml.py} +0 -0
  179. /cognite_toolkit/_cdf_tk/client/api/{fixed_transformations.py → legacy/fixed_transformations.py} +0 -0
  180. /cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/api.py +0 -0
  181. /cognite_toolkit/_cdf_tk/client/api/{robotics → legacy/robotics}/utlis.py +0 -0
  182. /cognite_toolkit/_cdf_tk/client/data_classes/{apm_config_v1.py → legacy/apm_config_v1.py} +0 -0
  183. /cognite_toolkit/_cdf_tk/client/data_classes/{extendable_cognite_file.py → legacy/extendable_cognite_file.py} +0 -0
  184. /cognite_toolkit/_cdf_tk/client/data_classes/{extended_filemetadata.py → legacy/extended_filemetadata.py} +0 -0
  185. /cognite_toolkit/_cdf_tk/client/data_classes/{extended_filemetdata.py → legacy/extended_filemetdata.py} +0 -0
  186. /cognite_toolkit/_cdf_tk/client/data_classes/{extended_timeseries.py → legacy/extended_timeseries.py} +0 -0
  187. /cognite_toolkit/_cdf_tk/client/data_classes/{functions.py → legacy/functions.py} +0 -0
  188. /cognite_toolkit/_cdf_tk/client/data_classes/{graphql_data_models.py → legacy/graphql_data_models.py} +0 -0
  189. /cognite_toolkit/_cdf_tk/client/data_classes/{instances.py → legacy/instances.py} +0 -0
  190. /cognite_toolkit/_cdf_tk/client/data_classes/{location_filters.py → legacy/location_filters.py} +0 -0
  191. /cognite_toolkit/_cdf_tk/client/data_classes/{pending_instances_ids.py → legacy/pending_instances_ids.py} +0 -0
  192. /cognite_toolkit/_cdf_tk/client/data_classes/{project.py → legacy/project.py} +0 -0
  193. /cognite_toolkit/_cdf_tk/client/data_classes/{raw.py → legacy/raw.py} +0 -0
  194. /cognite_toolkit/_cdf_tk/client/data_classes/{robotics.py → legacy/robotics.py} +0 -0
  195. /cognite_toolkit/_cdf_tk/client/data_classes/{search_config.py → legacy/search_config.py} +0 -0
  196. /cognite_toolkit/_cdf_tk/client/data_classes/{sequences.py → legacy/sequences.py} +0 -0
  197. /cognite_toolkit/_cdf_tk/client/data_classes/{streamlit_.py → legacy/streamlit_.py} +0 -0
  198. /cognite_toolkit/_cdf_tk/{prototypes/commands/import_.py → commands/_import_cmd.py} +0 -0
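Most of the moves in this list relocate client API modules and data classes under new legacy/ subpackages (entries 21-36, 48-50, and the pure renames in 177-197). For any code reaching into these internal modules by their old paths, the renames imply an import-path update along these lines; this is a hedged sketch inferred only from the renamed paths above, and it assumes the modules keep their public contents after the move:

# Hedged sketch: adjusting to the legacy/ relocation, e.g. entry 21
# (client/api/canvas.py -> client/api/legacy/canvas.py).
try:
    # 0.7.x layout: the module now lives under client/api/legacy/
    from cognite_toolkit._cdf_tk.client.api.legacy import canvas
except ImportError:
    # 0.6.x layout
    from cognite_toolkit._cdf_tk.client.api import canvas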
cognite_toolkit/_cdf_tk/utils/table_writers.py +0 -434
@@ -1,434 +0,0 @@
- import csv
- import importlib.util
- import json
- import sys
- from abc import abstractmethod
- from collections.abc import Collection, Iterator, Mapping, Sequence
- from dataclasses import dataclass
- from datetime import date, datetime, timezone
- from functools import lru_cache
- from io import TextIOWrapper
- from pathlib import Path
- from types import MappingProxyType
- from typing import IO, TYPE_CHECKING, Any, ClassVar, Generic, Literal, SupportsIndex, TypeAlias, TypeVar, overload
-
- from cognite.client.data_classes.data_modeling import data_types as dt
- from cognite.client.data_classes.data_modeling.views import MappedProperty, ViewProperty
-
- from cognite_toolkit._cdf_tk.exceptions import ToolkitMissingDependencyError, ToolkitTypeError, ToolkitValueError
- from cognite_toolkit._cdf_tk.utils import humanize_collection, sanitize_filename
- from cognite_toolkit._cdf_tk.utils.file import yaml_safe_dump
-
- from .useful_types import JsonVal
-
- if sys.version_info >= (3, 11):
-     from typing import Self
- else:
-     from typing_extensions import Self
-
- if TYPE_CHECKING:
-     import pyarrow as pa
-     import pyarrow.parquet as pq
-
- FileFormat: TypeAlias = Literal["csv", "parquet", "yaml"]
- DataType: TypeAlias = Literal["string", "integer", "float", "boolean", "json", "date", "timestamp", "epoch"]
- PrimaryCellValue: TypeAlias = datetime | date | str | int | float | bool | JsonVal | None
- CellValue: TypeAlias = PrimaryCellValue | list[PrimaryCellValue]
- Rows: TypeAlias = list[dict[str, CellValue]]
-
-
- @dataclass(frozen=True)
- class SchemaColumn:
-     name: str
-     type: DataType
-     is_array: bool = False
-
-     def __post_init__(self) -> None:
-         if self.type == "json" and self.is_array:
-             raise ValueError("JSON columns cannot be arrays. Use 'is_array=False' for JSON columns.")
-
-
- class SchemaColumnList(list, Sequence[SchemaColumn]):
-     # Implemented to get correct type hints
-     def __init__(self, collection: Collection[SchemaColumn] | None = None) -> None:
-         super().__init__(collection or [])
-
-     def __iter__(self) -> Iterator[SchemaColumn]:
-         return super().__iter__()
-
-     @overload
-     def __getitem__(self, index: SupportsIndex) -> SchemaColumn: ...
-
-     @overload
-     def __getitem__(self, index: slice) -> Self: ...
-
-     def __getitem__(self, index: SupportsIndex | slice, /) -> SchemaColumn | Self:
-         if isinstance(index, slice):
-             return type(self)(super().__getitem__(index))
-         return super().__getitem__(index)
-
-     @classmethod
-     def create_from_view_properties(cls, properties: Mapping[str, ViewProperty], support_edges: bool = False) -> Self:
-         """Create a SchemaColumnList from a mapping of ViewProperty objects.
-
-         Args:
-             properties (Mapping[str, ViewProperty]): A mapping of property names to ViewProperty objects.
-             support_edges (bool): Whether the view supports edges. If True, the schema will include
-                 startNode and endNode columns.
-
-         Returns:
-             SchemaColumnList: A list of SchemaColumn objects representing the properties.
-         """
-         columns = [
-             SchemaColumn("space", "string", is_array=False),
-             SchemaColumn("externalId", "string", is_array=False),
-             SchemaColumn("instanceType", "string"),
-             SchemaColumn("existingVersion", "integer", is_array=False),
-             SchemaColumn("type", "json", is_array=False),
-         ]
-         if support_edges:
-             columns.append(SchemaColumn("startNode", "json", is_array=False))
-             columns.append(SchemaColumn("endNode", "json", is_array=False))
-         for name, prop in properties.items():
-             if not isinstance(prop, MappedProperty):
-                 # We skip all properties that do not reside in a container.
-                 continue
-             schema_type = cls._dms_to_schema_type(prop.type)
-             is_array = (
-                 isinstance(prop.type, dt.ListablePropertyType)
-                 and prop.type.is_list
-                 and schema_type != "json"  # JSON is not an array type
-             )
-             columns.append(SchemaColumn(name=f"properties.{name}", type=schema_type, is_array=is_array))
-         return cls(columns)
-
-     @classmethod
-     def _dms_to_schema_type(cls, model_type: dt.PropertyType) -> DataType:
-         if isinstance(model_type, dt.Text | dt.Enum | dt.CDFExternalIdReference):
-             return "string"
-         elif isinstance(model_type, dt.Boolean):
-             return "boolean"
-         elif isinstance(model_type, dt.Json | dt.DirectRelation):
-             return "json"
-         elif isinstance(model_type, dt.Int32 | dt.Int64):
-             return "integer"
-         elif isinstance(model_type, dt.Float32 | dt.Float64):
-             return "float"
-         elif isinstance(model_type, dt.Timestamp):
-             return "timestamp"
-         elif isinstance(model_type, dt.Date):
-             return "date"
-         else:
-             raise ToolkitTypeError(
-                 f"Failed conversion from data modeling type to Table Schema. Unknown type: {type(model_type)!r}."
-             )
-
-
- @dataclass
- class Schema:
-     display_name: str
-     folder_name: str
-     kind: str
-     format_: FileFormat
-     columns: SchemaColumnList
-
-
- T_IO = TypeVar("T_IO", bound=IO)
-
-
- class TableFileWriter(Generic[T_IO]):
-     encoding = "utf-8"
-     newline = "\n"
-     format: ClassVar[FileFormat]
-
-     def __init__(self, schema: Schema, output_dir: Path, max_file_size_bytes: int = 128 * 1024 * 1024) -> None:
-         self.max_file_size_bytes = max_file_size_bytes
-         self.schema = schema
-         self.output_dir = output_dir
-         self._file_count = 1
-         self._writer_by_filepath: dict[Path, T_IO] = {}
-
-     def write_rows(self, rows_group_list: list[tuple[str, Rows]]) -> None:
-         """Write rows to a file."""
-         for group, group_rows in rows_group_list:
-             if not group_rows:
-                 continue
-             writer = self._get_writer(group)
-             self._write_rows(writer, group_rows)
-
-     @abstractmethod
-     def _write_rows(self, writer: T_IO, rows: Rows) -> None:
-         raise NotImplementedError()
-
-     @abstractmethod
-     def _create_writer(self, filepath: Path) -> T_IO:
-         """Create a writer for the given file path."""
-         raise NotImplementedError("This method should be implemented in subclasses.")
-
-     @abstractmethod
-     def _is_above_file_size_limit(self, filepath: Path, writer: T_IO) -> bool:
-         """Check if the file size is above the limit."""
-         raise NotImplementedError("This method should be implemented in subclasses.")
-
-     def __enter__(self) -> "TableFileWriter":
-         self._file_count = 1
-         return self
-
-     def __exit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: Any | None) -> None:
-         for writer in self._writer_by_filepath.values():
-             writer.close()
-         self._writer_by_filepath.clear()
-         return None
-
-     def _get_writer(self, group: str) -> T_IO:
-         clean_name = f"{sanitize_filename(group)}-" if group else ""
-         file_path = (
-             self.output_dir
-             / self.schema.folder_name
-             / f"{clean_name}part-{self._file_count:04}.{self.schema.kind}.{self.format}"
-         )
-         file_path.parent.mkdir(parents=True, exist_ok=True)
-         if file_path not in self._writer_by_filepath:
-             self._writer_by_filepath[file_path] = self._create_writer(file_path)
-         elif self._is_above_file_size_limit(file_path, self._writer_by_filepath[file_path]):
-             self._writer_by_filepath[file_path].close()
-             del self._writer_by_filepath[file_path]
-             self._file_count += 1
-             return self._get_writer(group)
-         return self._writer_by_filepath[file_path]
-
-     @classmethod
-     def get_write_cls(cls, format_: FileFormat) -> "type[TableFileWriter]":
-         """Get the writer class for the given format."""
-         write_cls = _TABLEWRITER_CLASS_BY_FORMAT.get(format_)
-         if write_cls is None:
-             raise ToolkitValueError(
-                 f"Unsupported format {format_}. Supported formats are {humanize_collection(_TABLEWRITER_CLASS_BY_FORMAT.keys())}."
-             )
-         return write_cls
-
-
- class ParquetWriter(TableFileWriter["pq.ParquetWriter"]):
-     """Parquet writer for CDF Toolkit.
-
-     Caveat: This mutates the rows to convert JSON, timestamp, and date columns to appropriate formats.
-     This is necessary because pyarrow does not support JSON, timestamp, and date types directly in the way we need.
-     We avoid making a copy of each row for performance reasons, but this means that the rows passed to this writer
-     will be modified in place.
-     """
-
-     format = "parquet"
-
-     def __init__(self, schema: Schema, output_dir: Path, max_file_size_bytes: int = 128 * 1024 * 1024) -> None:
-         super().__init__(schema, output_dir, max_file_size_bytes)
-         self._check_pyarrow_dependency()
-
-     def _create_writer(self, filepath: Path) -> "pq.ParquetWriter":
-         import pyarrow.parquet as pq
-
-         schema = self._create_schema()
-         return pq.ParquetWriter(filepath, schema)
-
-     def _write_rows(self, writer: "pq.ParquetWriter", rows: Rows) -> None:
-         import pyarrow as pa
-
-         if json_columns := self._json_columns():
-             for row in rows:
-                 json_values = set(row.keys()) & json_columns
-                 for col in json_values:
-                     row[col] = json.dumps(row[col])
-         if timestamp_columns := self._timestamp_columns():
-             for row in rows:
-                 for col in set(row.keys()) & timestamp_columns:
-                     cell_value = row[col]
-                     if isinstance(cell_value, list):
-                         # MyPy does not understand that a list of PrimaryCellValue is valid here.
-                         # It expects a union of PrimaryCellValue and list[PrimaryCellValue].
-                         row[col] = [self._to_datetime(value) for value in cell_value]  # type: ignore[assignment]
-                     else:
-                         row[col] = self._to_datetime(cell_value)
-         if date_columns := self._date_columns():
-             for row in rows:
-                 for col in set(row.keys()) & date_columns:
-                     cell_value = row[col]
-                     if isinstance(cell_value, list):
-                         # MyPy does not understand that a list of PrimaryCellValue is valid here.
-                         # It expects a union of PrimaryCellValue and list[PrimaryCellValue].
-                         row[col] = [self._to_date(value) for value in cell_value]  # type: ignore[assignment]
-                     else:
-                         row[col] = self._to_date(cell_value)
-
-         table = pa.Table.from_pylist(rows, schema=self._create_schema())
-         writer.write_table(table)
-
-     def _is_above_file_size_limit(self, filepath: Path, writer: "pq.ParquetWriter") -> bool:
-         return filepath.exists() and filepath.stat().st_size > self.max_file_size_bytes
-
-     @lru_cache(maxsize=1)
-     def _json_columns(self) -> set[str]:
-         """Return the names of the JSON columns in the schema."""
-         return {col.name for col in self.schema.columns if col.type == "json"}
-
-     @lru_cache(maxsize=1)
-     def _timestamp_columns(self) -> set[str]:
-         """Return the names of the timestamp columns in the schema."""
-         return {col.name for col in self.schema.columns if col.type == "timestamp"}
-
-     @lru_cache(maxsize=1)
-     def _date_columns(self) -> set[str]:
-         return {col.name for col in self.schema.columns if col.type == "date"}
-
-     @classmethod
-     def _to_datetime(cls, value: CellValue) -> CellValue:
-         if isinstance(value, datetime) or value is None:
-             output = value
-         elif isinstance(value, date):
-             output = datetime.combine(value, datetime.min.time())
-         elif isinstance(value, int | float):
-             # Assuming the value is a timestamp in milliseconds
-             output = datetime.fromtimestamp(value / 1000.0)
-         elif isinstance(value, str):
-             output = cls._convert_data_modelling_timestamp(value)
-         else:
-             raise ToolkitTypeError(
-                 f"Unsupported value type for datetime conversion: {type(value)}. Expected datetime, date, int, float, or str."
-             )
-         if output is not None and output.tzinfo is None:
-             # Ensure the datetime is in UTC
-             output = output.replace(tzinfo=timezone.utc)
-         elif output is not None and output.tzinfo is not None:
-             # Convert to UTC if it has a timezone
-             output = output.astimezone(timezone.utc)
-         return output
-
-     @classmethod
-     def _to_date(cls, value: CellValue) -> CellValue:
-         if isinstance(value, date) or value is None:
-             return value
-         elif isinstance(value, datetime):
-             return value.date()
-         elif isinstance(value, int | float):
-             # Assuming the value is a timestamp in milliseconds
-             return date.fromtimestamp(value / 1000.0)
-         elif isinstance(value, str):
-             return cls._convert_data_modelling_timestamp(value).date()
-         else:
-             raise ToolkitTypeError(
-                 f"Unsupported value type for date conversion: {type(value)}. Expected date, datetime, int, float, or str."
-             )
-
-     @classmethod
-     def _convert_data_modelling_timestamp(cls, timestamp: str) -> datetime:
-         """Convert a timestamp string from the data modeling format to a datetime object."""
-         try:
-             return datetime.fromisoformat(timestamp)
-         except ValueError:
-             # Typically hits if the timestamp has truncated milliseconds,
-             # for example, "2021-01-01T00:00:00.17+00:00".
-             # In Python 3.10, strptime requires exact formats, so we need both formats below.
-             # In Python 3.11-13, if the timestamp matches on the second it will match on the first,
-             # so when we set the lower bound to 3.11 the loop will not be needed.
-             for format_ in ["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"]:
-                 try:
-                     return datetime.strptime(timestamp, format_)
-                 except ValueError:
-                     continue
-             raise ValueError(
-                 f"Invalid timestamp format: {timestamp}. Expected ISO 8601 format with optional milliseconds and timezone."
-             )
-
-     @lru_cache(maxsize=1)
-     def _create_schema(self) -> "pa.Schema":
-         """Create a pyarrow schema from the schema definition."""
-         self._check_pyarrow_dependency()
-         import pyarrow as pa
-
-         fields: list[pa.Field] = []
-         for prop in self.schema.columns:
-             pa_type = self._as_pa_type(prop.type, prop.is_array)
-             fields.append(pa.field(prop.name, pa_type, nullable=True))
-         return pa.schema(fields)
-
-     @staticmethod
-     def _check_pyarrow_dependency() -> None:
-         if importlib.util.find_spec("pyarrow") is None:
-             raise ToolkitMissingDependencyError(
-                 "Writing to parquet requires pyarrow. Install with 'pip install \"cognite-toolkit[table]\"'"
-             )
-
-     @staticmethod
-     def _as_pa_type(type_: DataType, is_array: bool) -> "pa.DataType":
-         """Convert a data type to a pyarrow type."""
-         import pyarrow as pa
-
-         if type_ == "string":
-             pa_type = pa.string()
-         elif type_ == "integer":
-             pa_type = pa.int64()
-         elif type_ == "float":
-             pa_type = pa.float64()
-         elif type_ == "boolean":
-             pa_type = pa.bool_()
-         elif type_ == "date":
-             pa_type = pa.date32()
-         elif type_ == "time":
-             pa_type = pa.time64("ms")
-         elif type_ == "json":
-             pa_type = pa.string()
-         elif type_ == "timestamp":
-             pa_type = pa.timestamp("ms", tz="UTC")
-         else:
-             raise ToolkitValueError(f"Unsupported data type {type_}.")
-
-         if is_array:
-             pa_type = pa.list_(pa_type)
-         return pa_type
-
-
- class CSVWriter(TableFileWriter[TextIOWrapper]):
-     format = "csv"
-
-     def _create_writer(self, filepath: Path) -> TextIOWrapper:
-         stream = filepath.open("a", encoding=self.encoding, newline=self.newline)
-         writer = self._create_dict_writer(stream)
-         if filepath.stat().st_size == 0:
-             writer.writeheader()
-         return stream
-
-     def _is_above_file_size_limit(self, filepath: Path, writer: TextIOWrapper) -> bool:
-         current_position = writer.tell()
-         writer.seek(0, 2)
-         if writer.tell() > self.max_file_size_bytes:
-             return True
-         writer.seek(current_position)
-         return False
-
-     def _write_rows(self, writer: TextIOWrapper, rows: Rows) -> None:
-         dict_writer = self._create_dict_writer(writer)
-         dict_writer.writerows(rows)
-
-     def _create_dict_writer(self, writer: TextIOWrapper) -> csv.DictWriter:
-         return csv.DictWriter(writer, fieldnames=[col.name for col in self.schema.columns], extrasaction="ignore")
-
-
- class YAMLWriter(TableFileWriter[TextIOWrapper]):
-     format = "yaml"
-
-     def _create_writer(self, filepath: Path) -> TextIOWrapper:
-         return filepath.open("a", encoding=self.encoding, newline=self.newline)
-
-     def _is_above_file_size_limit(self, filepath: Path, writer: TextIOWrapper) -> bool:
-         current_position = writer.tell()
-         writer.seek(0, 2)
-         if writer.tell() > self.max_file_size_bytes:
-             return True
-         writer.seek(current_position)
-         return False
-
-     def _write_rows(self, writer: TextIOWrapper, rows: Rows) -> None:
-         writer.write(yaml_safe_dump(rows))
-
-
- _TABLEWRITER_CLASS_BY_FORMAT: MappingProxyType[str, type[TableFileWriter]] = MappingProxyType(
-     {w.format: w for w in TableFileWriter.__subclasses__()}  # type: ignore[type-abstract]
- )