tracdap-runtime 0.8.0rc2__py3-none-any.whl → 0.9.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracdap/rt/_impl/core/config_parser.py +29 -3
- tracdap/rt/_impl/core/data.py +627 -40
- tracdap/rt/_impl/core/repos.py +17 -8
- tracdap/rt/_impl/core/storage.py +25 -13
- tracdap/rt/_impl/core/struct.py +254 -60
- tracdap/rt/_impl/core/util.py +125 -11
- tracdap/rt/_impl/exec/context.py +35 -8
- tracdap/rt/_impl/exec/dev_mode.py +169 -127
- tracdap/rt/_impl/exec/engine.py +203 -140
- tracdap/rt/_impl/exec/functions.py +228 -263
- tracdap/rt/_impl/exec/graph.py +141 -126
- tracdap/rt/_impl/exec/graph_builder.py +428 -449
- tracdap/rt/_impl/grpc/codec.py +8 -13
- tracdap/rt/_impl/grpc/server.py +7 -7
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.py +25 -18
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.pyi +27 -9
- tracdap/rt/_impl/grpc/tracdap/metadata/common_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/config_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/custom_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +37 -35
- tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.pyi +37 -43
- tracdap/rt/_impl/grpc/tracdap/metadata/file_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/flow_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +67 -63
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +11 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/object_id_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/search_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.py +11 -9
- tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.pyi +11 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/tag_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/tag_update_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/type_pb2.py +23 -19
- tracdap/rt/_impl/grpc/tracdap/metadata/type_pb2.pyi +15 -2
- tracdap/rt/_impl/runtime.py +3 -9
- tracdap/rt/_impl/static_api.py +5 -6
- tracdap/rt/_plugins/format_csv.py +2 -2
- tracdap/rt/_plugins/repo_git.py +56 -11
- tracdap/rt/_plugins/storage_aws.py +165 -150
- tracdap/rt/_plugins/storage_azure.py +17 -11
- tracdap/rt/_plugins/storage_gcp.py +35 -18
- tracdap/rt/_version.py +1 -1
- tracdap/rt/api/model_api.py +45 -0
- tracdap/rt/config/__init__.py +7 -9
- tracdap/rt/config/common.py +3 -14
- tracdap/rt/config/job.py +17 -3
- tracdap/rt/config/platform.py +9 -32
- tracdap/rt/config/result.py +8 -4
- tracdap/rt/config/runtime.py +5 -10
- tracdap/rt/config/tenant.py +28 -0
- tracdap/rt/launch/cli.py +0 -8
- tracdap/rt/launch/launch.py +1 -3
- tracdap/rt/metadata/__init__.py +35 -35
- tracdap/rt/metadata/data.py +19 -31
- tracdap/rt/metadata/job.py +3 -1
- tracdap/rt/metadata/storage.py +9 -0
- tracdap/rt/metadata/type.py +9 -5
- {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/METADATA +5 -3
- {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/RECORD +64 -63
- {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/WHEEL +1 -1
- {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/licenses/LICENSE +0 -0
- {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/core/data.py
CHANGED
@@ -16,10 +16,12 @@
|
|
16
16
|
import abc
|
17
17
|
import copy
|
18
18
|
import dataclasses as dc
|
19
|
+
import pathlib
|
19
20
|
import typing as tp
|
20
21
|
import datetime as dt
|
21
22
|
import decimal
|
22
23
|
import platform
|
24
|
+
import random
|
23
25
|
|
24
26
|
import pyarrow as pa
|
25
27
|
import pyarrow.compute as pc
|
@@ -36,8 +38,21 @@ except ModuleNotFoundError:
|
|
36
38
|
|
37
39
|
import tracdap.rt.api.experimental as _api
|
38
40
|
import tracdap.rt.metadata as _meta
|
41
|
+
import tracdap.rt.config as _cfg
|
39
42
|
import tracdap.rt.exceptions as _ex
|
43
|
+
import tracdap.rt._impl.core.config_parser as _cfg_p
|
40
44
|
import tracdap.rt._impl.core.logging as _log
|
45
|
+
import tracdap.rt._impl.core.util as _util
|
46
|
+
|
47
|
+
|
48
|
+
@dc.dataclass(frozen=True)
|
49
|
+
class DataPartKey:
|
50
|
+
|
51
|
+
@classmethod
|
52
|
+
def for_root(cls) -> "DataPartKey":
|
53
|
+
return DataPartKey(opaque_key='part-root')
|
54
|
+
|
55
|
+
opaque_key: str
|
41
56
|
|
42
57
|
|
43
58
|
@dc.dataclass(frozen=True)
|
@@ -47,61 +62,495 @@ class DataSpec:
|
|
47
62
|
schema_type: _meta.SchemaType
|
48
63
|
data_item: str
|
49
64
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
65
|
+
definition: tp.Union[_meta.DataDefinition, _meta.FileDefinition]
|
66
|
+
storage: _meta.StorageDefinition
|
67
|
+
schema: tp.Optional[_meta.SchemaDefinition] = None
|
68
|
+
|
69
|
+
primary_id: _meta.TagHeader = None
|
70
|
+
storage_id: _meta.TagHeader = None
|
71
|
+
schema_id: tp.Optional[_meta.TagHeader] = None
|
72
|
+
context_key: tp.Optional[str] = None
|
73
|
+
|
74
|
+
metadata: tp.Optional[_api.RuntimeMetadata] = None
|
54
75
|
|
55
76
|
@staticmethod
|
56
77
|
def create_data_spec(
|
57
78
|
data_item: str,
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
if
|
63
|
-
schema_type =
|
64
|
-
elif
|
65
|
-
schema_type =
|
79
|
+
definition: _meta.DataDefinition,
|
80
|
+
storage: _meta.StorageDefinition,
|
81
|
+
schema: tp.Optional[_meta.SchemaDefinition] = None) -> "DataSpec":
|
82
|
+
|
83
|
+
if schema:
|
84
|
+
schema_type = schema.schemaType
|
85
|
+
elif definition.schema:
|
86
|
+
schema_type = definition.schema.schemaType
|
66
87
|
else:
|
67
88
|
schema_type = _meta.SchemaType.SCHEMA_TYPE_NOT_SET
|
68
89
|
|
69
90
|
return DataSpec(
|
70
91
|
_meta.ObjectType.DATA, schema_type, data_item,
|
71
|
-
|
72
|
-
storage_def=storage_def,
|
73
|
-
schema_def=schema_def,
|
74
|
-
file_def=None)
|
92
|
+
definition, storage, schema)
|
75
93
|
|
76
94
|
@staticmethod
|
77
95
|
def create_file_spec(
|
78
96
|
data_item: str,
|
79
|
-
|
80
|
-
|
97
|
+
definition: _meta.FileDefinition,
|
98
|
+
storage: _meta.StorageDefinition) -> "DataSpec":
|
81
99
|
|
82
100
|
return DataSpec(
|
83
101
|
_meta.ObjectType.FILE, _meta.SchemaType.SCHEMA_TYPE_NOT_SET, data_item,
|
84
|
-
|
85
|
-
storage_def=storage_def,
|
86
|
-
data_def=None,
|
87
|
-
schema_def=None)
|
102
|
+
definition, storage)
|
88
103
|
|
89
104
|
@staticmethod
|
90
105
|
def create_empty_spec(object_type: _meta.ObjectType, schema_type: _meta.SchemaType):
|
91
|
-
return DataSpec(object_type, schema_type,
|
106
|
+
return DataSpec(object_type, schema_type, "", None, None, None) # noqa
|
107
|
+
|
108
|
+
def with_ids(
|
109
|
+
self, primary_id: _meta.TagHeader,
|
110
|
+
storage_id: _meta.TagHeader,
|
111
|
+
schema_id: tp.Optional[_meta.TagHeader] = None,
|
112
|
+
context_key: tp.Optional[str] = None):
|
113
|
+
|
114
|
+
return dc.replace(self,
|
115
|
+
primary_id=primary_id,
|
116
|
+
storage_id=storage_id,
|
117
|
+
schema_id=schema_id,
|
118
|
+
context_key=context_key)
|
119
|
+
|
120
|
+
def with_metadata(self, metadata: tp.Optional[_api.RuntimeMetadata]):
|
121
|
+
return dc.replace(self, metadata=metadata)
|
92
122
|
|
93
123
|
def is_empty(self):
|
94
124
|
return self.data_item is None or len(self.data_item) == 0
|
95
125
|
|
96
126
|
|
97
|
-
|
98
|
-
|
127
|
+
class StorageLayout(metaclass=abc.ABCMeta):
|
128
|
+
|
129
|
+
__LAYOUTS: "tp.Dict[str, StorageLayout]" = dict()
|
99
130
|
|
100
131
|
@classmethod
|
101
|
-
def
|
102
|
-
return DataPartKey(opaque_key='part_root')
|
132
|
+
def select(cls, layout_key: tp.Union[str, _meta.StorageLayout]) -> "StorageLayout":
|
103
133
|
|
104
|
-
|
134
|
+
# Legacy compatibility - layout key not set in storage definition
|
135
|
+
if not layout_key or layout_key == "":
|
136
|
+
layout_key = _meta.StorageLayout.OBJECT_ID_LAYOUT.name
|
137
|
+
|
138
|
+
if isinstance(layout_key, _meta.StorageLayout):
|
139
|
+
layout_key = layout_key.name
|
140
|
+
|
141
|
+
layout = cls.__LAYOUTS.get(layout_key)
|
142
|
+
|
143
|
+
if layout is not None:
|
144
|
+
return layout
|
145
|
+
|
146
|
+
if layout_key == _meta.StorageLayout.OBJECT_ID_LAYOUT.name:
|
147
|
+
layout = ObjectIdLayout()
|
148
|
+
elif layout_key == _meta.StorageLayout.DEVELOPER_LAYOUT.name:
|
149
|
+
layout = DevelopmentLayout()
|
150
|
+
else:
|
151
|
+
raise _ex.ETracInternal(f"Unknown storage layout [{layout_key}]")
|
152
|
+
|
153
|
+
cls.__LAYOUTS[layout_key] = layout
|
154
|
+
|
155
|
+
return layout
|
156
|
+
|
157
|
+
@abc.abstractmethod
|
158
|
+
def layout_key(self) -> _meta.StorageLayout:
|
159
|
+
pass
|
160
|
+
|
161
|
+
@abc.abstractmethod
|
162
|
+
def new_data_spec(
|
163
|
+
self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
164
|
+
context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
|
165
|
+
sys_config: _cfg.RuntimeConfig) -> DataSpec:
|
166
|
+
pass
|
167
|
+
|
168
|
+
@abc.abstractmethod
|
169
|
+
def new_data_version(
|
170
|
+
self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
171
|
+
context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
|
172
|
+
prior_spec: DataSpec) -> DataSpec:
|
173
|
+
pass
|
174
|
+
|
175
|
+
@abc.abstractmethod
|
176
|
+
def new_file_spec(
|
177
|
+
self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
178
|
+
context_key: str, file_type: _meta.FileType,
|
179
|
+
sys_config: _cfg.RuntimeConfig) -> DataSpec:
|
180
|
+
pass
|
181
|
+
|
182
|
+
@abc.abstractmethod
|
183
|
+
def new_file_version(
|
184
|
+
self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
185
|
+
context_key: str, file_type: _meta.FileType,
|
186
|
+
prior_spec: DataSpec) -> DataSpec:
|
187
|
+
pass
|
188
|
+
|
189
|
+
|
190
|
+
class BaseLayout(StorageLayout, metaclass=abc.ABCMeta):
|
191
|
+
|
192
|
+
__DATA_ITEM_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}"
|
193
|
+
__FILE_ITEM_TEMPLATE = "file/{}/version-{}"
|
194
|
+
|
195
|
+
@abc.abstractmethod
|
196
|
+
def _data_storage_path(
|
197
|
+
self, data_id: _meta.TagHeader, context_key: str, trac_schema: _meta.SchemaDefinition,
|
198
|
+
part_key: _meta.PartKey, snap_index: int, delta_index: int, storage_format: str,
|
199
|
+
prior_copy: tp.Optional[_meta.StorageCopy]):
|
200
|
+
pass
|
201
|
+
|
202
|
+
@abc.abstractmethod
|
203
|
+
def _file_storage_path(
|
204
|
+
self, file_id: _meta.TagHeader, file_def: _meta.FileDefinition,
|
205
|
+
prior_copy: tp.Optional[_meta.StorageCopy]):
|
206
|
+
pass
|
207
|
+
|
208
|
+
def new_data_spec(
|
209
|
+
self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
210
|
+
context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
|
211
|
+
sys_config: _cfg.RuntimeConfig) -> DataSpec:
|
212
|
+
|
213
|
+
part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
|
214
|
+
snap_index = 0
|
215
|
+
|
216
|
+
data_item = self.__DATA_ITEM_TEMPLATE.format(
|
217
|
+
trac_schema.schemaType.name.lower(), data_id.objectId,
|
218
|
+
part_key.opaqueKey, snap_index, 0)
|
219
|
+
|
220
|
+
# Blank data definition with no parts
|
221
|
+
new_data_def = _meta.DataDefinition(
|
222
|
+
schema=trac_schema,
|
223
|
+
storageId=_util.selector_for_latest(storage_id))
|
224
|
+
|
225
|
+
data_def = self._add_new_snap(new_data_def, data_item, part_key, snap_index)
|
226
|
+
|
227
|
+
# Take default location from the storage config
|
228
|
+
storage_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LOCATION)
|
229
|
+
if trac_schema.schemaType == _meta.SchemaType.STRUCT_SCHEMA:
|
230
|
+
storage_format = "text/json"
|
231
|
+
else:
|
232
|
+
storage_format = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_FORMAT, "text/csv")
|
233
|
+
storage_path = self._data_storage_path(data_id, context_key, trac_schema, part_key, snap_index, 0, storage_format, prior_copy=None)
|
234
|
+
|
235
|
+
storage_copy = _meta.StorageCopy(
|
236
|
+
storageKey=storage_key,
|
237
|
+
storagePath=storage_path,
|
238
|
+
storageFormat=storage_format,
|
239
|
+
copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
|
240
|
+
copyTimestamp=data_id.objectTimestamp)
|
241
|
+
|
242
|
+
new_storage_def = _meta.StorageDefinition()
|
243
|
+
|
244
|
+
storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)
|
245
|
+
|
246
|
+
# Dynamic data def will always use an embedded schema (this is no ID for an external schema)
|
247
|
+
|
248
|
+
return DataSpec \
|
249
|
+
.create_data_spec(data_item, data_def, storage_def, schema=None) \
|
250
|
+
.with_ids(data_id, storage_id)
|
251
|
+
|
252
|
+
def new_data_version(
|
253
|
+
self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
254
|
+
context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
|
255
|
+
prior_spec: DataSpec) -> DataSpec:
|
256
|
+
|
257
|
+
part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
|
258
|
+
snap_index = prior_spec.primary_id.objectVersion # snap index is zero-based
|
259
|
+
|
260
|
+
data_item = self.__DATA_ITEM_TEMPLATE.format(
|
261
|
+
trac_schema.schemaType.name.lower(), data_id.objectId,
|
262
|
+
part_key.opaqueKey, snap_index, 0)
|
263
|
+
|
264
|
+
data_def = self._add_new_snap(prior_spec.definition, data_item, part_key, snap_index)
|
265
|
+
|
266
|
+
prior_item = next(iter(prior_spec.storage.dataItems.keys()), None)
|
267
|
+
prior_copy = self._find_storage_copy(prior_item, prior_spec.storage)
|
268
|
+
|
269
|
+
if prior_copy is None:
|
270
|
+
raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(data_id)}]")
|
271
|
+
|
272
|
+
storage_key = prior_copy.storageKey
|
273
|
+
storage_format = prior_copy.storageFormat
|
274
|
+
storage_path = self._data_storage_path(data_id, context_key, trac_schema, part_key, snap_index, 0, storage_format, prior_copy)
|
275
|
+
|
276
|
+
storage_copy = _meta.StorageCopy(
|
277
|
+
storageKey=storage_key,
|
278
|
+
storagePath=storage_path,
|
279
|
+
storageFormat=storage_format,
|
280
|
+
copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
|
281
|
+
copyTimestamp=data_id.objectTimestamp)
|
282
|
+
|
283
|
+
storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)
|
284
|
+
|
285
|
+
return DataSpec \
|
286
|
+
.create_data_spec(data_item, data_def, storage_def, schema=None) \
|
287
|
+
.with_ids(data_id, storage_id)
|
288
|
+
|
289
|
+
def new_file_spec(
|
290
|
+
self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
291
|
+
context_key: str, file_type: _meta.FileType,
|
292
|
+
sys_config: _cfg.RuntimeConfig) -> DataSpec:
|
293
|
+
|
294
|
+
data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)
|
295
|
+
|
296
|
+
file_def = _meta.FileDefinition(
|
297
|
+
name=f"{context_key}.{file_type.extension}",
|
298
|
+
extension=file_type.extension,
|
299
|
+
mimeType=file_type.mimeType,
|
300
|
+
dataItem=data_item,
|
301
|
+
storageId=_util.selector_for_latest(storage_id),
|
302
|
+
size=0)
|
303
|
+
|
304
|
+
storage_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LOCATION)
|
305
|
+
storage_format = file_def.mimeType
|
306
|
+
storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)
|
307
|
+
|
308
|
+
storage_copy = _meta.StorageCopy(
|
309
|
+
storageKey=storage_key,
|
310
|
+
storagePath=storage_path,
|
311
|
+
storageFormat=storage_format,
|
312
|
+
copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
|
313
|
+
copyTimestamp=file_id.objectTimestamp)
|
314
|
+
|
315
|
+
new_storage_def = _meta.StorageDefinition()
|
316
|
+
new_storage_def.layout = self.layout_key()
|
317
|
+
|
318
|
+
storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)
|
319
|
+
|
320
|
+
return DataSpec \
|
321
|
+
.create_file_spec(data_item, file_def, storage_def) \
|
322
|
+
.with_ids(file_id, storage_id)
|
323
|
+
|
324
|
+
def new_file_version(
|
325
|
+
self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
326
|
+
context_key: str, file_type: _meta.FileType, prior_spec: DataSpec) -> DataSpec:
|
327
|
+
|
328
|
+
data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)
|
329
|
+
|
330
|
+
file_def = _meta.FileDefinition(
|
331
|
+
name=f"{context_key}.{file_type.extension}",
|
332
|
+
extension=file_type.extension,
|
333
|
+
mimeType=file_type.mimeType,
|
334
|
+
dataItem=data_item,
|
335
|
+
storageId=_util.selector_for_latest(storage_id),
|
336
|
+
size=0)
|
337
|
+
|
338
|
+
prior_copy = self._find_storage_copy(prior_spec.definition.dataItem, prior_spec.storage)
|
339
|
+
|
340
|
+
if prior_copy is None:
|
341
|
+
raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(file_id)}]")
|
342
|
+
|
343
|
+
storage_key = prior_copy.storageKey
|
344
|
+
storage_format = file_def.mimeType
|
345
|
+
storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)
|
346
|
+
|
347
|
+
storage_copy = _meta.StorageCopy(
|
348
|
+
storageKey=storage_key,
|
349
|
+
storagePath=storage_path,
|
350
|
+
storageFormat=storage_format,
|
351
|
+
copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
|
352
|
+
copyTimestamp=file_id.objectTimestamp)
|
353
|
+
|
354
|
+
storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)
|
355
|
+
|
356
|
+
return DataSpec \
|
357
|
+
.create_file_spec(data_item, file_def, storage_def) \
|
358
|
+
.with_ids(file_id, storage_id)
|
359
|
+
|
360
|
+
@classmethod
|
361
|
+
def _add_new_snap(
|
362
|
+
cls, data_def: _meta.DataDefinition,data_item: str,
|
363
|
+
part_key: _meta.PartKey, snap_index: int):
|
364
|
+
|
365
|
+
delta = _meta.DataDefinition.Delta(
|
366
|
+
deltaIndex=0,
|
367
|
+
dataItem=data_item)
|
368
|
+
|
369
|
+
snap = _meta.DataDefinition.Snap(
|
370
|
+
snapIndex=snap_index,
|
371
|
+
deltas=[delta])
|
372
|
+
|
373
|
+
part = _meta.DataDefinition.Part(
|
374
|
+
partKey=part_key,
|
375
|
+
snap=snap)
|
376
|
+
|
377
|
+
data_def = copy.copy(data_def)
|
378
|
+
data_def.parts = copy.copy(data_def.parts)
|
379
|
+
data_def.parts[part_key.opaqueKey] = part
|
380
|
+
|
381
|
+
return data_def
|
382
|
+
|
383
|
+
@classmethod
|
384
|
+
def _add_storage_copy(cls, storage_def: _meta.StorageDefinition, data_item: str, storage_copy: _meta.StorageCopy):
|
385
|
+
|
386
|
+
new_incarnation = _meta.StorageIncarnation(
|
387
|
+
copies=[storage_copy],
|
388
|
+
incarnationIndex=0,
|
389
|
+
incarnationTimestamp=storage_copy.copyTimestamp,
|
390
|
+
incarnationStatus=_meta.IncarnationStatus.INCARNATION_AVAILABLE)
|
391
|
+
|
392
|
+
new_item = _meta.StorageItem(incarnations=[new_incarnation])
|
393
|
+
|
394
|
+
storage_def = copy.copy(storage_def)
|
395
|
+
storage_def.dataItems = copy.copy(storage_def.dataItems)
|
396
|
+
storage_def.dataItems[data_item] = new_item
|
397
|
+
|
398
|
+
return storage_def
|
399
|
+
|
400
|
+
@classmethod
|
401
|
+
def _find_storage_copy(cls, data_item: str, storage_def: _meta.StorageDefinition) -> tp.Optional[_meta.StorageCopy]:
|
402
|
+
|
403
|
+
if data_item is None:
|
404
|
+
return None
|
405
|
+
|
406
|
+
storage_item = storage_def.dataItems.get(data_item)
|
407
|
+
|
408
|
+
if storage_item is None:
|
409
|
+
return None
|
410
|
+
|
411
|
+
# Latest available incarnation
|
412
|
+
incarnation = next(filter(
|
413
|
+
lambda i: i.incarnationStatus == _meta.IncarnationStatus.INCARNATION_AVAILABLE,
|
414
|
+
reversed(storage_item.incarnations)), None)
|
415
|
+
|
416
|
+
if incarnation is None:
|
417
|
+
return None
|
418
|
+
|
419
|
+
# Use any available copy (currently there is no location preference)
|
420
|
+
return next(filter(
|
421
|
+
lambda c: c.copyStatus == _meta.CopyStatus.COPY_AVAILABLE,
|
422
|
+
incarnation.copies), None)
|
423
|
+
|
424
|
+
|
425
|
+
class ObjectIdLayout(BaseLayout):
|
426
|
+
|
427
|
+
__DATA_STORAGE_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}-x{:0>6x}"
|
428
|
+
__FILE_STORAGE_TEMPLATE = "file/{}/version-{:d}-x{:0>6x}/{}.{}"
|
429
|
+
|
430
|
+
def __init__(self):
|
431
|
+
self.__random = random.Random()
|
432
|
+
self.__random.seed()
|
433
|
+
|
434
|
+
def layout_key(self) -> _meta.StorageLayout:
|
435
|
+
return _meta.StorageLayout.OBJECT_ID_LAYOUT
|
436
|
+
|
437
|
+
def _data_storage_path(
|
438
|
+
self, data_id, context_key, trac_schema,
|
439
|
+
part_key, snap_index, delta_index,
|
440
|
+
storage_format, prior_copy):
|
441
|
+
|
442
|
+
schema_type = trac_schema.schemaType.name.lower()
|
443
|
+
version_suffix = self.__random.randint(0, 1 << 24)
|
444
|
+
|
445
|
+
base_path = self.__DATA_STORAGE_TEMPLATE.format(
|
446
|
+
schema_type, data_id.objectId,
|
447
|
+
part_key.opaqueKey, snap_index, delta_index,
|
448
|
+
version_suffix)
|
449
|
+
|
450
|
+
# STRUCT stored as a single file, not directory layout
|
451
|
+
if trac_schema.schemaType == _meta.SchemaType.STRUCT_SCHEMA:
|
452
|
+
return base_path + ".json"
|
453
|
+
else:
|
454
|
+
return base_path
|
455
|
+
|
456
|
+
def _file_storage_path(self, file_id, file_def, prior_copy):
|
457
|
+
|
458
|
+
version_suffix = self.__random.randint(0, 1 << 24)
|
459
|
+
|
460
|
+
return self.__FILE_STORAGE_TEMPLATE.format(
|
461
|
+
file_id.objectId, file_id.objectVersion, version_suffix,
|
462
|
+
file_def.name, file_def.extension.lower())
|
463
|
+
|
464
|
+
|
465
|
+
class DevelopmentLayout(BaseLayout):
|
466
|
+
|
467
|
+
__DEFAULT_OUTPUT_DIR = "Dev Outputs"
|
468
|
+
|
469
|
+
__DATA_STORAGE_PATH = "{}/{}{}.{}"
|
470
|
+
__FILE_STORAGE_PATH = "{}/{}{}.{}"
|
471
|
+
|
472
|
+
def layout_key(self) -> _meta.StorageLayout:
|
473
|
+
return _meta.StorageLayout.DEVELOPER_LAYOUT
|
474
|
+
|
475
|
+
def _data_storage_path(
|
476
|
+
self, data_id, context_key, trac_schema,
|
477
|
+
part_key, snap_index, delta_index,
|
478
|
+
storage_format, prior_copy):
|
479
|
+
|
480
|
+
storage_dir = self._dev_storage_dir(prior_copy)
|
481
|
+
suffix = f"-{data_id.objectVersion}" if data_id.objectVersion > 1 else ""
|
482
|
+
|
483
|
+
if prior_copy is not None:
|
484
|
+
prior_path = pathlib.Path(prior_copy.storagePath)
|
485
|
+
file_name = prior_path.stem
|
486
|
+
if data_id.objectVersion > 2 and "-" in file_name:
|
487
|
+
file_name = file_name[:file_name.rfind("-")]
|
488
|
+
else:
|
489
|
+
file_name = context_key
|
490
|
+
|
491
|
+
return self.__DATA_STORAGE_PATH.format(storage_dir, file_name, suffix, storage_format.lower())
|
492
|
+
|
493
|
+
def _file_storage_path(self, file_id, file_def, prior_copy):
|
494
|
+
|
495
|
+
storage_dir = self._dev_storage_dir(prior_copy)
|
496
|
+
suffix = f"-{file_id.objectVersion}" if file_id.objectVersion > 1 else ""
|
497
|
+
|
498
|
+
return self.__FILE_STORAGE_PATH.format(storage_dir, file_def.name, suffix, file_def.extension.lower())
|
499
|
+
|
500
|
+
def _dev_storage_dir(self, prior_copy: _meta.StorageCopy):
|
501
|
+
|
502
|
+
if prior_copy is None:
|
503
|
+
return self.__DEFAULT_OUTPUT_DIR
|
504
|
+
|
505
|
+
prior_path = pathlib.Path(prior_copy.storagePath)
|
506
|
+
|
507
|
+
if len(prior_path.parts) > 1:
|
508
|
+
return prior_path.parent
|
509
|
+
else:
|
510
|
+
return self.__DEFAULT_OUTPUT_DIR
|
511
|
+
|
512
|
+
|
513
|
+
def build_data_spec(
|
514
|
+
data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
515
|
+
context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
|
516
|
+
sys_config: _cfg.RuntimeConfig,
|
517
|
+
prior_spec: tp.Optional[DataSpec] = None,
|
518
|
+
metadata: tp.Optional[_api.RuntimeMetadata] = None) \
|
519
|
+
-> DataSpec:
|
520
|
+
|
521
|
+
if prior_spec is None:
|
522
|
+
layout_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LAYOUT, _cfg_p.ConfigKDefaults.STORAGE_DEFAULT_LAYOUT)
|
523
|
+
layout = StorageLayout.select(layout_key)
|
524
|
+
spec = layout.new_data_spec(data_id, storage_id, context_key, trac_schema, sys_config)
|
525
|
+
else:
|
526
|
+
layout_key = prior_spec.storage.layout
|
527
|
+
layout = StorageLayout.select(layout_key)
|
528
|
+
spec = layout.new_data_version(data_id, storage_id, context_key, trac_schema, prior_spec)
|
529
|
+
|
530
|
+
# Attach metadata if it is available
|
531
|
+
return spec.with_metadata(metadata) if metadata is not None else spec
|
532
|
+
|
533
|
+
|
534
|
+
def build_file_spec(
|
535
|
+
file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
|
536
|
+
context_key: tp.Optional[str], file_type: _meta.FileType,
|
537
|
+
sys_config: _cfg.RuntimeConfig,
|
538
|
+
prior_spec: tp.Optional[DataSpec] = None,
|
539
|
+
metadata: tp.Optional[_api.RuntimeMetadata] = None) \
|
540
|
+
-> DataSpec:
|
541
|
+
|
542
|
+
if prior_spec is None:
|
543
|
+
layout_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LAYOUT, _cfg_p.ConfigKDefaults.STORAGE_DEFAULT_LAYOUT)
|
544
|
+
layout = StorageLayout.select(layout_key)
|
545
|
+
spec = layout.new_file_spec(file_id, storage_id, context_key, file_type, sys_config)
|
546
|
+
|
547
|
+
else:
|
548
|
+
layout_key = prior_spec.storage.layout
|
549
|
+
layout = StorageLayout.select(layout_key)
|
550
|
+
spec = layout.new_file_version(file_id, storage_id, context_key, file_type, prior_spec)
|
551
|
+
|
552
|
+
# Attach metadata if it is available
|
553
|
+
return spec.with_metadata(metadata) if metadata is not None else spec
|
105
554
|
|
106
555
|
|
107
556
|
@dc.dataclass(frozen=True)
|
@@ -121,6 +570,8 @@ class DataItem:
|
|
121
570
|
schema: pa.Schema = None
|
122
571
|
table: tp.Optional[pa.Table] = None
|
123
572
|
|
573
|
+
metadata: tp.Optional[_api.RuntimeMetadata] = None
|
574
|
+
|
124
575
|
def is_empty(self) -> bool:
|
125
576
|
return self.content is None
|
126
577
|
|
@@ -138,7 +589,7 @@ class DataItem:
|
|
138
589
|
def for_table(table: pa.Table, schema: pa.Schema, trac_schema: _meta.SchemaDefinition) -> "DataItem":
|
139
590
|
|
140
591
|
return DataItem(
|
141
|
-
_meta.ObjectType.DATA, _meta.SchemaType.
|
592
|
+
_meta.ObjectType.DATA, _meta.SchemaType.TABLE_SCHEMA,
|
142
593
|
content=table, content_type=pa.Table,
|
143
594
|
trac_schema=trac_schema, native_schema=schema,
|
144
595
|
table=table, schema=schema)
|
@@ -147,7 +598,7 @@ class DataItem:
|
|
147
598
|
def for_struct(content: tp.Any):
|
148
599
|
|
149
600
|
return DataItem(
|
150
|
-
_meta.ObjectType.DATA, _meta.SchemaType.
|
601
|
+
_meta.ObjectType.DATA, _meta.SchemaType.STRUCT_SCHEMA,
|
151
602
|
content=content, content_type=type(content))
|
152
603
|
|
153
604
|
@staticmethod
|
@@ -157,6 +608,9 @@ class DataItem:
|
|
157
608
|
_meta.ObjectType.FILE, _meta.SchemaType.SCHEMA_TYPE_NOT_SET,
|
158
609
|
content=content, content_type=bytes)
|
159
610
|
|
611
|
+
def with_metadata(self, metadata: _api.RuntimeMetadata) -> "DataItem":
|
612
|
+
return dc.replace(self, metadata=metadata)
|
613
|
+
|
160
614
|
|
161
615
|
@dc.dataclass(frozen=True)
|
162
616
|
class DataView:
|
@@ -169,6 +623,8 @@ class DataView:
|
|
169
623
|
parts: tp.Dict[DataPartKey, tp.List[DataItem]] = None
|
170
624
|
file_item: tp.Optional[DataItem] = None
|
171
625
|
|
626
|
+
metadata: tp.Optional[_api.RuntimeMetadata] = None
|
627
|
+
|
172
628
|
@staticmethod
|
173
629
|
def create_empty(object_type: _meta.ObjectType = _meta.ObjectType.DATA) -> "DataView":
|
174
630
|
if object_type == _meta.ObjectType.DATA:
|
@@ -184,21 +640,41 @@ class DataView:
|
|
184
640
|
else:
|
185
641
|
return DataView(_meta.ObjectType.DATA, trac_schema, parts = dict())
|
186
642
|
|
643
|
+
@staticmethod
|
644
|
+
def for_arrow_schema(arrow_schema: pa.Schema):
|
645
|
+
trac_schema = DataMapping.arrow_to_trac_schema(arrow_schema)
|
646
|
+
return DataView(_meta.ObjectType.DATA, trac_schema, arrow_schema, dict())
|
647
|
+
|
187
648
|
@staticmethod
|
188
649
|
def for_file_item(file_item: DataItem):
|
189
650
|
return DataView(file_item.object_type, file_item=file_item)
|
190
651
|
|
191
652
|
def with_trac_schema(self, trac_schema: _meta.SchemaDefinition):
|
192
653
|
arrow_schema = DataMapping.trac_to_arrow_schema(trac_schema)
|
193
|
-
return
|
654
|
+
return dc.replace(self, trac_schema=trac_schema, arrow_schema=arrow_schema)
|
194
655
|
|
195
656
|
def with_part(self, part_key: DataPartKey, part: DataItem):
|
196
|
-
new_parts = copy.copy(self.parts)
|
657
|
+
new_parts = copy.copy(self.parts) if self.parts is not None else {}
|
197
658
|
new_parts[part_key] = [part]
|
198
|
-
return
|
659
|
+
return dc.replace(self, parts=new_parts)
|
199
660
|
|
200
661
|
def with_file_item(self, file_item: DataItem):
|
201
|
-
return
|
662
|
+
return dc.replace(self, file_item=file_item)
|
663
|
+
|
664
|
+
def with_metadata(self, metadata: _api.RuntimeMetadata) -> "DataView":
|
665
|
+
return dc.replace(self, metadata=metadata)
|
666
|
+
|
667
|
+
def get_metadata(self) -> tp.Optional[_api.RuntimeMetadata]:
|
668
|
+
if self.metadata:
|
669
|
+
return self.metadata
|
670
|
+
if self.object_type == _meta.ObjectType.FILE and self.file_item:
|
671
|
+
return self.file_item.metadata
|
672
|
+
if self.parts:
|
673
|
+
for items in self.parts.values():
|
674
|
+
for item in items:
|
675
|
+
if item and item.metadata:
|
676
|
+
return item.metadata
|
677
|
+
return None
|
202
678
|
|
203
679
|
def is_empty(self) -> bool:
|
204
680
|
if self.object_type == _meta.ObjectType.FILE:
|
@@ -259,8 +735,8 @@ class DataMapping:
|
|
259
735
|
pa.date64(): _meta.BasicType.DATE
|
260
736
|
}
|
261
737
|
|
262
|
-
@
|
263
|
-
def arrow_to_python_type(arrow_type: pa.DataType) -> type:
|
738
|
+
@classmethod
|
739
|
+
def arrow_to_python_type(cls, arrow_type: pa.DataType) -> type:
|
264
740
|
|
265
741
|
if pa.types.is_boolean(arrow_type):
|
266
742
|
return bool
|
@@ -283,6 +759,11 @@ class DataMapping:
|
|
283
759
|
if pa.types.is_timestamp(arrow_type):
|
284
760
|
return dt.datetime
|
285
761
|
|
762
|
+
# The python type for a dictionary-encoded field is its value type
|
763
|
+
if pa.types.is_dictionary(arrow_type):
|
764
|
+
if isinstance(arrow_type, pa.DictionaryType):
|
765
|
+
return cls.arrow_to_python_type(arrow_type.value_type)
|
766
|
+
|
286
767
|
raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")
|
287
768
|
|
288
769
|
@classmethod
|
@@ -340,7 +821,13 @@ class DataMapping:
|
|
340
821
|
def trac_to_arrow_field(cls, trac_field: _meta.FieldSchema):
|
341
822
|
|
342
823
|
arrow_type = cls.trac_to_arrow_basic_type(trac_field.fieldType)
|
343
|
-
|
824
|
+
|
825
|
+
# Categorical data uses an unordered dictionary with int32 index, ordered encoding not (currently) supported
|
826
|
+
# For legacy compatability, only use dictionary encoding if the categorical feature is enabled
|
827
|
+
if trac_field.categorical:
|
828
|
+
arrow_type = pa.dictionary(pa.int32(), arrow_type, False)
|
829
|
+
|
830
|
+
nullable = not (trac_field.notNull or trac_field.businessKey)
|
344
831
|
|
345
832
|
return pa.field(trac_field.fieldName, arrow_type, nullable)
|
346
833
|
|
@@ -369,12 +856,15 @@ class DataMapping:
|
|
369
856
|
field_type = cls.arrow_to_trac_type(field.type)
|
370
857
|
label = field.metadata["label"] if field.metadata and "label" in field.metadata else field.name
|
371
858
|
|
859
|
+
# When converting Arrow -> TRAC, always set the categorical flag for dictionary encoded fields
|
860
|
+
# This affects dynamic imports and is informational only (physical layout is controlled by Arrow schemas)
|
861
|
+
|
372
862
|
return _meta.FieldSchema(
|
373
863
|
field.name, field_index, field_type,
|
374
864
|
label=label,
|
375
865
|
businessKey=False,
|
376
866
|
notNull=not field.nullable,
|
377
|
-
categorical=
|
867
|
+
categorical=pa.types.is_dictionary(field.type))
|
378
868
|
|
379
869
|
@classmethod
|
380
870
|
def arrow_to_trac_type(cls, arrow_type: pa.DataType) -> _meta.BasicType:
|
@@ -390,6 +880,11 @@ class DataMapping:
|
|
390
880
|
if pa.types.is_timestamp(arrow_type):
|
391
881
|
return _meta.BasicType.DATETIME
|
392
882
|
|
883
|
+
# The basic type for a dictionary-encoded field is its value type
|
884
|
+
if pa.types.is_dictionary(arrow_type):
|
885
|
+
if isinstance(arrow_type, pa.DictionaryType):
|
886
|
+
return cls.arrow_to_trac_type(arrow_type.value_type)
|
887
|
+
|
393
888
|
raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
|
394
889
|
|
395
890
|
@classmethod
|
@@ -470,7 +965,7 @@ T_INTERNAL_DATA = tp.TypeVar("T_INTERNAL_DATA")
|
|
470
965
|
T_INTERNAL_SCHEMA = tp.TypeVar("T_INTERNAL_SCHEMA")
|
471
966
|
|
472
967
|
|
473
|
-
class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
|
968
|
+
class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA], metaclass=abc.ABCMeta):
|
474
969
|
|
475
970
|
# Available per-framework args, to enable framework-specific type-checking in public APIs
|
476
971
|
# These should (for a purist point of view) be in the individual converter classes
|
@@ -766,6 +1261,10 @@ class DataConformance:
|
|
766
1261
|
"Field [{field_name}] cannot be converted from {vector_type} to {field_type}, " + \
|
767
1262
|
"source and target have different time zones"
|
768
1263
|
|
1264
|
+
# Error template for categorical (dictionary-encoded) type mismatches
# Fix: the original concatenation had no space between "match" and "(expected",
# producing "...do not match(expected ...)" in the rendered error message
__E_WRONG_CATEGORICAL_TYPE = \
    "Field [{field_name}] categorical types do not match " + \
    "(expected {field_type}, got {vector_type})"
|
1267
|
+
|
769
1268
|
@classmethod
|
770
1269
|
def column_filter(cls, columns: tp.List[str], schema: tp.Optional[pa.Schema]) -> tp.Optional[tp.List[str]]:
|
771
1270
|
|
@@ -918,13 +1417,19 @@ class DataConformance:
|
|
918
1417
|
@classmethod
|
919
1418
|
def _coerce_vector(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
|
920
1419
|
|
1420
|
+
# Handle null vector
|
921
1421
|
if pa.types.is_null(vector.type):
|
922
|
-
|
923
1422
|
if field.nullable:
|
924
1423
|
return pa.nulls(size=len(vector), type=field.type)
|
925
1424
|
else:
|
926
1425
|
raise _ex.EDataConformance(f"All null values in non-null field [{field.name}]")
|
927
1426
|
|
1427
|
+
# If the vector is dict-encoded but the expected result is not, decode the dictionary
|
1428
|
+
if pa.types.is_dictionary(vector.type) and not pa.types.is_dictionary(field.type):
|
1429
|
+
if isinstance(vector, pa.DictionaryArray):
|
1430
|
+
dict_vector: pa.DictionaryArray = vector
|
1431
|
+
vector = dict_vector.dictionary_decode()
|
1432
|
+
|
928
1433
|
if pa.types.is_boolean(field.type):
|
929
1434
|
return cls._coerce_boolean(vector, field)
|
930
1435
|
|
@@ -946,6 +1451,9 @@ class DataConformance:
|
|
946
1451
|
if pa.types.is_timestamp(field.type):
|
947
1452
|
return cls._coerce_timestamp(vector, field)
|
948
1453
|
|
1454
|
+
if pa.types.is_dictionary(field.type):
|
1455
|
+
return cls._coerce_dictionary(vector, field)
|
1456
|
+
|
949
1457
|
error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
|
950
1458
|
cls.__log.error(error_message)
|
951
1459
|
raise _ex.EDataConformance(error_message)
|
@@ -1188,6 +1696,85 @@ class DataConformance:
|
|
1188
1696
|
|
1189
1697
|
return pc.cast(scaled_vector, field.type)
|
1190
1698
|
|
1699
|
+
@classmethod
def _coerce_dictionary(cls, vector: pa.Array, field: pa.Field):

    """
    Coerce a vector to a dictionary-encoded (categorical) target field.

    Handles three cases: the vector is already dictionary-encoded (cast or
    re-encode its values), the vector's type matches the target's value type
    (dictionary-encode it directly), or a fallback where the values are coerced
    first and then dictionary-encoded.

    :param vector: The source vector (plain, dictionary-encoded, or chunked)
    :param field: The target field; field.type must be a pa.DictionaryType
    :raises _ex.EDataConformance: If the coercion is not possible or loses data
    """

    try:

        # Guard: this method must only be called for dictionary-typed target fields
        if not isinstance(field.type, pa.DictionaryType):
            raise _ex.EUnexpected()

        field_type: pa.DictionaryType = field.type

        # Supplied vector is a dictionary (but the dictionary type is not an exact match)
        if pa.types.is_dictionary(vector.type):

            # Narrow the type for static checking; is_dictionary() already implies this
            if not isinstance(vector.type, pa.DictionaryType):
                raise _ex.EUnexpected()

            vector_type: pa.DictionaryType = vector.type

            # Do not allow coercion to a smaller index type or from unordered to ordered
            if (vector_type.index_type.bit_width > field_type.index_type.bit_width) or \
                    (field_type.ordered and not vector_type.ordered):

                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

            # Value types are the same - basic cast should succeed
            if vector_type.value_type == field_type.value_type:
                return pc.cast(vector, field.type)

            # Value types differ - try to coerce the underlying dictionary
            elif isinstance(vector, pa.DictionaryArray):
                try:
                    # Coerce only the dictionary values, keep the existing indices
                    values_field = pa.field(field.name, field_type.value_type, field.nullable)
                    values_vector = cls._coerce_vector(vector.dictionary, values_field)
                    dict_vector = pa.DictionaryArray.from_arrays(vector.indices, values_vector, ordered=field_type.ordered)  # noqa
                    return pc.cast(dict_vector, field.type)
                # Handle errors converting the value type
                except _ex.EDataConformance as e:
                    error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
                    cls.__log.error(error_message)
                    raise _ex.EDataConformance(error_message) from e

            # Special handling for chunked dictionaries
            elif isinstance(vector, pa.ChunkedArray):
                # Recurse per chunk; an empty chunked array becomes an empty array of the target type
                if any(vector.chunks):
                    chunks = [cls._coerce_dictionary(chunk, field) for chunk in vector.chunks]
                    return pa.chunked_array(chunks)
                else:
                    return pa.array([], type=field.type, size=0)  # noqa

            # Vector type not recognized, coercion is not possible
            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        # Supplied vector matches the dictionary value type - perform dictionary encoding
        # NOTE(review): ordered targets deliberately fall through to the coercion path below
        elif vector.type == field_type.value_type and not field_type.ordered:
            return vector.dictionary_encode().cast(field.type)

        # Fallback option - try to coerce the value type first, then perform dictionary encoding
        else:
            try:
                values_field = pa.field(field.name, field_type.value_type, field.nullable)
                values_vector = cls._coerce_vector(vector, values_field)
                return values_vector.dictionary_encode().cast(field.type)
            # Handle errors converting the value type
            except _ex.EDataConformance as e:
                error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message) from e

    except pa.ArrowInvalid as e:

        # Arrow-level cast failures are reported as data loss conformance errors
        error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message) from e
|
1777
|
+
|
1191
1778
|
@classmethod
|
1192
1779
|
def _format_error(cls, error_template: str, vector: pa.Array, field: pa.Field, e: Exception = None):
|
1193
1780
|
|