tracdap-runtime 0.8.0rc1__py3-none-any.whl → 0.9.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracdap/rt/_impl/core/data.py +578 -33
- tracdap/rt/_impl/core/repos.py +7 -0
- tracdap/rt/_impl/core/storage.py +10 -3
- tracdap/rt/_impl/core/util.py +54 -11
- tracdap/rt/_impl/exec/dev_mode.py +122 -100
- tracdap/rt/_impl/exec/engine.py +178 -109
- tracdap/rt/_impl/exec/functions.py +218 -257
- tracdap/rt/_impl/exec/graph.py +140 -125
- tracdap/rt/_impl/exec/graph_builder.py +411 -449
- tracdap/rt/_impl/grpc/codec.py +4 -2
- tracdap/rt/_impl/grpc/server.py +7 -7
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.py +25 -18
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.pyi +27 -9
- tracdap/rt/_impl/grpc/tracdap/metadata/common_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/config_pb2.py +40 -0
- tracdap/rt/_impl/grpc/tracdap/metadata/config_pb2.pyi +62 -0
- tracdap/rt/_impl/grpc/tracdap/metadata/custom_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/file_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/flow_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +67 -63
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +11 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/object_id_pb2.py +3 -3
- tracdap/rt/_impl/grpc/tracdap/metadata/object_id_pb2.pyi +4 -0
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.py +8 -6
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.pyi +8 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +18 -5
- tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.pyi +42 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/search_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.py +11 -9
- tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.pyi +11 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/tag_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/tag_update_pb2.py +1 -1
- tracdap/rt/_impl/grpc/tracdap/metadata/type_pb2.py +1 -1
- tracdap/rt/_impl/runtime.py +8 -0
- tracdap/rt/_plugins/repo_git.py +56 -11
- tracdap/rt/_version.py +1 -1
- tracdap/rt/config/__init__.py +6 -4
- tracdap/rt/config/common.py +5 -0
- tracdap/rt/config/dynamic.py +28 -0
- tracdap/rt/config/job.py +13 -3
- tracdap/rt/config/result.py +8 -4
- tracdap/rt/config/runtime.py +2 -0
- tracdap/rt/metadata/__init__.py +37 -30
- tracdap/rt/metadata/config.py +95 -0
- tracdap/rt/metadata/job.py +2 -0
- tracdap/rt/metadata/object.py +6 -0
- tracdap/rt/metadata/object_id.py +4 -0
- tracdap/rt/metadata/resource.py +41 -1
- tracdap/rt/metadata/storage.py +9 -0
- {tracdap_runtime-0.8.0rc1.dist-info → tracdap_runtime-0.9.0b1.dist-info}/METADATA +5 -2
- {tracdap_runtime-0.8.0rc1.dist-info → tracdap_runtime-0.9.0b1.dist-info}/RECORD +56 -52
- {tracdap_runtime-0.8.0rc1.dist-info → tracdap_runtime-0.9.0b1.dist-info}/WHEEL +1 -1
- {tracdap_runtime-0.8.0rc1.dist-info → tracdap_runtime-0.9.0b1.dist-info/licenses}/LICENSE +0 -0
- {tracdap_runtime-0.8.0rc1.dist-info → tracdap_runtime-0.9.0b1.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/core/data.py
CHANGED
@@ -16,10 +16,12 @@
|
|
16
16
|
import abc
|
17
17
|
import copy
|
18
18
|
import dataclasses as dc
|
19
|
+
import pathlib
|
19
20
|
import typing as tp
|
20
21
|
import datetime as dt
|
21
22
|
import decimal
|
22
23
|
import platform
|
24
|
+
import random
|
23
25
|
|
24
26
|
import pyarrow as pa
|
25
27
|
import pyarrow.compute as pc
|
@@ -36,8 +38,20 @@ except ModuleNotFoundError:
|
|
36
38
|
|
37
39
|
import tracdap.rt.api.experimental as _api
|
38
40
|
import tracdap.rt.metadata as _meta
|
41
|
+
import tracdap.rt.config as _cfg
|
39
42
|
import tracdap.rt.exceptions as _ex
|
40
43
|
import tracdap.rt._impl.core.logging as _log
|
44
|
+
import tracdap.rt._impl.core.util as _util
|
45
|
+
|
46
|
+
|
47
|
+
@dc.dataclass(frozen=True)
class DataPartKey:

    """Immutable key identifying one part of a dataset by an opaque string."""

    # Opaque identifier of the part; the root part uses 'part-root'
    opaque_key: str

    @classmethod
    def for_root(cls) -> "DataPartKey":

        """Return the key for the root part of a dataset."""

        root_key = 'part-root'
        return DataPartKey(opaque_key=root_key)
|
41
55
|
|
42
56
|
|
43
57
|
@dc.dataclass(frozen=True)
|
@@ -47,61 +61,470 @@ class DataSpec:
|
|
47
61
|
schema_type: _meta.SchemaType
|
48
62
|
data_item: str
|
49
63
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
64
|
+
definition: tp.Union[_meta.DataDefinition, _meta.FileDefinition]
|
65
|
+
storage: _meta.StorageDefinition
|
66
|
+
schema: tp.Optional[_meta.SchemaDefinition] = None
|
67
|
+
|
68
|
+
primary_id: _meta.TagHeader = None
|
69
|
+
storage_id: _meta.TagHeader = None
|
70
|
+
schema_id: tp.Optional[_meta.TagHeader] = None
|
71
|
+
context_key: tp.Optional[str] = None
|
54
72
|
|
55
73
|
@staticmethod
def create_data_spec(
        data_item: str,
        definition: _meta.DataDefinition,
        storage: _meta.StorageDefinition,
        schema: tp.Optional[_meta.SchemaDefinition] = None) -> "DataSpec":

    """
    Build a DataSpec of object type DATA for the given data item.

    The schema type is taken from the explicit schema if one is supplied,
    otherwise from the schema embedded in the data definition; if neither
    is set the schema type is SCHEMA_TYPE_NOT_SET.
    """

    # An explicitly supplied schema takes precedence over the embedded one
    effective_schema = schema or definition.schema

    if effective_schema:
        schema_type = effective_schema.schemaType
    else:
        schema_type = _meta.SchemaType.SCHEMA_TYPE_NOT_SET

    return DataSpec(
        _meta.ObjectType.DATA, schema_type, data_item,
        definition, storage, schema)
|
75
90
|
|
76
91
|
@staticmethod
def create_file_spec(
        data_item: str,
        definition: _meta.FileDefinition,
        storage: _meta.StorageDefinition) -> "DataSpec":

    """
    Build a DataSpec of object type FILE for the given data item.

    Files carry no schema, so the schema type is always SCHEMA_TYPE_NOT_SET.
    """

    object_type = _meta.ObjectType.FILE
    schema_type = _meta.SchemaType.SCHEMA_TYPE_NOT_SET

    return DataSpec(object_type, schema_type, data_item, definition, storage)
|
88
100
|
|
89
101
|
@staticmethod
def create_empty_spec(object_type: _meta.ObjectType, schema_type: _meta.SchemaType):

    """
    Build a placeholder DataSpec with an empty data item and no definition,
    storage or schema attached.
    """

    empty_item = ""

    return DataSpec(object_type, schema_type, empty_item, None, None, None)  # noqa
|
104
|
+
|
105
|
+
def with_ids(
        self, primary_id: _meta.TagHeader,
        storage_id: _meta.TagHeader,
        schema_id: tp.Optional[_meta.TagHeader] = None,
        context_key: tp.Optional[str] = None):

    """
    Return a new DataSpec with the same content as this one and the supplied
    object IDs and context key attached. This spec is not modified.
    """

    return DataSpec(
        object_type=self.object_type,
        schema_type=self.schema_type,
        data_item=self.data_item,
        definition=self.definition,
        storage=self.storage,
        schema=self.schema,
        primary_id=primary_id,
        storage_id=storage_id,
        schema_id=schema_id,
        context_key=context_key)
|
92
115
|
|
93
116
|
def is_empty(self):

    """Return True when this spec has no data item (None or zero length)."""

    item = self.data_item

    if item is None:
        return True

    return len(item) == 0
|
95
118
|
|
96
119
|
|
97
|
-
|
98
|
-
|
120
|
+
class StorageLayout:

    """
    Interface for storage layouts, plus a cached factory for selecting one.

    A layout builds DataSpec objects (data item, definition and storage
    metadata) for new data / file objects and for new versions of existing ones.

    NOTE(review): the methods below are marked with abc.abstractmethod, but this
    class does not declare an ABC base, so the markers are not enforced at
    instantiation time - confirm whether that is intended.
    """

    # One shared instance per layout key, created on first use by select()
    __LAYOUTS: "tp.Dict[_meta.StorageLayout, StorageLayout]" = dict()

    @classmethod
    def select(cls, layout_key: _meta.StorageLayout) -> "StorageLayout":

        """
        Return the layout instance for a layout key, creating and caching it on first use.

        A missing or zero-valued key is treated as OBJECT_ID_LAYOUT.
        Raises ETracInternal if the key does not match a known layout.
        """

        # Legacy compatibility - layout key not set in storage definition
        if not layout_key or layout_key.value == 0:
            layout_key = _meta.StorageLayout.OBJECT_ID_LAYOUT

        cached = cls.__LAYOUTS.get(layout_key)

        if cached is not None:
            return cached

        factories = {
            _meta.StorageLayout.OBJECT_ID_LAYOUT: ObjectIdLayout,
            _meta.StorageLayout.DEVELOPER_LAYOUT: DevelopmentLayout
        }

        factory = factories.get(layout_key)

        if factory is None:
            raise _ex.ETracInternal(f"Unknown storage layout [{layout_key.name}]")

        created = factory()
        cls.__LAYOUTS[layout_key] = created

        return created

    @abc.abstractmethod
    def layout_key(self) -> _meta.StorageLayout:
        """Return the metadata key that identifies this layout."""
        pass

    @abc.abstractmethod
    def new_data_spec(
            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
            storage_config: _cfg.StorageConfig) -> DataSpec:
        """Build the spec for the first version of a new data object."""
        pass

    @abc.abstractmethod
    def new_data_version(
            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
            prior_spec: DataSpec) -> DataSpec:
        """Build the spec for a new version of an existing data object."""
        pass

    @abc.abstractmethod
    def new_file_spec(
            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: str, file_type: _meta.FileType,
            storage_config: _cfg.StorageConfig) -> DataSpec:
        """Build the spec for the first version of a new file object."""
        pass

    @abc.abstractmethod
    def new_file_version(
            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: str, file_type: _meta.FileType,
            prior_spec: DataSpec) -> DataSpec:
        """Build the spec for a new version of an existing file object."""
        pass
|
178
|
+
|
179
|
+
|
180
|
+
class BaseLayout(StorageLayout):

    """
    Shared implementation of the spec-building methods of StorageLayout.

    Subclasses supply layout_key() and the two path-building hooks
    (_data_storage_path, _file_storage_path); this class assembles the
    data / file definitions and storage metadata around those paths.
    """

    __DATA_ITEM_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}"
    __FILE_ITEM_TEMPLATE = "file/{}/version-{}"

    @abc.abstractmethod
    def _data_storage_path(
            self, data_id: _meta.TagHeader, context_key: str, trac_schema: _meta.SchemaDefinition,
            part_key: _meta.PartKey, snap_index: int, delta_index: int, storage_format: str,
            prior_copy: tp.Optional[_meta.StorageCopy]):
        """Return the physical storage path for a data delta (prior_copy is None for a new object)."""
        pass

    @abc.abstractmethod
    def _file_storage_path(
            self, file_id: _meta.TagHeader, file_def: _meta.FileDefinition,
            prior_copy: tp.Optional[_meta.StorageCopy]):
        """Return the physical storage path for a file version (prior_copy is None for a new object)."""
        pass

    def new_data_spec(
            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
            storage_config: _cfg.StorageConfig) -> DataSpec:

        """
        Build the spec for the first version of a data object.

        The object gets a single root part with snap 0 / delta 0. Storage location
        and format come from the storage config defaults, except that STRUCT
        schemas are always stored as JSON.
        """

        part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
        snap_index = 0

        data_item = self.__DATA_ITEM_TEMPLATE.format(
            trac_schema.schemaType.name.lower(), data_id.objectId,
            part_key.opaqueKey, snap_index, 0)

        # Blank data definition with no parts
        new_data_def = _meta.DataDefinition(
            schema=trac_schema,
            storageId=_util.selector_for_latest(storage_id))

        data_def = self._add_new_snap(new_data_def, data_item, part_key, snap_index)

        # Take default location from the storage config
        storage_key = storage_config.defaultBucket
        storage_format = "JSON" if trac_schema.schemaType == _meta.SchemaType.STRUCT else storage_config.defaultFormat
        storage_path = self._data_storage_path(
            data_id, context_key, trac_schema, part_key, snap_index, 0,
            storage_format, prior_copy=None)

        storage_copy = _meta.StorageCopy(
            storageKey=storage_key,
            storagePath=storage_path,
            storageFormat=storage_format,
            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
            copyTimestamp=data_id.objectTimestamp)

        # Record the layout on the storage definition (as new_file_spec does), so that
        # later versions are built with the same layout instead of the legacy default
        new_storage_def = _meta.StorageDefinition()
        new_storage_def.layout = self.layout_key()

        storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)

        # Dynamic data def will always use an embedded schema (there is no ID for an external schema)

        return DataSpec \
            .create_data_spec(data_item, data_def, storage_def, schema=None) \
            .with_ids(data_id, storage_id)

    def new_data_version(
            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
            prior_spec: DataSpec) -> DataSpec:

        """
        Build the spec for a new version of an existing data object.

        The new snap replaces the root part in a copy of the prior definition.
        Storage key and format are inherited from the prior storage copy.
        Raises ETracInternal if no available prior copy can be found.
        """

        part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
        snap_index = prior_spec.primary_id.objectVersion  # snap index is zero-based

        data_item = self.__DATA_ITEM_TEMPLATE.format(
            trac_schema.schemaType.name.lower(), data_id.objectId,
            part_key.opaqueKey, snap_index, 0)

        data_def = self._add_new_snap(prior_spec.definition, data_item, part_key, snap_index)

        # NOTE(review): this takes the first data item in the prior storage definition,
        # which is not necessarily the item of the prior version - confirm this is intended
        prior_item = next(iter(prior_spec.storage.dataItems.keys()), None)
        prior_copy = self._find_storage_copy(prior_item, prior_spec.storage)

        if prior_copy is None:
            raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(data_id)}]")

        storage_key = prior_copy.storageKey
        storage_format = prior_copy.storageFormat
        storage_path = self._data_storage_path(
            data_id, context_key, trac_schema, part_key, snap_index, 0,
            storage_format, prior_copy)

        storage_copy = _meta.StorageCopy(
            storageKey=storage_key,
            storagePath=storage_path,
            storageFormat=storage_format,
            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
            copyTimestamp=data_id.objectTimestamp)

        storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)

        return DataSpec \
            .create_data_spec(data_item, data_def, storage_def, schema=None) \
            .with_ids(data_id, storage_id)

    def new_file_spec(
            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: str, file_type: _meta.FileType,
            storage_config: _cfg.StorageConfig) -> DataSpec:

        """
        Build the spec for the first version of a file object.

        The file is named after the context key and file type extension, stored in
        the default bucket from the storage config, with the MIME type as its
        storage format. The layout key is recorded on the new storage definition.
        """

        data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)

        file_def = _meta.FileDefinition(
            name=f"{context_key}.{file_type.extension}",
            extension=file_type.extension,
            mimeType=file_type.mimeType,
            dataItem=data_item,
            storageId=_util.selector_for_latest(storage_id),
            size=0)

        storage_key = storage_config.defaultBucket
        storage_format = file_def.mimeType
        storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)

        storage_copy = _meta.StorageCopy(
            storageKey=storage_key,
            storagePath=storage_path,
            storageFormat=storage_format,
            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
            copyTimestamp=file_id.objectTimestamp)

        new_storage_def = _meta.StorageDefinition()
        new_storage_def.layout = self.layout_key()

        storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)

        return DataSpec \
            .create_file_spec(data_item, file_def, storage_def) \
            .with_ids(file_id, storage_id)

    def new_file_version(
            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
            context_key: str, file_type: _meta.FileType, prior_spec: DataSpec) -> DataSpec:

        """
        Build the spec for a new version of an existing file object.

        The storage key is inherited from the prior storage copy, which is also
        handed to the path hook so the layout can place the new version relative
        to the previous one. Raises ETracInternal if no available prior copy exists.
        """

        data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)

        file_def = _meta.FileDefinition(
            name=f"{context_key}.{file_type.extension}",
            extension=file_type.extension,
            mimeType=file_type.mimeType,
            dataItem=data_item,
            storageId=_util.selector_for_latest(storage_id),
            size=0)

        prior_copy = self._find_storage_copy(prior_spec.definition.dataItem, prior_spec.storage)

        if prior_copy is None:
            raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(file_id)}]")

        storage_key = prior_copy.storageKey
        storage_format = file_def.mimeType

        # Pass the prior copy through (as new_data_version does) rather than discarding it
        storage_path = self._file_storage_path(file_id, file_def, prior_copy=prior_copy)

        storage_copy = _meta.StorageCopy(
            storageKey=storage_key,
            storagePath=storage_path,
            storageFormat=storage_format,
            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
            copyTimestamp=file_id.objectTimestamp)

        storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)

        return DataSpec \
            .create_file_spec(data_item, file_def, storage_def) \
            .with_ids(file_id, storage_id)

    @classmethod
    def _add_new_snap(
            cls, data_def: _meta.DataDefinition, data_item: str,
            part_key: _meta.PartKey, snap_index: int):

        """
        Return a copy of data_def in which the given part holds a new snap
        with a single delta (index 0) pointing at data_item.

        The input definition and its parts dict are not modified.
        """

        delta = _meta.DataDefinition.Delta(
            deltaIndex=0,
            dataItem=data_item)

        snap = _meta.DataDefinition.Snap(
            snapIndex=snap_index,
            deltas=[delta])

        part = _meta.DataDefinition.Part(
            partKey=part_key,
            snap=snap)

        # Shallow-copy the definition and its parts dict so the caller's objects are untouched
        data_def = copy.copy(data_def)
        data_def.parts = copy.copy(data_def.parts)
        data_def.parts[part_key.opaqueKey] = part

        return data_def

    @classmethod
    def _add_storage_copy(cls, storage_def: _meta.StorageDefinition, data_item: str, storage_copy: _meta.StorageCopy):

        """
        Return a copy of storage_def with a new storage item for data_item,
        holding one available incarnation (index 0) that contains storage_copy.

        The input definition and its dataItems dict are not modified.
        """

        new_incarnation = _meta.StorageIncarnation(
            copies=[storage_copy],
            incarnationIndex=0,
            incarnationTimestamp=storage_copy.copyTimestamp,
            incarnationStatus=_meta.IncarnationStatus.INCARNATION_AVAILABLE)

        new_item = _meta.StorageItem(incarnations=[new_incarnation])

        # Shallow-copy the definition and its items dict so the caller's objects are untouched
        storage_def = copy.copy(storage_def)
        storage_def.dataItems = copy.copy(storage_def.dataItems)
        storage_def.dataItems[data_item] = new_item

        return storage_def

    @classmethod
    def _find_storage_copy(cls, data_item: str, storage_def: _meta.StorageDefinition) -> tp.Optional[_meta.StorageCopy]:

        """
        Find an available storage copy for data_item, or None if there is none.

        Looks at the last incarnation marked available and returns its first
        copy marked available.
        """

        if data_item is None:
            return None

        storage_item = storage_def.dataItems.get(data_item)

        if storage_item is None:
            return None

        # Latest available incarnation
        incarnation = next(filter(
            lambda i: i.incarnationStatus == _meta.IncarnationStatus.INCARNATION_AVAILABLE,
            reversed(storage_item.incarnations)), None)

        if incarnation is None:
            return None

        # Use any available copy (currently there is no location preference)
        return next(filter(
            lambda c: c.copyStatus == _meta.CopyStatus.COPY_AVAILABLE,
            incarnation.copies), None)
|
410
|
+
|
411
|
+
|
412
|
+
class ObjectIdLayout(BaseLayout):

    """
    Layout that derives storage paths from object IDs and versions.

    Every path ends in a random six-digit hex suffix, so each call produces
    a different physical location for the same logical item.
    """

    __DATA_STORAGE_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}-x{:0>6x}"
    __FILE_STORAGE_TEMPLATE = "file/{}/version-{:d}-x{:0>6x}/{}.{}"

    # Exclusive upper bound for the random suffix, so it always fits in six hex digits
    __SUFFIX_RANGE = 1 << 24

    def __init__(self):
        self.__random = random.Random()
        self.__random.seed()

    def layout_key(self) -> _meta.StorageLayout:
        """Return the metadata key for this layout (OBJECT_ID_LAYOUT)."""
        return _meta.StorageLayout.OBJECT_ID_LAYOUT

    def _data_storage_path(
            self, data_id, context_key, trac_schema,
            part_key, snap_index, delta_index,
            storage_format, prior_copy):

        """
        Build the storage path for a data delta from the schema type, object ID,
        part key, snap / delta index and a random suffix.

        context_key, storage_format and prior_copy are not used by this layout.
        """

        schema_type = trac_schema.schemaType.name.lower()

        # randrange excludes the upper bound (randint would include 0x1000000, which needs 7 digits)
        version_suffix = self.__random.randrange(self.__SUFFIX_RANGE)

        return self.__DATA_STORAGE_TEMPLATE.format(
            schema_type, data_id.objectId,
            part_key.opaqueKey, snap_index, delta_index,
            version_suffix)

    def _file_storage_path(self, file_id, file_def, prior_copy):

        """
        Build the storage path for a file version from the object ID, version,
        a random suffix, the file name and the lower-cased extension.

        prior_copy is not used by this layout.
        """

        # randrange excludes the upper bound (randint would include 0x1000000, which needs 7 digits)
        version_suffix = self.__random.randrange(self.__SUFFIX_RANGE)

        return self.__FILE_STORAGE_TEMPLATE.format(
            file_id.objectId, file_id.objectVersion, version_suffix,
            file_def.name, file_def.extension.lower())
|
444
|
+
|
445
|
+
|
446
|
+
class DevelopmentLayout(BaseLayout):

    """
    Layout that produces short, human-readable storage paths.

    New objects are placed in a default output directory and named after
    the context key; new versions are placed next to the prior copy and
    get a "-<version>" suffix.
    """

    __DEFAULT_OUTPUT_DIR = "Dev Outputs"

    __DATA_STORAGE_PATH = "{}/{}{}.{}"
    __FILE_STORAGE_PATH = "{}/{}{}.{}"

    def layout_key(self) -> _meta.StorageLayout:
        """Return the metadata key for this layout (DEVELOPER_LAYOUT)."""
        return _meta.StorageLayout.DEVELOPER_LAYOUT

    def _data_storage_path(
            self, data_id, context_key, trac_schema,
            part_key, snap_index, delta_index,
            storage_format, prior_copy):

        """
        Build the storage path for a data object as "<dir>/<name><suffix>.<format>".

        The name is the context key for a new object, or the stem of the prior
        copy's file name for a new version. The suffix is "-<version>" for
        versions above 1. The extension is the lower-cased storage format.
        """

        storage_dir = self._dev_storage_dir(prior_copy)
        suffix = f"-{data_id.objectVersion}" if data_id.objectVersion > 1 else ""

        if prior_copy is not None:
            prior_path = pathlib.Path(prior_copy.storagePath)
            file_name = prior_path.stem
            # From version 3 onwards, drop what is presumably the prior version suffix
            if data_id.objectVersion > 2 and "-" in file_name:
                file_name = file_name[:file_name.rfind("-")]
        else:
            file_name = context_key

        return self.__DATA_STORAGE_PATH.format(storage_dir, file_name, suffix, storage_format.lower())

    def _file_storage_path(self, file_id, file_def, prior_copy):

        """
        Build the storage path for a file as "<dir>/<name><suffix>.<extension>".

        NOTE(review): BaseLayout builds file_def.name with the extension already
        appended, so the extension appears twice in this path - confirm intended.
        """

        storage_dir = self._dev_storage_dir(prior_copy)
        suffix = f"-{file_id.objectVersion}" if file_id.objectVersion > 1 else ""

        return self.__FILE_STORAGE_PATH.format(storage_dir, file_def.name, suffix, file_def.extension.lower())

    def _dev_storage_dir(self, prior_copy: tp.Optional[_meta.StorageCopy]) -> str:

        """
        Return the directory to write into, always as a "/"-separated string.

        This is the parent directory of the prior copy when it has one, otherwise
        the default output directory.
        """

        if prior_copy is None:
            return self.__DEFAULT_OUTPUT_DIR

        prior_path = pathlib.Path(prior_copy.storagePath)

        if len(prior_path.parts) > 1:
            # as_posix() keeps the separator consistent with the "/" in the path
            # templates on every OS, and makes both branches return a str
            return prior_path.parent.as_posix()
        else:
            return self.__DEFAULT_OUTPUT_DIR
|
492
|
+
|
493
|
+
|
494
|
+
def build_data_spec(
        data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
        context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
        storage_config: _cfg.StorageConfig,
        prior_spec: tp.Optional[DataSpec] = None) \
        -> DataSpec:

    """
    Build the DataSpec for a data object using the appropriate storage layout.

    With a prior spec, the layout recorded in the prior storage definition is
    used to build a new version. Without one, the default layout from the
    storage config is used to build a brand new spec.
    """

    # New version of an existing object: follow the layout of the prior version
    if prior_spec is not None:
        version_layout = StorageLayout.select(prior_spec.storage.layout)
        return version_layout.new_data_version(data_id, storage_id, context_key, trac_schema, prior_spec)

    # New object: use the configured default layout
    default_layout = StorageLayout.select(storage_config.defaultLayout)
    return default_layout.new_data_spec(data_id, storage_id, context_key, trac_schema, storage_config)
|
510
|
+
|
511
|
+
|
512
|
+
def build_file_spec(
        file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
        context_key: tp.Optional[str], file_type: _meta.FileType,
        storage_config: _cfg.StorageConfig,
        prior_spec: tp.Optional[DataSpec] = None) \
        -> DataSpec:

    """
    Build the DataSpec for a file object using the appropriate storage layout.

    With a prior spec, the layout recorded in the prior storage definition is
    used to build a new version. Without one, the default layout from the
    storage config is used to build a brand new spec.
    """

    # New version of an existing object: follow the layout of the prior version
    if prior_spec is not None:
        version_layout = StorageLayout.select(prior_spec.storage.layout)
        return version_layout.new_file_version(file_id, storage_id, context_key, file_type, prior_spec)

    # New object: use the configured default layout
    default_layout = StorageLayout.select(storage_config.defaultLayout)
    return default_layout.new_file_spec(file_id, storage_id, context_key, file_type, storage_config)
|
105
528
|
|
106
529
|
|
107
530
|
@dc.dataclass(frozen=True)
|
@@ -184,6 +607,11 @@ class DataView:
|
|
184
607
|
else:
|
185
608
|
return DataView(_meta.ObjectType.DATA, trac_schema, parts = dict())
|
186
609
|
|
610
|
+
@staticmethod
def for_arrow_schema(arrow_schema: pa.Schema):

    """
    Create an empty data view (no parts) for an Arrow schema.

    The TRAC schema of the view is derived from the Arrow schema.
    """

    derived_schema = DataMapping.arrow_to_trac_schema(arrow_schema)
    no_parts = dict()

    return DataView(_meta.ObjectType.DATA, derived_schema, arrow_schema, no_parts)
|
614
|
+
|
187
615
|
@staticmethod
|
188
616
|
def for_file_item(file_item: DataItem):
|
189
617
|
return DataView(file_item.object_type, file_item=file_item)
|
@@ -259,8 +687,17 @@ class DataMapping:
|
|
259
687
|
pa.date64(): _meta.BasicType.DATE
|
260
688
|
}
|
261
689
|
|
262
|
-
|
263
|
-
|
690
|
+
# For now, categorical handling is disabled by default and enabled by this setting
|
691
|
+
# The default will change to "true" for the 0.9 release
|
692
|
+
CATEGORICAL_CONFIG_KEY = "trac.runtime.categorical"
|
693
|
+
__categorical_enabled = False
|
694
|
+
|
695
|
+
@classmethod
def enable_categorical(cls, enabled: bool):
    """
    Switch categorical (dictionary-encoded) handling on or off.

    The flag is stored on the class, so it applies to every later use of this
    class. trac_to_arrow_field() reads it to decide whether a categorical
    TRAC field is mapped to an Arrow dictionary type.
    """
    cls.__categorical_enabled = enabled
|
698
|
+
|
699
|
+
@classmethod
|
700
|
+
def arrow_to_python_type(cls, arrow_type: pa.DataType) -> type:
|
264
701
|
|
265
702
|
if pa.types.is_boolean(arrow_type):
|
266
703
|
return bool
|
@@ -283,6 +720,11 @@ class DataMapping:
|
|
283
720
|
if pa.types.is_timestamp(arrow_type):
|
284
721
|
return dt.datetime
|
285
722
|
|
723
|
+
# The python type for a dictionary-encoded field is its value type
|
724
|
+
if pa.types.is_dictionary(arrow_type):
|
725
|
+
if isinstance(arrow_type, pa.DictionaryType):
|
726
|
+
return cls.arrow_to_python_type(arrow_type.value_type)
|
727
|
+
|
286
728
|
raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")
|
287
729
|
|
288
730
|
@classmethod
|
@@ -340,7 +782,13 @@ class DataMapping:
|
|
340
782
|
def trac_to_arrow_field(cls, trac_field: _meta.FieldSchema):
|
341
783
|
|
342
784
|
arrow_type = cls.trac_to_arrow_basic_type(trac_field.fieldType)
|
343
|
-
|
785
|
+
|
786
|
+
# Categorical data uses an unordered dictionary with int32 index, ordered encoding not (currently) supported
|
787
|
+
# For legacy compatibility, only use dictionary encoding if the categorical feature is enabled
|
788
|
+
if trac_field.categorical and cls.__categorical_enabled:
|
789
|
+
arrow_type = pa.dictionary(pa.int32(), arrow_type, False)
|
790
|
+
|
791
|
+
nullable = not (trac_field.notNull or trac_field.businessKey)
|
344
792
|
|
345
793
|
return pa.field(trac_field.fieldName, arrow_type, nullable)
|
346
794
|
|
@@ -369,12 +817,15 @@ class DataMapping:
|
|
369
817
|
field_type = cls.arrow_to_trac_type(field.type)
|
370
818
|
label = field.metadata["label"] if field.metadata and "label" in field.metadata else field.name
|
371
819
|
|
820
|
+
# When converting Arrow -> TRAC, always set the categorical flag for dictionary encoded fields
|
821
|
+
# This affects dynamic imports and is informational only (physical layout is controlled by Arrow schemas)
|
822
|
+
|
372
823
|
return _meta.FieldSchema(
|
373
824
|
field.name, field_index, field_type,
|
374
825
|
label=label,
|
375
826
|
businessKey=False,
|
376
827
|
notNull=not field.nullable,
|
377
|
-
categorical=
|
828
|
+
categorical=pa.types.is_dictionary(field.type))
|
378
829
|
|
379
830
|
@classmethod
|
380
831
|
def arrow_to_trac_type(cls, arrow_type: pa.DataType) -> _meta.BasicType:
|
@@ -390,6 +841,11 @@ class DataMapping:
|
|
390
841
|
if pa.types.is_timestamp(arrow_type):
|
391
842
|
return _meta.BasicType.DATETIME
|
392
843
|
|
844
|
+
# The basic type for a dictionary-encoded field is its value type
|
845
|
+
if pa.types.is_dictionary(arrow_type):
|
846
|
+
if isinstance(arrow_type, pa.DictionaryType):
|
847
|
+
return cls.arrow_to_trac_type(arrow_type.value_type)
|
848
|
+
|
393
849
|
raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
|
394
850
|
|
395
851
|
@classmethod
|
@@ -766,6 +1222,10 @@ class DataConformance:
|
|
766
1222
|
"Field [{field_name}] cannot be converted from {vector_type} to {field_type}, " + \
|
767
1223
|
"source and target have different time zones"
|
768
1224
|
|
1225
|
+
# Error template for a categorical (dictionary) field whose value type cannot be converted.
# The first literal ends with a space so the two halves do not run together in the message.
__E_WRONG_CATEGORICAL_TYPE = \
    "Field [{field_name}] categorical types do not match " + \
    "(expected {field_type}, got {vector_type})"
|
1228
|
+
|
769
1229
|
@classmethod
|
770
1230
|
def column_filter(cls, columns: tp.List[str], schema: tp.Optional[pa.Schema]) -> tp.Optional[tp.List[str]]:
|
771
1231
|
|
@@ -918,13 +1378,19 @@ class DataConformance:
|
|
918
1378
|
@classmethod
|
919
1379
|
def _coerce_vector(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
|
920
1380
|
|
1381
|
+
# Handle null vector
|
921
1382
|
if pa.types.is_null(vector.type):
|
922
|
-
|
923
1383
|
if field.nullable:
|
924
1384
|
return pa.nulls(size=len(vector), type=field.type)
|
925
1385
|
else:
|
926
1386
|
raise _ex.EDataConformance(f"All null values in non-null field [{field.name}]")
|
927
1387
|
|
1388
|
+
# If the vector is dict-encoded but the expected result is not, decode the dictionary
|
1389
|
+
if pa.types.is_dictionary(vector.type) and not pa.types.is_dictionary(field.type):
|
1390
|
+
if isinstance(vector, pa.DictionaryArray):
|
1391
|
+
dict_vector: pa.DictionaryArray = vector
|
1392
|
+
vector = dict_vector.dictionary_decode()
|
1393
|
+
|
928
1394
|
if pa.types.is_boolean(field.type):
|
929
1395
|
return cls._coerce_boolean(vector, field)
|
930
1396
|
|
@@ -946,6 +1412,9 @@ class DataConformance:
|
|
946
1412
|
if pa.types.is_timestamp(field.type):
|
947
1413
|
return cls._coerce_timestamp(vector, field)
|
948
1414
|
|
1415
|
+
if pa.types.is_dictionary(field.type):
|
1416
|
+
return cls._coerce_dictionary(vector, field)
|
1417
|
+
|
949
1418
|
error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
|
950
1419
|
cls.__log.error(error_message)
|
951
1420
|
raise _ex.EDataConformance(error_message)
|
@@ -1188,6 +1657,82 @@ class DataConformance:
|
|
1188
1657
|
|
1189
1658
|
return pc.cast(scaled_vector, field.type)
|
1190
1659
|
|
1660
|
+
@classmethod
def _coerce_dictionary(cls, vector: pa.Array, field: pa.Field):

    """
    Coerce a vector to the dictionary (categorical) type of the target field.

    Handles three kinds of input:
    - a dictionary vector with a different dictionary type: cast directly when the
      value types match, otherwise coerce the dictionary values and rebuild
    - a plain vector of the dictionary value type: dictionary-encode it
    - any other vector: coerce to the value type first, then dictionary-encode

    Raises EDataConformance if the conversion is not allowed or fails, and
    EUnexpected if the field type is not actually a dictionary type.
    """

    try:

        if not isinstance(field.type, pa.DictionaryType):
            raise _ex.EUnexpected()

        field_type: pa.DictionaryType = field.type

        # Supplied vector is a dictionary (but the dictionary type is not an exact match)
        if pa.types.is_dictionary(vector.type):

            if not isinstance(vector.type, pa.DictionaryType):
                raise _ex.EUnexpected()

            vector_type: pa.DictionaryType = vector.type

            # Do not allow coercion to a smaller index type or from unordered to ordered
            if (vector_type.index_type.bit_width > field_type.index_type.bit_width) or \
                    (field_type.ordered and not vector_type.ordered):

                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

            # Value types are the same - basic cast should succeed
            if vector_type.value_type == field_type.value_type:
                return pc.cast(vector, field.type)

            # Value types differ - try to coerce the underlying dictionary
            elif isinstance(vector, pa.DictionaryArray):
                try:
                    values_field = pa.field(field.name, field_type.value_type, field.nullable)
                    values_vector = cls._coerce_vector(vector.dictionary, values_field)
                    dict_vector = pa.DictionaryArray.from_arrays(vector.indices, values_vector, ordered=field_type.ordered)  # noqa
                    return pc.cast(dict_vector, field.type)
                # Handle errors converting the value type
                except _ex.EDataConformance as e:
                    error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
                    cls.__log.error(error_message)
                    raise _ex.EDataConformance(error_message) from e

            # Special handling for chunked dictionaries
            elif isinstance(vector, pa.ChunkedArray):
                chunks = [cls._coerce_dictionary(chunk, field) for chunk in vector.chunks]
                # Pass the type explicitly: with zero chunks there is nothing to infer it from
                return pa.chunked_array(chunks, type=field.type)

            # Vector type not recognized, coercion is not possible
            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        # Supplied vector matches the dictionary value type - perform dictionary encoding
        elif vector.type == field_type.value_type and not field_type.ordered:
            return vector.dictionary_encode().cast(field.type)

        # Fallback option - try to coerce the value type first, then perform dictionary encoding
        else:
            try:
                values_field = pa.field(field.name, field_type.value_type, field.nullable)
                values_vector = cls._coerce_vector(vector, values_field)
                return values_vector.dictionary_encode().cast(field.type)
            # Handle errors converting the value type
            except _ex.EDataConformance as e:
                error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message) from e

    except pa.ArrowInvalid as e:

        error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message) from e
|
1735
|
+
|
1191
1736
|
@classmethod
|
1192
1737
|
def _format_error(cls, error_template: str, vector: pa.Array, field: pa.Field, e: Exception = None):
|
1193
1738
|
|