tracdap-runtime 0.8.0rc2__py3-none-any.whl → 0.9.0b2__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (64)
  1. tracdap/rt/_impl/core/config_parser.py +29 -3
  2. tracdap/rt/_impl/core/data.py +627 -40
  3. tracdap/rt/_impl/core/repos.py +17 -8
  4. tracdap/rt/_impl/core/storage.py +25 -13
  5. tracdap/rt/_impl/core/struct.py +254 -60
  6. tracdap/rt/_impl/core/util.py +125 -11
  7. tracdap/rt/_impl/exec/context.py +35 -8
  8. tracdap/rt/_impl/exec/dev_mode.py +169 -127
  9. tracdap/rt/_impl/exec/engine.py +203 -140
  10. tracdap/rt/_impl/exec/functions.py +228 -263
  11. tracdap/rt/_impl/exec/graph.py +141 -126
  12. tracdap/rt/_impl/exec/graph_builder.py +428 -449
  13. tracdap/rt/_impl/grpc/codec.py +8 -13
  14. tracdap/rt/_impl/grpc/server.py +7 -7
  15. tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.py +25 -18
  16. tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.pyi +27 -9
  17. tracdap/rt/_impl/grpc/tracdap/metadata/common_pb2.py +1 -1
  18. tracdap/rt/_impl/grpc/tracdap/metadata/config_pb2.py +1 -1
  19. tracdap/rt/_impl/grpc/tracdap/metadata/custom_pb2.py +1 -1
  20. tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +37 -35
  21. tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.pyi +37 -43
  22. tracdap/rt/_impl/grpc/tracdap/metadata/file_pb2.py +1 -1
  23. tracdap/rt/_impl/grpc/tracdap/metadata/flow_pb2.py +1 -1
  24. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +67 -63
  25. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +11 -2
  26. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +1 -1
  27. tracdap/rt/_impl/grpc/tracdap/metadata/object_id_pb2.py +1 -1
  28. tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.py +1 -1
  29. tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +1 -1
  30. tracdap/rt/_impl/grpc/tracdap/metadata/search_pb2.py +1 -1
  31. tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.py +11 -9
  32. tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.pyi +11 -2
  33. tracdap/rt/_impl/grpc/tracdap/metadata/tag_pb2.py +1 -1
  34. tracdap/rt/_impl/grpc/tracdap/metadata/tag_update_pb2.py +1 -1
  35. tracdap/rt/_impl/grpc/tracdap/metadata/type_pb2.py +23 -19
  36. tracdap/rt/_impl/grpc/tracdap/metadata/type_pb2.pyi +15 -2
  37. tracdap/rt/_impl/runtime.py +3 -9
  38. tracdap/rt/_impl/static_api.py +5 -6
  39. tracdap/rt/_plugins/format_csv.py +2 -2
  40. tracdap/rt/_plugins/repo_git.py +56 -11
  41. tracdap/rt/_plugins/storage_aws.py +165 -150
  42. tracdap/rt/_plugins/storage_azure.py +17 -11
  43. tracdap/rt/_plugins/storage_gcp.py +35 -18
  44. tracdap/rt/_version.py +1 -1
  45. tracdap/rt/api/model_api.py +45 -0
  46. tracdap/rt/config/__init__.py +7 -9
  47. tracdap/rt/config/common.py +3 -14
  48. tracdap/rt/config/job.py +17 -3
  49. tracdap/rt/config/platform.py +9 -32
  50. tracdap/rt/config/result.py +8 -4
  51. tracdap/rt/config/runtime.py +5 -10
  52. tracdap/rt/config/tenant.py +28 -0
  53. tracdap/rt/launch/cli.py +0 -8
  54. tracdap/rt/launch/launch.py +1 -3
  55. tracdap/rt/metadata/__init__.py +35 -35
  56. tracdap/rt/metadata/data.py +19 -31
  57. tracdap/rt/metadata/job.py +3 -1
  58. tracdap/rt/metadata/storage.py +9 -0
  59. tracdap/rt/metadata/type.py +9 -5
  60. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/METADATA +5 -3
  61. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/RECORD +64 -63
  62. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/WHEEL +1 -1
  63. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/licenses/LICENSE +0 -0
  64. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b2.dist-info}/top_level.txt +0 -0
@@ -16,10 +16,12 @@
 import abc
 import copy
 import dataclasses as dc
+import pathlib
 import typing as tp
 import datetime as dt
 import decimal
 import platform
+import random
 
 import pyarrow as pa
 import pyarrow.compute as pc
@@ -36,8 +38,21 @@ except ModuleNotFoundError:
 
 import tracdap.rt.api.experimental as _api
 import tracdap.rt.metadata as _meta
+import tracdap.rt.config as _cfg
 import tracdap.rt.exceptions as _ex
+import tracdap.rt._impl.core.config_parser as _cfg_p
 import tracdap.rt._impl.core.logging as _log
+import tracdap.rt._impl.core.util as _util
+
+
+@dc.dataclass(frozen=True)
+class DataPartKey:
+
+    @classmethod
+    def for_root(cls) -> "DataPartKey":
+        return DataPartKey(opaque_key='part-root')
+
+    opaque_key: str
 
 
 @dc.dataclass(frozen=True)
@@ -47,61 +62,495 @@ class DataSpec:
     schema_type: _meta.SchemaType
     data_item: str
 
-    data_def: _meta.DataDefinition
-    file_def: _meta.FileDefinition
-    storage_def: _meta.StorageDefinition
-    schema_def: tp.Optional[_meta.SchemaDefinition]
+    definition: tp.Union[_meta.DataDefinition, _meta.FileDefinition]
+    storage: _meta.StorageDefinition
+    schema: tp.Optional[_meta.SchemaDefinition] = None
+
+    primary_id: _meta.TagHeader = None
+    storage_id: _meta.TagHeader = None
+    schema_id: tp.Optional[_meta.TagHeader] = None
+    context_key: tp.Optional[str] = None
+
+    metadata: tp.Optional[_api.RuntimeMetadata] = None
 
     @staticmethod
     def create_data_spec(
             data_item: str,
-            data_def: _meta.DataDefinition,
-            storage_def: _meta.StorageDefinition,
-            schema_def: tp.Optional[_meta.SchemaDefinition] = None) -> "DataSpec":
-
-        if schema_def:
-            schema_type = schema_def.schemaType
-        elif data_def.schema:
-            schema_type = data_def.schema.schemaType
+            definition: _meta.DataDefinition,
+            storage: _meta.StorageDefinition,
+            schema: tp.Optional[_meta.SchemaDefinition] = None) -> "DataSpec":
+
+        if schema:
+            schema_type = schema.schemaType
+        elif definition.schema:
+            schema_type = definition.schema.schemaType
         else:
             schema_type = _meta.SchemaType.SCHEMA_TYPE_NOT_SET
 
         return DataSpec(
             _meta.ObjectType.DATA, schema_type, data_item,
-            data_def,
-            storage_def=storage_def,
-            schema_def=schema_def,
-            file_def=None)
+            definition, storage, schema)
 
     @staticmethod
     def create_file_spec(
             data_item: str,
-            file_def: _meta.FileDefinition,
-            storage_def: _meta.StorageDefinition) -> "DataSpec":
+            definition: _meta.FileDefinition,
+            storage: _meta.StorageDefinition) -> "DataSpec":
 
         return DataSpec(
             _meta.ObjectType.FILE, _meta.SchemaType.SCHEMA_TYPE_NOT_SET, data_item,
-            file_def=file_def,
-            storage_def=storage_def,
-            data_def=None,
-            schema_def=None)
+            definition, storage)
 
     @staticmethod
     def create_empty_spec(object_type: _meta.ObjectType, schema_type: _meta.SchemaType):
-        return DataSpec(object_type, schema_type, None, None, None, None, None)
+        return DataSpec(object_type, schema_type, "", None, None, None)  # noqa
+
+    def with_ids(
+            self, primary_id: _meta.TagHeader,
+            storage_id: _meta.TagHeader,
+            schema_id: tp.Optional[_meta.TagHeader] = None,
+            context_key: tp.Optional[str] = None):
+
+        return dc.replace(self,
+            primary_id=primary_id,
+            storage_id=storage_id,
+            schema_id=schema_id,
+            context_key=context_key)
+
+    def with_metadata(self, metadata: tp.Optional[_api.RuntimeMetadata]):
+        return dc.replace(self, metadata=metadata)
 
     def is_empty(self):
         return self.data_item is None or len(self.data_item) == 0
 
 
-@dc.dataclass(frozen=True)
-class DataPartKey:
+class StorageLayout(metaclass=abc.ABCMeta):
+
+    __LAYOUTS: "tp.Dict[str, StorageLayout]" = dict()
 
     @classmethod
-    def for_root(cls) -> "DataPartKey":
-        return DataPartKey(opaque_key='part_root')
+    def select(cls, layout_key: tp.Union[str, _meta.StorageLayout]) -> "StorageLayout":
 
-    opaque_key: str
+        # Legacy compatibility - layout key not set in storage definition
+        if not layout_key or layout_key == "":
+            layout_key = _meta.StorageLayout.OBJECT_ID_LAYOUT.name
+
+        if isinstance(layout_key, _meta.StorageLayout):
+            layout_key = layout_key.name
+
+        layout = cls.__LAYOUTS.get(layout_key)
+
+        if layout is not None:
+            return layout
+
+        if layout_key == _meta.StorageLayout.OBJECT_ID_LAYOUT.name:
+            layout = ObjectIdLayout()
+        elif layout_key == _meta.StorageLayout.DEVELOPER_LAYOUT.name:
+            layout = DevelopmentLayout()
+        else:
+            raise _ex.ETracInternal(f"Unknown storage layout [{layout_key}]")
+
+        cls.__LAYOUTS[layout_key] = layout
+
+        return layout
+
+    @abc.abstractmethod
+    def layout_key(self) -> _meta.StorageLayout:
+        pass
+
+    @abc.abstractmethod
+    def new_data_spec(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            sys_config: _cfg.RuntimeConfig) -> DataSpec:
+        pass
+
+    @abc.abstractmethod
+    def new_data_version(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            prior_spec: DataSpec) -> DataSpec:
+        pass
+
+    @abc.abstractmethod
+    def new_file_spec(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType,
+            sys_config: _cfg.RuntimeConfig) -> DataSpec:
+        pass
+
+    @abc.abstractmethod
+    def new_file_version(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType,
+            prior_spec: DataSpec) -> DataSpec:
+        pass
+
+
+class BaseLayout(StorageLayout, metaclass=abc.ABCMeta):
+
+    __DATA_ITEM_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}"
+    __FILE_ITEM_TEMPLATE = "file/{}/version-{}"
+
+    @abc.abstractmethod
+    def _data_storage_path(
+            self, data_id: _meta.TagHeader, context_key: str, trac_schema: _meta.SchemaDefinition,
+            part_key: _meta.PartKey, snap_index: int, delta_index: int, storage_format: str,
+            prior_copy: tp.Optional[_meta.StorageCopy]):
+        pass
+
+    @abc.abstractmethod
+    def _file_storage_path(
+            self, file_id: _meta.TagHeader, file_def: _meta.FileDefinition,
+            prior_copy: tp.Optional[_meta.StorageCopy]):
+        pass
+
+    def new_data_spec(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            sys_config: _cfg.RuntimeConfig) -> DataSpec:
+
+        part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
+        snap_index = 0
+
+        data_item = self.__DATA_ITEM_TEMPLATE.format(
+            trac_schema.schemaType.name.lower(), data_id.objectId,
+            part_key.opaqueKey, snap_index, 0)
+
+        # Blank data definition with no parts
+        new_data_def = _meta.DataDefinition(
+            schema=trac_schema,
+            storageId=_util.selector_for_latest(storage_id))
+
+        data_def = self._add_new_snap(new_data_def, data_item, part_key, snap_index)
+
+        # Take default location from the storage config
+        storage_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LOCATION)
+        if trac_schema.schemaType == _meta.SchemaType.STRUCT_SCHEMA:
+            storage_format = "text/json"
+        else:
+            storage_format = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_FORMAT, "text/csv")
+        storage_path = self._data_storage_path(data_id, context_key, trac_schema, part_key, snap_index, 0, storage_format, prior_copy=None)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=data_id.objectTimestamp)
+
+        new_storage_def = _meta.StorageDefinition()
+
+        storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)
+
+        # Dynamic data def will always use an embedded schema (there is no ID for an external schema)
+
+        return DataSpec \
+            .create_data_spec(data_item, data_def, storage_def, schema=None) \
+            .with_ids(data_id, storage_id)
+
+    def new_data_version(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            prior_spec: DataSpec) -> DataSpec:
+
+        part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
+        snap_index = prior_spec.primary_id.objectVersion  # snap index is zero-based
+
+        data_item = self.__DATA_ITEM_TEMPLATE.format(
+            trac_schema.schemaType.name.lower(), data_id.objectId,
+            part_key.opaqueKey, snap_index, 0)
+
+        data_def = self._add_new_snap(prior_spec.definition, data_item, part_key, snap_index)
+
+        prior_item = next(iter(prior_spec.storage.dataItems.keys()), None)
+        prior_copy = self._find_storage_copy(prior_item, prior_spec.storage)
+
+        if prior_copy is None:
+            raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(data_id)}]")
+
+        storage_key = prior_copy.storageKey
+        storage_format = prior_copy.storageFormat
+        storage_path = self._data_storage_path(data_id, context_key, trac_schema, part_key, snap_index, 0, storage_format, prior_copy)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=data_id.objectTimestamp)
+
+        storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)
+
+        return DataSpec \
+            .create_data_spec(data_item, data_def, storage_def, schema=None) \
+            .with_ids(data_id, storage_id)
+
+    def new_file_spec(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType,
+            sys_config: _cfg.RuntimeConfig) -> DataSpec:
+
+        data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)
+
+        file_def = _meta.FileDefinition(
+            name=f"{context_key}.{file_type.extension}",
+            extension=file_type.extension,
+            mimeType=file_type.mimeType,
+            dataItem=data_item,
+            storageId=_util.selector_for_latest(storage_id),
+            size=0)
+
+        storage_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LOCATION)
+        storage_format = file_def.mimeType
+        storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=file_id.objectTimestamp)
+
+        new_storage_def = _meta.StorageDefinition()
+        new_storage_def.layout = self.layout_key()
+
+        storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)
+
+        return DataSpec \
+            .create_file_spec(data_item, file_def, storage_def) \
+            .with_ids(file_id, storage_id)
+
+    def new_file_version(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType, prior_spec: DataSpec) -> DataSpec:
+
+        data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)
+
+        file_def = _meta.FileDefinition(
+            name=f"{context_key}.{file_type.extension}",
+            extension=file_type.extension,
+            mimeType=file_type.mimeType,
+            dataItem=data_item,
+            storageId=_util.selector_for_latest(storage_id),
+            size=0)
+
+        prior_copy = self._find_storage_copy(prior_spec.definition.dataItem, prior_spec.storage)
+
+        if prior_copy is None:
+            raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(file_id)}]")
+
+        storage_key = prior_copy.storageKey
+        storage_format = file_def.mimeType
+        storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=file_id.objectTimestamp)
+
+        storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)
+
+        return DataSpec \
+            .create_file_spec(data_item, file_def, storage_def) \
+            .with_ids(file_id, storage_id)
+
+    @classmethod
+    def _add_new_snap(
363
+ part_key: _meta.PartKey, snap_index: int):
364
+
365
+ delta = _meta.DataDefinition.Delta(
366
+ deltaIndex=0,
367
+ dataItem=data_item)
368
+
369
+ snap = _meta.DataDefinition.Snap(
370
+ snapIndex=snap_index,
371
+ deltas=[delta])
372
+
373
+ part = _meta.DataDefinition.Part(
374
+ partKey=part_key,
375
+ snap=snap)
376
+
377
+ data_def = copy.copy(data_def)
378
+ data_def.parts = copy.copy(data_def.parts)
379
+ data_def.parts[part_key.opaqueKey] = part
380
+
381
+ return data_def
382
+
383
+ @classmethod
384
+ def _add_storage_copy(cls, storage_def: _meta.StorageDefinition, data_item: str, storage_copy: _meta.StorageCopy):
385
+
386
+ new_incarnation = _meta.StorageIncarnation(
387
+ copies=[storage_copy],
388
+ incarnationIndex=0,
389
+ incarnationTimestamp=storage_copy.copyTimestamp,
390
+ incarnationStatus=_meta.IncarnationStatus.INCARNATION_AVAILABLE)
391
+
392
+ new_item = _meta.StorageItem(incarnations=[new_incarnation])
393
+
394
+ storage_def = copy.copy(storage_def)
395
+ storage_def.dataItems = copy.copy(storage_def.dataItems)
396
+ storage_def.dataItems[data_item] = new_item
397
+
398
+ return storage_def
399
+
400
+ @classmethod
401
+ def _find_storage_copy(cls, data_item: str, storage_def: _meta.StorageDefinition) -> tp.Optional[_meta.StorageCopy]:
402
+
403
+ if data_item is None:
404
+ return None
405
+
406
+ storage_item = storage_def.dataItems.get(data_item)
407
+
408
+ if storage_item is None:
409
+ return None
410
+
411
+ # Latest available incarnation
412
+ incarnation = next(filter(
413
+ lambda i: i.incarnationStatus == _meta.IncarnationStatus.INCARNATION_AVAILABLE,
414
+ reversed(storage_item.incarnations)), None)
415
+
416
+ if incarnation is None:
417
+ return None
418
+
419
+ # Use any available copy (currently there is no location preference)
420
+ return next(filter(
421
+ lambda c: c.copyStatus == _meta.CopyStatus.COPY_AVAILABLE,
422
+ incarnation.copies), None)
423
+
424
+
425
+ class ObjectIdLayout(BaseLayout):
426
+
427
+ __DATA_STORAGE_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}-x{:0>6x}"
428
+ __FILE_STORAGE_TEMPLATE = "file/{}/version-{:d}-x{:0>6x}/{}.{}"
429
+
430
+ def __init__(self):
431
+ self.__random = random.Random()
432
+ self.__random.seed()
433
+
434
+ def layout_key(self) -> _meta.StorageLayout:
435
+ return _meta.StorageLayout.OBJECT_ID_LAYOUT
436
+
437
+ def _data_storage_path(
438
+ self, data_id, context_key, trac_schema,
439
+ part_key, snap_index, delta_index,
440
+ storage_format, prior_copy):
441
+
442
+ schema_type = trac_schema.schemaType.name.lower()
443
+ version_suffix = self.__random.randint(0, 1 << 24)
444
+
445
+ base_path = self.__DATA_STORAGE_TEMPLATE.format(
446
+ schema_type, data_id.objectId,
447
+ part_key.opaqueKey, snap_index, delta_index,
448
+ version_suffix)
449
+
450
+ # STRUCT stored as a single file, not directory layout
451
+ if trac_schema.schemaType == _meta.SchemaType.STRUCT_SCHEMA:
452
+ return base_path + ".json"
453
+ else:
454
+ return base_path
455
+
456
+ def _file_storage_path(self, file_id, file_def, prior_copy):
457
+
458
+ version_suffix = self.__random.randint(0, 1 << 24)
459
+
460
+ return self.__FILE_STORAGE_TEMPLATE.format(
461
+ file_id.objectId, file_id.objectVersion, version_suffix,
462
+ file_def.name, file_def.extension.lower())
463
+
464
+
465
+ class DevelopmentLayout(BaseLayout):
466
+
467
+ __DEFAULT_OUTPUT_DIR = "Dev Outputs"
468
+
469
+ __DATA_STORAGE_PATH = "{}/{}{}.{}"
470
+ __FILE_STORAGE_PATH = "{}/{}{}.{}"
471
+
472
+ def layout_key(self) -> _meta.StorageLayout:
473
+ return _meta.StorageLayout.DEVELOPER_LAYOUT
474
+
475
+ def _data_storage_path(
476
+ self, data_id, context_key, trac_schema,
477
+ part_key, snap_index, delta_index,
478
+ storage_format, prior_copy):
479
+
480
+ storage_dir = self._dev_storage_dir(prior_copy)
481
+ suffix = f"-{data_id.objectVersion}" if data_id.objectVersion > 1 else ""
482
+
483
+ if prior_copy is not None:
484
+ prior_path = pathlib.Path(prior_copy.storagePath)
485
+ file_name = prior_path.stem
486
+ if data_id.objectVersion > 2 and "-" in file_name:
487
+ file_name = file_name[:file_name.rfind("-")]
488
+ else:
489
+ file_name = context_key
490
+
491
+ return self.__DATA_STORAGE_PATH.format(storage_dir, file_name, suffix, storage_format.lower())
492
+
493
+ def _file_storage_path(self, file_id, file_def, prior_copy):
494
+
495
+ storage_dir = self._dev_storage_dir(prior_copy)
496
+ suffix = f"-{file_id.objectVersion}" if file_id.objectVersion > 1 else ""
497
+
498
+ return self.__FILE_STORAGE_PATH.format(storage_dir, file_def.name, suffix, file_def.extension.lower())
499
+
500
+ def _dev_storage_dir(self, prior_copy: _meta.StorageCopy):
501
+
502
+ if prior_copy is None:
503
+ return self.__DEFAULT_OUTPUT_DIR
504
+
505
+ prior_path = pathlib.Path(prior_copy.storagePath)
506
+
507
+ if len(prior_path.parts) > 1:
508
+ return prior_path.parent
509
+ else:
510
+ return self.__DEFAULT_OUTPUT_DIR
511
+
512
+
513
+ def build_data_spec(
514
+ data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
515
+ context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
516
+ sys_config: _cfg.RuntimeConfig,
517
+ prior_spec: tp.Optional[DataSpec] = None,
518
+ metadata: tp.Optional[_api.RuntimeMetadata] = None) \
519
+ -> DataSpec:
520
+
521
+ if prior_spec is None:
522
+ layout_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LAYOUT, _cfg_p.ConfigKDefaults.STORAGE_DEFAULT_LAYOUT)
523
+ layout = StorageLayout.select(layout_key)
524
+ spec = layout.new_data_spec(data_id, storage_id, context_key, trac_schema, sys_config)
525
+ else:
526
+ layout_key = prior_spec.storage.layout
527
+ layout = StorageLayout.select(layout_key)
528
+ spec = layout.new_data_version(data_id, storage_id, context_key, trac_schema, prior_spec)
529
+
530
+ # Attach metadata if it is available
531
+ return spec.with_metadata(metadata) if metadata is not None else spec
532
+
533
+
534
+ def build_file_spec(
535
+ file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
536
+ context_key: tp.Optional[str], file_type: _meta.FileType,
537
+ sys_config: _cfg.RuntimeConfig,
538
+ prior_spec: tp.Optional[DataSpec] = None,
539
+ metadata: tp.Optional[_api.RuntimeMetadata] = None) \
540
+ -> DataSpec:
541
+
542
+ if prior_spec is None:
543
+ layout_key = _util.read_property(sys_config.properties, _cfg_p.ConfigKeys.STORAGE_DEFAULT_LAYOUT, _cfg_p.ConfigKDefaults.STORAGE_DEFAULT_LAYOUT)
544
+ layout = StorageLayout.select(layout_key)
545
+ spec = layout.new_file_spec(file_id, storage_id, context_key, file_type, sys_config)
546
+
547
+ else:
548
+ layout_key = prior_spec.storage.layout
549
+ layout = StorageLayout.select(layout_key)
550
+ spec = layout.new_file_version(file_id, storage_id, context_key, file_type, prior_spec)
551
+
552
+ # Attach metadata if it is available
553
+ return spec.with_metadata(metadata) if metadata is not None else spec
105
554
 
106
555
 
107
556
  @dc.dataclass(frozen=True)
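
The StorageLayout registry added above resolves a layout implementation from either the metadata StorageLayout enum or its string name, caches one instance per key, and falls back to the object ID layout for legacy storage definitions that predate the layout field. A minimal sketch of that behaviour, using the internal API exactly as it appears in this diff (not a supported public interface):

import tracdap.rt.metadata as meta
from tracdap.rt._impl.core.data import StorageLayout

# Selection by enum or by name yields the same cached instance
layout = StorageLayout.select(meta.StorageLayout.OBJECT_ID_LAYOUT)
assert layout is StorageLayout.select("OBJECT_ID_LAYOUT")
assert layout.layout_key() == meta.StorageLayout.OBJECT_ID_LAYOUT

# An empty layout key (legacy storage definitions) falls back to the object ID layout
assert StorageLayout.select("") is layout
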
@@ -121,6 +570,8 @@ class DataItem:
     schema: pa.Schema = None
     table: tp.Optional[pa.Table] = None
 
+    metadata: tp.Optional[_api.RuntimeMetadata] = None
+
     def is_empty(self) -> bool:
         return self.content is None
 
@@ -138,7 +589,7 @@ class DataItem:
     def for_table(table: pa.Table, schema: pa.Schema, trac_schema: _meta.SchemaDefinition) -> "DataItem":
 
         return DataItem(
-            _meta.ObjectType.DATA, _meta.SchemaType.TABLE,
+            _meta.ObjectType.DATA, _meta.SchemaType.TABLE_SCHEMA,
             content=table, content_type=pa.Table,
             trac_schema=trac_schema, native_schema=schema,
             table=table, schema=schema)
@@ -147,7 +598,7 @@ class DataItem:
     def for_struct(content: tp.Any):
 
         return DataItem(
-            _meta.ObjectType.DATA, _meta.SchemaType.STRUCT,
+            _meta.ObjectType.DATA, _meta.SchemaType.STRUCT_SCHEMA,
             content=content, content_type=type(content))
 
     @staticmethod
@@ -157,6 +608,9 @@ class DataItem:
             _meta.ObjectType.FILE, _meta.SchemaType.SCHEMA_TYPE_NOT_SET,
             content=content, content_type=bytes)
 
+    def with_metadata(self, metadata: _api.RuntimeMetadata) -> "DataItem":
+        return dc.replace(self, metadata=metadata)
+
 
 @dc.dataclass(frozen=True)
 class DataView:
@@ -169,6 +623,8 @@ class DataView:
     parts: tp.Dict[DataPartKey, tp.List[DataItem]] = None
     file_item: tp.Optional[DataItem] = None
 
+    metadata: tp.Optional[_api.RuntimeMetadata] = None
+
     @staticmethod
     def create_empty(object_type: _meta.ObjectType = _meta.ObjectType.DATA) -> "DataView":
         if object_type == _meta.ObjectType.DATA:
@@ -184,21 +640,41 @@ class DataView:
         else:
             return DataView(_meta.ObjectType.DATA, trac_schema, parts = dict())
 
+    @staticmethod
+    def for_arrow_schema(arrow_schema: pa.Schema):
+        trac_schema = DataMapping.arrow_to_trac_schema(arrow_schema)
+        return DataView(_meta.ObjectType.DATA, trac_schema, arrow_schema, dict())
+
     @staticmethod
     def for_file_item(file_item: DataItem):
         return DataView(file_item.object_type, file_item=file_item)
 
     def with_trac_schema(self, trac_schema: _meta.SchemaDefinition):
         arrow_schema = DataMapping.trac_to_arrow_schema(trac_schema)
-        return DataView(_meta.ObjectType.DATA, trac_schema, arrow_schema, self.parts)
+        return dc.replace(self, trac_schema=trac_schema, arrow_schema=arrow_schema)
 
     def with_part(self, part_key: DataPartKey, part: DataItem):
-        new_parts = copy.copy(self.parts)
+        new_parts = copy.copy(self.parts) if self.parts is not None else {}
         new_parts[part_key] = [part]
-        return DataView(self.object_type, self.trac_schema, self.arrow_schema, new_parts)
+        return dc.replace(self, parts=new_parts)
 
     def with_file_item(self, file_item: DataItem):
-        return DataView(self.object_type, file_item=file_item)
+        return dc.replace(self, file_item=file_item)
+
+    def with_metadata(self, metadata: _api.RuntimeMetadata) -> "DataView":
+        return dc.replace(self, metadata=metadata)
+
+    def get_metadata(self) -> tp.Optional[_api.RuntimeMetadata]:
+        if self.metadata:
+            return self.metadata
+        if self.object_type == _meta.ObjectType.FILE and self.file_item:
+            return self.file_item.metadata
+        if self.parts:
+            for items in self.parts.values():
+                for item in items:
+                    if item and item.metadata:
+                        return item.metadata
+        return None
 
     def is_empty(self) -> bool:
         if self.object_type == _meta.ObjectType.FILE:
@@ -259,8 +735,8 @@ class DataMapping:
         pa.date64(): _meta.BasicType.DATE
     }
 
-    @staticmethod
-    def arrow_to_python_type(arrow_type: pa.DataType) -> type:
+    @classmethod
+    def arrow_to_python_type(cls, arrow_type: pa.DataType) -> type:
 
         if pa.types.is_boolean(arrow_type):
             return bool
@@ -283,6 +759,11 @@ class DataMapping:
         if pa.types.is_timestamp(arrow_type):
             return dt.datetime
 
+        # The python type for a dictionary-encoded field is its value type
+        if pa.types.is_dictionary(arrow_type):
+            if isinstance(arrow_type, pa.DictionaryType):
+                return cls.arrow_to_python_type(arrow_type.value_type)
+
         raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")
 
     @classmethod
@@ -340,7 +821,13 @@ class DataMapping:
     def trac_to_arrow_field(cls, trac_field: _meta.FieldSchema):
 
         arrow_type = cls.trac_to_arrow_basic_type(trac_field.fieldType)
-        nullable = not trac_field.notNull if trac_field.notNull is not None else not trac_field.businessKey
+
+        # Categorical data uses an unordered dictionary with int32 index, ordered encoding not (currently) supported
+        # For legacy compatibility, only use dictionary encoding if the categorical feature is enabled
+        if trac_field.categorical:
+            arrow_type = pa.dictionary(pa.int32(), arrow_type, False)
+
+        nullable = not (trac_field.notNull or trac_field.businessKey)
 
         return pa.field(trac_field.fieldName, arrow_type, nullable)
 
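
With the change above, a categorical TRAC field now maps to a dictionary-encoded Arrow field (unordered, int32 indices). A short sketch of the forward mapping, assuming the usual STRING-to-pa.utf8() basic type mapping; the field name here is illustrative:

import pyarrow as pa
import tracdap.rt.metadata as meta
from tracdap.rt._impl.core.data import DataMapping

trac_field = meta.FieldSchema(
    fieldName="region", fieldOrder=0, fieldType=meta.BasicType.STRING,
    categorical=True)

# Categorical STRING becomes dictionary<values=string, indices=int32, ordered=false>
arrow_field = DataMapping.trac_to_arrow_field(trac_field)
assert arrow_field.type == pa.dictionary(pa.int32(), pa.utf8(), False)
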
@@ -369,12 +856,15 @@ class DataMapping:
         field_type = cls.arrow_to_trac_type(field.type)
         label = field.metadata["label"] if field.metadata and "label" in field.metadata else field.name
 
+        # When converting Arrow -> TRAC, always set the categorical flag for dictionary encoded fields
+        # This affects dynamic imports and is informational only (physical layout is controlled by Arrow schemas)
+
         return _meta.FieldSchema(
             field.name, field_index, field_type,
             label=label,
             businessKey=False,
             notNull=not field.nullable,
-            categorical=False)
+            categorical=pa.types.is_dictionary(field.type))
 
     @classmethod
     def arrow_to_trac_type(cls, arrow_type: pa.DataType) -> _meta.BasicType:
@@ -390,6 +880,11 @@ class DataMapping:
         if pa.types.is_timestamp(arrow_type):
             return _meta.BasicType.DATETIME
 
+        # The basic type for a dictionary-encoded field is its value type
+        if pa.types.is_dictionary(arrow_type):
+            if isinstance(arrow_type, pa.DictionaryType):
+                return cls.arrow_to_trac_type(arrow_type.value_type)
+
         raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
 
     @classmethod
@@ -470,7 +965,7 @@ T_INTERNAL_DATA = tp.TypeVar("T_INTERNAL_DATA")
 T_INTERNAL_SCHEMA = tp.TypeVar("T_INTERNAL_SCHEMA")
 
 
-class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
+class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA], metaclass=abc.ABCMeta):
 
     # Available per-framework args, to enable framework-specific type-checking in public APIs
     # These should (for a purist point of view) be in the individual converter classes
@@ -766,6 +1261,10 @@ class DataConformance:
         "Field [{field_name}] cannot be converted from {vector_type} to {field_type}, " + \
         "source and target have different time zones"
 
+    __E_WRONG_CATEGORICAL_TYPE = \
+        "Field [{field_name}] categorical types do not match " + \
+        "(expected {field_type}, got {vector_type})"
+
     @classmethod
     def column_filter(cls, columns: tp.List[str], schema: tp.Optional[pa.Schema]) -> tp.Optional[tp.List[str]]:
 
@@ -918,13 +1417,19 @@ class DataConformance:
     @classmethod
     def _coerce_vector(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
 
+        # Handle null vector
         if pa.types.is_null(vector.type):
-
             if field.nullable:
                 return pa.nulls(size=len(vector), type=field.type)
             else:
                 raise _ex.EDataConformance(f"All null values in non-null field [{field.name}]")
 
+        # If the vector is dict-encoded but the expected result is not, decode the dictionary
+        if pa.types.is_dictionary(vector.type) and not pa.types.is_dictionary(field.type):
+            if isinstance(vector, pa.DictionaryArray):
+                dict_vector: pa.DictionaryArray = vector
+                vector = dict_vector.dictionary_decode()
+
         if pa.types.is_boolean(field.type):
             return cls._coerce_boolean(vector, field)
@@ -946,6 +1451,9 @@ class DataConformance:
         if pa.types.is_timestamp(field.type):
             return cls._coerce_timestamp(vector, field)
 
+        if pa.types.is_dictionary(field.type):
+            return cls._coerce_dictionary(vector, field)
+
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
         cls.__log.error(error_message)
         raise _ex.EDataConformance(error_message)
@@ -1188,6 +1696,85 @@ class DataConformance:
 
         return pc.cast(scaled_vector, field.type)
 
+    @classmethod
+    def _coerce_dictionary(cls, vector: pa.Array, field: pa.Field):
+
+        try:
+
+            if not isinstance(field.type, pa.DictionaryType):
+                raise _ex.EUnexpected()
+
+            field_type: pa.DictionaryType = field.type
+
+            # Supplied vector is a dictionary (but the dictionary type is not an exact match)
+            if pa.types.is_dictionary(vector.type):
+
+                if not isinstance(vector.type, pa.DictionaryType):
+                    raise _ex.EUnexpected()
+
+                vector_type: pa.DictionaryType = vector.type
+
+                # Do not allow coercion to a smaller index type or from unordered to ordered
+                if (vector_type.index_type.bit_width > field_type.index_type.bit_width) or \
+                        (field_type.ordered and not vector_type.ordered):
+
+                    error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+                    cls.__log.error(error_message)
+                    raise _ex.EDataConformance(error_message)
+
+                # Value types are the same - basic cast should succeed
+                if vector_type.value_type == field_type.value_type:
+                    return pc.cast(vector, field.type)
+
+                # Value types differ - try to coerce the underlying dictionary
+                elif isinstance(vector, pa.DictionaryArray):
+                    try:
+                        values_field = pa.field(field.name, field_type.value_type, field.nullable)
+                        values_vector = cls._coerce_vector(vector.dictionary, values_field)
+                        dict_vector = pa.DictionaryArray.from_arrays(vector.indices, values_vector, ordered=field_type.ordered)  # noqa
+                        return pc.cast(dict_vector, field.type)
+                    # Handle errors converting the value type
+                    except _ex.EDataConformance as e:
+                        error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
+                        cls.__log.error(error_message)
+                        raise _ex.EDataConformance(error_message) from e
+
+                # Special handling for chunked dictionaries
+                elif isinstance(vector, pa.ChunkedArray):
+                    if any(vector.chunks):
+                        chunks = [cls._coerce_dictionary(chunk, field) for chunk in vector.chunks]
+                        return pa.chunked_array(chunks)
+                    else:
+                        return pa.array([], type=field.type, size=0)  # noqa
+
+                # Vector type not recognized, coercion is not possible
+                else:
+                    error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+                    cls.__log.error(error_message)
+                    raise _ex.EDataConformance(error_message)
+
+            # Supplied vector matches the dictionary value type - perform dictionary encoding
+            elif vector.type == field_type.value_type and not field_type.ordered:
+                return vector.dictionary_encode().cast(field.type)
+
+            # Fallback option - try to coerce the value type first, then perform dictionary encoding
+            else:
+                try:
+                    values_field = pa.field(field.name, field_type.value_type, field.nullable)
+                    values_vector = cls._coerce_vector(vector, values_field)
+                    return values_vector.dictionary_encode().cast(field.type)
+                # Handle errors converting the value type
+                except _ex.EDataConformance as e:
+                    error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
+                    cls.__log.error(error_message)
+                    raise _ex.EDataConformance(error_message) from e
+
+        except pa.ArrowInvalid as e:
+
+            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message) from e
+
     @classmethod
     def _format_error(cls, error_template: str, vector: pa.Array, field: pa.Field, e: Exception = None):
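
The _coerce_dictionary method above builds on two pyarrow primitives: dictionary_encode to conform a plain vector to a categorical field, and dictionary_decode (used earlier in _coerce_vector) for the reverse. A standalone illustration in plain pyarrow, independent of the TRAC runtime:

import pyarrow as pa

# Encoding: a plain string vector conforms to a dictionary field via encode + cast
plain = pa.array(["red", "green", "red"], type=pa.utf8())
dict_type = pa.dictionary(pa.int32(), pa.utf8())
encoded = plain.dictionary_encode().cast(dict_type)
assert encoded.type == dict_type

# Decoding: a dictionary vector conforms to a plain field via dictionary_decode
decoded = encoded.dictionary_decode()
assert decoded.type == pa.utf8()
assert decoded.to_pylist() == ["red", "green", "red"]
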