tracdap-runtime 0.8.0rc2__py3-none-any.whl → 0.9.0b1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (47)
  1. tracdap/rt/_impl/core/data.py +578 -33
  2. tracdap/rt/_impl/core/repos.py +7 -0
  3. tracdap/rt/_impl/core/storage.py +10 -3
  4. tracdap/rt/_impl/core/util.py +54 -11
  5. tracdap/rt/_impl/exec/dev_mode.py +122 -100
  6. tracdap/rt/_impl/exec/engine.py +178 -109
  7. tracdap/rt/_impl/exec/functions.py +218 -257
  8. tracdap/rt/_impl/exec/graph.py +140 -125
  9. tracdap/rt/_impl/exec/graph_builder.py +411 -449
  10. tracdap/rt/_impl/grpc/codec.py +4 -2
  11. tracdap/rt/_impl/grpc/server.py +7 -7
  12. tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.py +25 -18
  13. tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.pyi +27 -9
  14. tracdap/rt/_impl/grpc/tracdap/metadata/common_pb2.py +1 -1
  15. tracdap/rt/_impl/grpc/tracdap/metadata/config_pb2.py +1 -1
  16. tracdap/rt/_impl/grpc/tracdap/metadata/custom_pb2.py +1 -1
  17. tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +1 -1
  18. tracdap/rt/_impl/grpc/tracdap/metadata/file_pb2.py +1 -1
  19. tracdap/rt/_impl/grpc/tracdap/metadata/flow_pb2.py +1 -1
  20. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +67 -63
  21. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +11 -2
  22. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +1 -1
  23. tracdap/rt/_impl/grpc/tracdap/metadata/object_id_pb2.py +1 -1
  24. tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.py +1 -1
  25. tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +1 -1
  26. tracdap/rt/_impl/grpc/tracdap/metadata/search_pb2.py +1 -1
  27. tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.py +11 -9
  28. tracdap/rt/_impl/grpc/tracdap/metadata/storage_pb2.pyi +11 -2
  29. tracdap/rt/_impl/grpc/tracdap/metadata/tag_pb2.py +1 -1
  30. tracdap/rt/_impl/grpc/tracdap/metadata/tag_update_pb2.py +1 -1
  31. tracdap/rt/_impl/grpc/tracdap/metadata/type_pb2.py +1 -1
  32. tracdap/rt/_impl/runtime.py +8 -0
  33. tracdap/rt/_plugins/repo_git.py +56 -11
  34. tracdap/rt/_version.py +1 -1
  35. tracdap/rt/config/__init__.py +6 -6
  36. tracdap/rt/config/common.py +5 -0
  37. tracdap/rt/config/job.py +13 -3
  38. tracdap/rt/config/result.py +8 -4
  39. tracdap/rt/config/runtime.py +2 -0
  40. tracdap/rt/metadata/__init__.py +37 -36
  41. tracdap/rt/metadata/job.py +2 -0
  42. tracdap/rt/metadata/storage.py +9 -0
  43. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b1.dist-info}/METADATA +3 -1
  44. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b1.dist-info}/RECORD +47 -47
  45. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b1.dist-info}/WHEEL +1 -1
  46. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b1.dist-info}/licenses/LICENSE +0 -0
  47. {tracdap_runtime-0.8.0rc2.dist-info → tracdap_runtime-0.9.0b1.dist-info}/top_level.txt +0 -0
@@ -16,10 +16,12 @@
 import abc
 import copy
 import dataclasses as dc
+import pathlib
 import typing as tp
 import datetime as dt
 import decimal
 import platform
+import random
 
 import pyarrow as pa
 import pyarrow.compute as pc
@@ -36,8 +38,20 @@ except ModuleNotFoundError:
 
 import tracdap.rt.api.experimental as _api
 import tracdap.rt.metadata as _meta
+import tracdap.rt.config as _cfg
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._impl.core.logging as _log
+import tracdap.rt._impl.core.util as _util
+
+
+@dc.dataclass(frozen=True)
+class DataPartKey:
+
+    @classmethod
+    def for_root(cls) -> "DataPartKey":
+        return DataPartKey(opaque_key='part-root')
+
+    opaque_key: str
 
 
 @dc.dataclass(frozen=True)
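
Note: DataPartKey moves above DataSpec and its root key changes from 'part_root' to 'part-root' (the old definition is removed in the next hunk). A minimal sketch of the new behaviour, assuming tracdap-runtime 0.9.0b1 is installed (this is an internal module, subject to change):

```python
# Minimal sketch: the root part key under the new DataPartKey definition
from tracdap.rt._impl.core.data import DataPartKey

root = DataPartKey.for_root()
assert root.opaque_key == "part-root"  # was "part_root" in 0.8.0rc2
```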
@@ -47,61 +61,470 @@ class DataSpec:
     schema_type: _meta.SchemaType
     data_item: str
 
-    data_def: _meta.DataDefinition
-    file_def: _meta.FileDefinition
-    storage_def: _meta.StorageDefinition
-    schema_def: tp.Optional[_meta.SchemaDefinition]
+    definition: tp.Union[_meta.DataDefinition, _meta.FileDefinition]
+    storage: _meta.StorageDefinition
+    schema: tp.Optional[_meta.SchemaDefinition] = None
+
+    primary_id: _meta.TagHeader = None
+    storage_id: _meta.TagHeader = None
+    schema_id: tp.Optional[_meta.TagHeader] = None
+    context_key: tp.Optional[str] = None
 
     @staticmethod
     def create_data_spec(
             data_item: str,
-            data_def: _meta.DataDefinition,
-            storage_def: _meta.StorageDefinition,
-            schema_def: tp.Optional[_meta.SchemaDefinition] = None) -> "DataSpec":
-
-        if schema_def:
-            schema_type = schema_def.schemaType
-        elif data_def.schema:
-            schema_type = data_def.schema.schemaType
+            definition: _meta.DataDefinition,
+            storage: _meta.StorageDefinition,
+            schema: tp.Optional[_meta.SchemaDefinition] = None) -> "DataSpec":
+
+        if schema:
+            schema_type = schema.schemaType
+        elif definition.schema:
+            schema_type = definition.schema.schemaType
         else:
             schema_type = _meta.SchemaType.SCHEMA_TYPE_NOT_SET
 
         return DataSpec(
             _meta.ObjectType.DATA, schema_type, data_item,
-            data_def,
-            storage_def=storage_def,
-            schema_def=schema_def,
-            file_def=None)
+            definition, storage, schema)
 
     @staticmethod
     def create_file_spec(
             data_item: str,
-            file_def: _meta.FileDefinition,
-            storage_def: _meta.StorageDefinition) -> "DataSpec":
+            definition: _meta.FileDefinition,
+            storage: _meta.StorageDefinition) -> "DataSpec":
 
         return DataSpec(
             _meta.ObjectType.FILE, _meta.SchemaType.SCHEMA_TYPE_NOT_SET, data_item,
-            file_def=file_def,
-            storage_def=storage_def,
-            data_def=None,
-            schema_def=None)
+            definition, storage)
 
     @staticmethod
     def create_empty_spec(object_type: _meta.ObjectType, schema_type: _meta.SchemaType):
-        return DataSpec(object_type, schema_type, None, None, None, None, None)
+        return DataSpec(object_type, schema_type, "", None, None, None)  # noqa
+
+    def with_ids(
+            self, primary_id: _meta.TagHeader,
+            storage_id: _meta.TagHeader,
+            schema_id: tp.Optional[_meta.TagHeader] = None,
+            context_key: tp.Optional[str] = None):
+
+        return DataSpec(
+            self.object_type, self.schema_type, self.data_item,
+            self.definition, self.storage, self.schema,
+            primary_id, storage_id, schema_id, context_key)
 
     def is_empty(self):
         return self.data_item is None or len(self.data_item) == 0
 
 
-@dc.dataclass(frozen=True)
-class DataPartKey:
+class StorageLayout:
+
+    __LAYOUTS: "tp.Dict[_meta.StorageLayout, StorageLayout]" = dict()
 
     @classmethod
-    def for_root(cls) -> "DataPartKey":
-        return DataPartKey(opaque_key='part_root')
+    def select(cls, layout_key: _meta.StorageLayout) -> "StorageLayout":
 
-    opaque_key: str
+        # Legacy compatibility - layout key not set in storage definition
+        if not layout_key or layout_key.value == 0:
+            layout_key = _meta.StorageLayout.OBJECT_ID_LAYOUT
+
+        layout = cls.__LAYOUTS.get(layout_key)
+
+        if layout is not None:
+            return layout
+
+        if layout_key == _meta.StorageLayout.OBJECT_ID_LAYOUT:
+            layout = ObjectIdLayout()
+        elif layout_key == _meta.StorageLayout.DEVELOPER_LAYOUT:
+            layout = DevelopmentLayout()
+        else:
+            raise _ex.ETracInternal(f"Unknown storage layout [{layout_key.name}]")
+
+        cls.__LAYOUTS[layout_key] = layout
+
+        return layout
+
+    @abc.abstractmethod
+    def layout_key(self) -> _meta.StorageLayout:
+        pass
+
+    @abc.abstractmethod
+    def new_data_spec(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            storage_config: _cfg.StorageConfig) -> DataSpec:
+        pass
+
+    @abc.abstractmethod
+    def new_data_version(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            prior_spec: DataSpec) -> DataSpec:
+        pass
+
+    @abc.abstractmethod
+    def new_file_spec(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType,
+            storage_config: _cfg.StorageConfig) -> DataSpec:
+        pass
+
+    @abc.abstractmethod
+    def new_file_version(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType,
+            prior_spec: DataSpec) -> DataSpec:
+        pass
+
+
+class BaseLayout(StorageLayout):
+
+    __DATA_ITEM_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}"
+    __FILE_ITEM_TEMPLATE = "file/{}/version-{}"
+
+    @abc.abstractmethod
+    def _data_storage_path(
+            self, data_id: _meta.TagHeader, context_key: str, trac_schema: _meta.SchemaDefinition,
+            part_key: _meta.PartKey, snap_index: int, delta_index: int, storage_format: str,
+            prior_copy: tp.Optional[_meta.StorageCopy]):
+        pass
+
+    @abc.abstractmethod
+    def _file_storage_path(
+            self, file_id: _meta.TagHeader, file_def: _meta.FileDefinition,
+            prior_copy: tp.Optional[_meta.StorageCopy]):
+        pass
+
+    def new_data_spec(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            storage_config: _cfg.StorageConfig) -> DataSpec:
+
+        part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
+        snap_index = 0
+
+        data_item = self.__DATA_ITEM_TEMPLATE.format(
+            trac_schema.schemaType.name.lower(), data_id.objectId,
+            part_key.opaqueKey, snap_index, 0)
+
+        # Blank data definition with no parts
+        new_data_def = _meta.DataDefinition(
+            schema=trac_schema,
+            storageId=_util.selector_for_latest(storage_id))
+
+        data_def = self._add_new_snap(new_data_def, data_item, part_key, snap_index)
+
+        # Take default location from the storage config
+        storage_key = storage_config.defaultBucket
+        storage_format = "JSON" if trac_schema.schemaType == _meta.SchemaType.STRUCT else storage_config.defaultFormat
+        storage_path = self._data_storage_path(data_id, context_key, trac_schema, part_key, snap_index, 0, storage_format, prior_copy=None)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=data_id.objectTimestamp)
+
+        new_storage_def = _meta.StorageDefinition()
+
+        storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)
+
+        # Dynamic data def will always use an embedded schema (there is no ID for an external schema)
+
+        return DataSpec \
+            .create_data_spec(data_item, data_def, storage_def, schema=None) \
+            .with_ids(data_id, storage_id)
+
+    def new_data_version(
+            self, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+            prior_spec: DataSpec) -> DataSpec:
+
+        part_key = _meta.PartKey("part-root", _meta.PartType.PART_ROOT)
+        snap_index = prior_spec.primary_id.objectVersion  # snap index is zero-based
+
+        data_item = self.__DATA_ITEM_TEMPLATE.format(
+            trac_schema.schemaType.name.lower(), data_id.objectId,
+            part_key.opaqueKey, snap_index, 0)
+
+        data_def = self._add_new_snap(prior_spec.definition, data_item, part_key, snap_index)
+
+        prior_item = next(iter(prior_spec.storage.dataItems.keys()), None)
+        prior_copy = self._find_storage_copy(prior_item, prior_spec.storage)
+
+        if prior_copy is None:
+            raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(data_id)}]")
+
+        storage_key = prior_copy.storageKey
+        storage_format = prior_copy.storageFormat
+        storage_path = self._data_storage_path(data_id, context_key, trac_schema, part_key, snap_index, 0, storage_format, prior_copy)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=data_id.objectTimestamp)
+
+        storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)
+
+        return DataSpec \
+            .create_data_spec(data_item, data_def, storage_def, schema=None) \
+            .with_ids(data_id, storage_id)
+
+    def new_file_spec(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType,
+            storage_config: _cfg.StorageConfig) -> DataSpec:
+
+        data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)
+
+        file_def = _meta.FileDefinition(
+            name=f"{context_key}.{file_type.extension}",
+            extension=file_type.extension,
+            mimeType=file_type.mimeType,
+            dataItem=data_item,
+            storageId=_util.selector_for_latest(storage_id),
+            size=0)
+
+        storage_key = storage_config.defaultBucket
+        storage_format = file_def.mimeType
+        storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=file_id.objectTimestamp)
+
+        new_storage_def = _meta.StorageDefinition()
+        new_storage_def.layout = self.layout_key()
+
+        storage_def = self._add_storage_copy(new_storage_def, data_item, storage_copy)
+
+        return DataSpec \
+            .create_file_spec(data_item, file_def, storage_def) \
+            .with_ids(file_id, storage_id)
+
+    def new_file_version(
+            self, file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+            context_key: str, file_type: _meta.FileType, prior_spec: DataSpec) -> DataSpec:
+
+        data_item = self.__FILE_ITEM_TEMPLATE.format(file_id.objectId, file_id.objectVersion)
+
+        file_def = _meta.FileDefinition(
+            name=f"{context_key}.{file_type.extension}",
+            extension=file_type.extension,
+            mimeType=file_type.mimeType,
+            dataItem=data_item,
+            storageId=_util.selector_for_latest(storage_id),
+            size=0)
+
+        prior_copy = self._find_storage_copy(prior_spec.definition.dataItem, prior_spec.storage)
+
+        if prior_copy is None:
+            raise _ex.ETracInternal(f"Missing prior metadata for [{_util.object_key(file_id)}]")
+
+        storage_key = prior_copy.storageKey
+        storage_format = file_def.mimeType
+        storage_path = self._file_storage_path(file_id, file_def, prior_copy=None)
+
+        storage_copy = _meta.StorageCopy(
+            storageKey=storage_key,
+            storagePath=storage_path,
+            storageFormat=storage_format,
+            copyStatus=_meta.CopyStatus.COPY_AVAILABLE,
+            copyTimestamp=file_id.objectTimestamp)
+
+        storage_def = self._add_storage_copy(prior_spec.storage, data_item, storage_copy)
+
+        return DataSpec \
+            .create_file_spec(data_item, file_def, storage_def) \
+            .with_ids(file_id, storage_id)
+
+    @classmethod
+    def _add_new_snap(
+            cls, data_def: _meta.DataDefinition, data_item: str,
+            part_key: _meta.PartKey, snap_index: int):
+
+        delta = _meta.DataDefinition.Delta(
+            deltaIndex=0,
+            dataItem=data_item)
+
+        snap = _meta.DataDefinition.Snap(
+            snapIndex=snap_index,
+            deltas=[delta])
+
+        part = _meta.DataDefinition.Part(
+            partKey=part_key,
+            snap=snap)
+
+        data_def = copy.copy(data_def)
+        data_def.parts = copy.copy(data_def.parts)
+        data_def.parts[part_key.opaqueKey] = part
+
+        return data_def
+
+    @classmethod
+    def _add_storage_copy(cls, storage_def: _meta.StorageDefinition, data_item: str, storage_copy: _meta.StorageCopy):
+
+        new_incarnation = _meta.StorageIncarnation(
+            copies=[storage_copy],
+            incarnationIndex=0,
+            incarnationTimestamp=storage_copy.copyTimestamp,
+            incarnationStatus=_meta.IncarnationStatus.INCARNATION_AVAILABLE)
+
+        new_item = _meta.StorageItem(incarnations=[new_incarnation])
+
+        storage_def = copy.copy(storage_def)
+        storage_def.dataItems = copy.copy(storage_def.dataItems)
+        storage_def.dataItems[data_item] = new_item
+
+        return storage_def
+
+    @classmethod
+    def _find_storage_copy(cls, data_item: str, storage_def: _meta.StorageDefinition) -> tp.Optional[_meta.StorageCopy]:
+
+        if data_item is None:
+            return None
+
+        storage_item = storage_def.dataItems.get(data_item)
+
+        if storage_item is None:
+            return None
+
+        # Latest available incarnation
+        incarnation = next(filter(
+            lambda i: i.incarnationStatus == _meta.IncarnationStatus.INCARNATION_AVAILABLE,
+            reversed(storage_item.incarnations)), None)
+
+        if incarnation is None:
+            return None
+
+        # Use any available copy (currently there is no location preference)
+        return next(filter(
+            lambda c: c.copyStatus == _meta.CopyStatus.COPY_AVAILABLE,
+            incarnation.copies), None)
+
+
+class ObjectIdLayout(BaseLayout):
+
+    __DATA_STORAGE_TEMPLATE = "data/{}/{}/{}/snap-{:d}/delta-{:d}-x{:0>6x}"
+    __FILE_STORAGE_TEMPLATE = "file/{}/version-{:d}-x{:0>6x}/{}.{}"
+
+    def __init__(self):
+        self.__random = random.Random()
+        self.__random.seed()
+
+    def layout_key(self) -> _meta.StorageLayout:
+        return _meta.StorageLayout.OBJECT_ID_LAYOUT
+
+    def _data_storage_path(
+            self, data_id, context_key, trac_schema,
+            part_key, snap_index, delta_index,
+            storage_format, prior_copy):
+
+        schema_type = trac_schema.schemaType.name.lower()
+        version_suffix = self.__random.randint(0, 1 << 24)
+
+        return self.__DATA_STORAGE_TEMPLATE.format(
+            schema_type, data_id.objectId,
+            part_key.opaqueKey, snap_index, delta_index,
+            version_suffix)
+
+    def _file_storage_path(self, file_id, file_def, prior_copy):
+
+        version_suffix = self.__random.randint(0, 1 << 24)
+
+        return self.__FILE_STORAGE_TEMPLATE.format(
+            file_id.objectId, file_id.objectVersion, version_suffix,
+            file_def.name, file_def.extension.lower())
+
+
+class DevelopmentLayout(BaseLayout):
+
+    __DEFAULT_OUTPUT_DIR = "Dev Outputs"
+
+    __DATA_STORAGE_PATH = "{}/{}{}.{}"
+    __FILE_STORAGE_PATH = "{}/{}{}.{}"
+
+    def layout_key(self) -> _meta.StorageLayout:
+        return _meta.StorageLayout.DEVELOPER_LAYOUT
+
+    def _data_storage_path(
+            self, data_id, context_key, trac_schema,
+            part_key, snap_index, delta_index,
+            storage_format, prior_copy):
+
+        storage_dir = self._dev_storage_dir(prior_copy)
+        suffix = f"-{data_id.objectVersion}" if data_id.objectVersion > 1 else ""
+
+        if prior_copy is not None:
+            prior_path = pathlib.Path(prior_copy.storagePath)
+            file_name = prior_path.stem
+            if data_id.objectVersion > 2 and "-" in file_name:
+                file_name = file_name[:file_name.rfind("-")]
+        else:
+            file_name = context_key
+
+        return self.__DATA_STORAGE_PATH.format(storage_dir, file_name, suffix, storage_format.lower())
+
+    def _file_storage_path(self, file_id, file_def, prior_copy):
+
+        storage_dir = self._dev_storage_dir(prior_copy)
+        suffix = f"-{file_id.objectVersion}" if file_id.objectVersion > 1 else ""
+
+        return self.__FILE_STORAGE_PATH.format(storage_dir, file_def.name, suffix, file_def.extension.lower())
+
+    def _dev_storage_dir(self, prior_copy: _meta.StorageCopy):
+
+        if prior_copy is None:
+            return self.__DEFAULT_OUTPUT_DIR
+
+        prior_path = pathlib.Path(prior_copy.storagePath)
+
+        if len(prior_path.parts) > 1:
+            return prior_path.parent
+        else:
+            return self.__DEFAULT_OUTPUT_DIR
+
+
+def build_data_spec(
+        data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+        context_key: tp.Optional[str], trac_schema: _meta.SchemaDefinition,
+        storage_config: _cfg.StorageConfig,
+        prior_spec: tp.Optional[DataSpec] = None) \
+        -> DataSpec:
+
+    if prior_spec is None:
+        layout_key = storage_config.defaultLayout
+        layout = StorageLayout.select(layout_key)
+        return layout.new_data_spec(data_id, storage_id, context_key, trac_schema, storage_config)
+
+    else:
+        layout_key = prior_spec.storage.layout
+        layout = StorageLayout.select(layout_key)
+        return layout.new_data_version(data_id, storage_id, context_key, trac_schema, prior_spec)
+
+
+def build_file_spec(
+        file_id: _meta.TagHeader, storage_id: _meta.TagHeader,
+        context_key: tp.Optional[str], file_type: _meta.FileType,
+        storage_config: _cfg.StorageConfig,
+        prior_spec: tp.Optional[DataSpec] = None) \
+        -> DataSpec:
+
+    if prior_spec is None:
+        layout_key = storage_config.defaultLayout
+        layout = StorageLayout.select(layout_key)
+        return layout.new_file_spec(file_id, storage_id, context_key, file_type, storage_config)
+
+    else:
+        layout_key = prior_spec.storage.layout
+        layout = StorageLayout.select(layout_key)
+        return layout.new_file_version(file_id, storage_id, context_key, file_type, prior_spec)
 
 
 @dc.dataclass(frozen=True)
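
Note: the new StorageLayout registry resolves a layout implementation from the metadata enum and caches one instance per key, falling back to OBJECT_ID_LAYOUT for legacy storage definitions where the layout key is unset. A rough usage sketch against the internal API added above (not a supported public surface):

```python
import tracdap.rt.metadata as meta
# Internal class, which shadows the meta.StorageLayout enum by name
from tracdap.rt._impl.core.data import StorageLayout

# Explicit selection returns a cached singleton per layout key
layout = StorageLayout.select(meta.StorageLayout.OBJECT_ID_LAYOUT)
assert layout.layout_key() == meta.StorageLayout.OBJECT_ID_LAYOUT

# Legacy storage definitions (no layout key) fall back to the object ID layout
legacy = StorageLayout.select(None)
assert legacy is layout
```

The module-level build_data_spec / build_file_spec helpers apply the same rule: new objects take the layout from StorageConfig.defaultLayout, while new versions reuse the layout recorded on the prior spec's storage definition.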
@@ -184,6 +607,11 @@ class DataView:
         else:
             return DataView(_meta.ObjectType.DATA, trac_schema, parts = dict())
 
+    @staticmethod
+    def for_arrow_schema(arrow_schema: pa.Schema):
+        trac_schema = DataMapping.arrow_to_trac_schema(arrow_schema)
+        return DataView(_meta.ObjectType.DATA, trac_schema, arrow_schema, dict())
+
     @staticmethod
     def for_file_item(file_item: DataItem):
         return DataView(file_item.object_type, file_item=file_item)
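
Note: DataView.for_arrow_schema adds a third construction path alongside for_trac_schema and for_file_item. A short sketch, assuming the internal names shown in this hunk:

```python
import pyarrow as pa
from tracdap.rt._impl.core.data import DataView

arrow_schema = pa.schema([("customer_id", pa.int64()), ("segment", pa.utf8())])

# Builds a DATA view; the TRAC schema is derived via DataMapping.arrow_to_trac_schema
view = DataView.for_arrow_schema(arrow_schema)
```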
@@ -259,8 +687,17 @@ class DataMapping:
         pa.date64(): _meta.BasicType.DATE
     }
 
-    @staticmethod
-    def arrow_to_python_type(arrow_type: pa.DataType) -> type:
+    # For now, categorical handling is disabled by default and enabled by this setting
+    # The default will change to "true" for the 0.9 release
+    CATEGORICAL_CONFIG_KEY = "trac.runtime.categorical"
+    __categorical_enabled = False
+
+    @classmethod
+    def enable_categorical(cls, enabled: bool):
+        cls.__categorical_enabled = enabled
+
+    @classmethod
+    def arrow_to_python_type(cls, arrow_type: pa.DataType) -> type:
 
         if pa.types.is_boolean(arrow_type):
             return bool
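
Note: categorical handling is opt-in for this release, and the comment above indicates the default will flip to enabled for the final 0.9 release. A hedged sketch; the wiring that reads "trac.runtime.categorical" from runtime config and applies it is assumed to live elsewhere in the runtime:

```python
from tracdap.rt._impl.core.data import DataMapping

# Normally driven by the "trac.runtime.categorical" config setting;
# calling the class method directly is for illustration or testing only
DataMapping.enable_categorical(True)
```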
@@ -283,6 +720,11 @@ class DataMapping:
         if pa.types.is_timestamp(arrow_type):
             return dt.datetime
 
+        # The python type for a dictionary-encoded field is its value type
+        if pa.types.is_dictionary(arrow_type):
+            if isinstance(arrow_type, pa.DictionaryType):
+                return cls.arrow_to_python_type(arrow_type.value_type)
+
         raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")
 
     @classmethod
@@ -340,7 +782,13 @@ class DataMapping:
     def trac_to_arrow_field(cls, trac_field: _meta.FieldSchema):
 
         arrow_type = cls.trac_to_arrow_basic_type(trac_field.fieldType)
-        nullable = not trac_field.notNull if trac_field.notNull is not None else not trac_field.businessKey
+
+        # Categorical data uses an unordered dictionary with int32 index, ordered encoding not (currently) supported
+        # For legacy compatibility, only use dictionary encoding if the categorical feature is enabled
+        if trac_field.categorical and cls.__categorical_enabled:
+            arrow_type = pa.dictionary(pa.int32(), arrow_type, False)
+
+        nullable = not (trac_field.notNull or trac_field.businessKey)
 
         return pa.field(trac_field.fieldName, arrow_type, nullable)
 
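Note: with the feature enabled, a categorical TRAC field maps to an unordered Arrow dictionary type with an int32 index. A self-contained sketch of the resulting physical type, using only pyarrow:

```python
import pyarrow as pa

# Mirror of the mapping above: categorical STRING -> dictionary<int32, utf8>, unordered
arrow_type = pa.dictionary(pa.int32(), pa.utf8(), False)
field = pa.field("segment", arrow_type, nullable=True)

assert pa.types.is_dictionary(field.type)
assert field.type.value_type == pa.utf8()
```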
@@ -369,12 +817,15 @@ class DataMapping:
         field_type = cls.arrow_to_trac_type(field.type)
         label = field.metadata["label"] if field.metadata and "label" in field.metadata else field.name
 
+        # When converting Arrow -> TRAC, always set the categorical flag for dictionary encoded fields
+        # This affects dynamic imports and is informational only (physical layout is controlled by Arrow schemas)
+
         return _meta.FieldSchema(
             field.name, field_index, field_type,
             label=label,
             businessKey=False,
             notNull=not field.nullable,
-            categorical=False)
+            categorical=pa.types.is_dictionary(field.type))
 
     @classmethod
     def arrow_to_trac_type(cls, arrow_type: pa.DataType) -> _meta.BasicType:
@@ -390,6 +841,11 @@ class DataMapping:
         if pa.types.is_timestamp(arrow_type):
             return _meta.BasicType.DATETIME
 
+        # The basic type for a dictionary-encoded field is its value type
+        if pa.types.is_dictionary(arrow_type):
+            if isinstance(arrow_type, pa.DictionaryType):
+                return cls.arrow_to_trac_type(arrow_type.value_type)
+
         raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
 
     @classmethod
@@ -766,6 +1222,10 @@ class DataConformance:
             "Field [{field_name}] cannot be converted from {vector_type} to {field_type}, " + \
             "source and target have different time zones"
 
+    __E_WRONG_CATEGORICAL_TYPE = \
+        "Field [{field_name}] categorical types do not match " + \
+        "(expected {field_type}, got {vector_type})"
+
     @classmethod
     def column_filter(cls, columns: tp.List[str], schema: tp.Optional[pa.Schema]) -> tp.Optional[tp.List[str]]:
 
@@ -918,13 +1378,19 @@ class DataConformance:
     @classmethod
     def _coerce_vector(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
 
+        # Handle null vector
         if pa.types.is_null(vector.type):
-
             if field.nullable:
                 return pa.nulls(size=len(vector), type=field.type)
             else:
                 raise _ex.EDataConformance(f"All null values in non-null field [{field.name}]")
 
+        # If the vector is dict-encoded but the expected result is not, decode the dictionary
+        if pa.types.is_dictionary(vector.type) and not pa.types.is_dictionary(field.type):
+            if isinstance(vector, pa.DictionaryArray):
+                dict_vector: pa.DictionaryArray = vector
+                vector = dict_vector.dictionary_decode()
+
         if pa.types.is_boolean(field.type):
             return cls._coerce_boolean(vector, field)
 
@@ -946,6 +1412,9 @@ class DataConformance:
         if pa.types.is_timestamp(field.type):
             return cls._coerce_timestamp(vector, field)
 
+        if pa.types.is_dictionary(field.type):
+            return cls._coerce_dictionary(vector, field)
+
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
         cls.__log.error(error_message)
         raise _ex.EDataConformance(error_message)
@@ -1188,6 +1657,82 @@ class DataConformance:
 
         return pc.cast(scaled_vector, field.type)
 
+    @classmethod
+    def _coerce_dictionary(cls, vector: pa.Array, field: pa.Field):
+
+        try:
+
+            if not isinstance(field.type, pa.DictionaryType):
+                raise _ex.EUnexpected()
+
+            field_type: pa.DictionaryType = field.type
+
+            # Supplied vector is a dictionary (but the dictionary type is not an exact match)
+            if pa.types.is_dictionary(vector.type):
+
+                if not isinstance(vector.type, pa.DictionaryType):
+                    raise _ex.EUnexpected()
+
+                vector_type: pa.DictionaryType = vector.type
+
+                # Do not allow coercion to a smaller index type or from unordered to ordered
+                if (vector_type.index_type.bit_width > field_type.index_type.bit_width) or \
+                        (field_type.ordered and not vector_type.ordered):
+
+                    error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+                    cls.__log.error(error_message)
+                    raise _ex.EDataConformance(error_message)
+
+                # Value types are the same - basic cast should succeed
+                if vector_type.value_type == field_type.value_type:
+                    return pc.cast(vector, field.type)
+
+                # Value types differ - try to coerce the underlying dictionary
+                elif isinstance(vector, pa.DictionaryArray):
+                    try:
+                        values_field = pa.field(field.name, field_type.value_type, field.nullable)
+                        values_vector = cls._coerce_vector(vector.dictionary, values_field)
+                        dict_vector = pa.DictionaryArray.from_arrays(vector.indices, values_vector, ordered=field_type.ordered)  # noqa
+                        return pc.cast(dict_vector, field.type)
+                    # Handle errors converting the value type
+                    except _ex.EDataConformance as e:
+                        error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
+                        cls.__log.error(error_message)
+                        raise _ex.EDataConformance(error_message) from e
+
+                # Special handling for chunked dictionaries
+                elif isinstance(vector, pa.ChunkedArray):
+                    chunks = [cls._coerce_dictionary(chunk, field) for chunk in vector.chunks]
+                    return pa.chunked_array(chunks)
+
+                # Vector type not recognized, coercion is not possible
+                else:
+                    error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+                    cls.__log.error(error_message)
+                    raise _ex.EDataConformance(error_message)
+
+            # Supplied vector matches the dictionary value type - perform dictionary encoding
+            elif vector.type == field_type.value_type and not field_type.ordered:
+                return vector.dictionary_encode().cast(field.type)
+
+            # Fallback option - try to coerce the value type first, then perform dictionary encoding
+            else:
+                try:
+                    values_field = pa.field(field.name, field_type.value_type, field.nullable)
+                    values_vector = cls._coerce_vector(vector, values_field)
+                    return values_vector.dictionary_encode().cast(field.type)
+                # Handle errors converting the value type
+                except _ex.EDataConformance as e:
+                    error_message = cls._format_error(cls.__E_WRONG_CATEGORICAL_TYPE, vector, field)
+                    cls.__log.error(error_message)
+                    raise _ex.EDataConformance(error_message) from e
+
+        except pa.ArrowInvalid as e:
+
+            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message) from e
+
     @classmethod
     def _format_error(cls, error_template: str, vector: pa.Array, field: pa.Field, e: Exception = None):
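
Note: the new coercion paths lean on three pyarrow primitives: Array.dictionary_encode() when plain values arrive where a categorical field is expected, DictionaryArray.dictionary_decode() for the reverse (as in _coerce_vector above), and pc.cast for widening the index type. A self-contained sketch of all three, independent of the TRAC internals:

```python
import pyarrow as pa
import pyarrow.compute as pc

target = pa.dictionary(pa.int32(), pa.utf8(), False)

# Plain values -> categorical field: encode, then cast to the exact target type
plain = pa.array(["red", "green", "red"])
encoded = plain.dictionary_encode().cast(target)
assert encoded.type == target

# Categorical values -> plain field: decode back to the value type
decoded = encoded.dictionary_decode()
assert decoded.type == pa.utf8()

# Index widening (int8 -> int32) is a plain cast; narrowing is rejected by the new checks
small = pa.DictionaryArray.from_arrays(
    pa.array([0, 1, 0], type=pa.int8()),
    pa.array(["red", "green"]))
widened = pc.cast(small, target)
assert widened.type == target
```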