tracdap-runtime 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. tracdap/rt/_exec/context.py +572 -112
  2. tracdap/rt/_exec/dev_mode.py +166 -97
  3. tracdap/rt/_exec/engine.py +120 -9
  4. tracdap/rt/_exec/functions.py +137 -35
  5. tracdap/rt/_exec/graph.py +38 -13
  6. tracdap/rt/_exec/graph_builder.py +120 -9
  7. tracdap/rt/_impl/data.py +183 -52
  8. tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +18 -18
  9. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +74 -30
  10. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +120 -2
  11. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +20 -18
  12. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.pyi +22 -6
  13. tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +29 -0
  14. tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.pyi +16 -0
  15. tracdap/rt/_impl/models.py +8 -0
  16. tracdap/rt/_impl/static_api.py +42 -10
  17. tracdap/rt/_impl/storage.py +37 -25
  18. tracdap/rt/_impl/validation.py +113 -11
  19. tracdap/rt/_plugins/repo_git.py +1 -1
  20. tracdap/rt/_version.py +1 -1
  21. tracdap/rt/api/experimental.py +220 -0
  22. tracdap/rt/api/hook.py +6 -4
  23. tracdap/rt/api/model_api.py +98 -13
  24. tracdap/rt/api/static_api.py +14 -6
  25. tracdap/rt/config/__init__.py +2 -2
  26. tracdap/rt/config/common.py +23 -17
  27. tracdap/rt/config/job.py +2 -2
  28. tracdap/rt/config/platform.py +25 -25
  29. tracdap/rt/config/result.py +2 -2
  30. tracdap/rt/config/runtime.py +3 -3
  31. tracdap/rt/launch/cli.py +7 -4
  32. tracdap/rt/launch/launch.py +19 -3
  33. tracdap/rt/metadata/__init__.py +25 -20
  34. tracdap/rt/metadata/common.py +2 -2
  35. tracdap/rt/metadata/custom.py +3 -3
  36. tracdap/rt/metadata/data.py +12 -12
  37. tracdap/rt/metadata/file.py +6 -6
  38. tracdap/rt/metadata/flow.py +6 -6
  39. tracdap/rt/metadata/job.py +62 -8
  40. tracdap/rt/metadata/model.py +33 -11
  41. tracdap/rt/metadata/object_id.py +8 -8
  42. tracdap/rt/metadata/resource.py +24 -0
  43. tracdap/rt/metadata/search.py +5 -5
  44. tracdap/rt/metadata/stoarge.py +6 -6
  45. tracdap/rt/metadata/tag.py +1 -1
  46. tracdap/rt/metadata/tag_update.py +1 -1
  47. tracdap/rt/metadata/type.py +4 -4
  48. {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/METADATA +3 -1
  49. {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/RECORD +52 -48
  50. {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/LICENSE +0 -0
  51. {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/WHEEL +0 -0
  52. {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/top_level.txt +0 -0
tracdap/rt/_exec/graph_builder.py CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import annotations
-
 import tracdap.rt.config as config
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._impl.data as _data  # noqa
@@ -42,6 +40,9 @@ class GraphBuilder:
         if job_config.job.jobType == meta.JobType.RUN_FLOW:
             return cls.build_standard_job(job_config, result_spec, cls.build_run_flow_job)
 
+        if job_config.job.jobType in [meta.JobType.IMPORT_DATA, meta.JobType.EXPORT_DATA]:
+            return cls.build_standard_job(job_config, result_spec, cls.build_import_export_data_job)
+
         raise _ex.EConfigParse(f"Job type [{job_config.job.jobType}] is not supported yet")
 
     @classmethod
@@ -114,6 +115,28 @@ class GraphBuilder:
 
         return cls._join_sections(main_section, result_section)
 
+    @classmethod
+    def build_import_export_data_job(
+            cls, job_config: config.JobConfig, result_spec: JobResultSpec,
+            job_namespace: NodeNamespace, job_push_id: NodeId) \
+            -> GraphSection:
+
+        # TODO: These are processed as regular calculation jobs for now
+        # That might be ok, but is worth reviewing
+
+        if job_config.job.jobType == meta.JobType.IMPORT_DATA:
+            job_def = job_config.job.importData
+        else:
+            job_def = job_config.job.exportData
+
+        target_selector = job_def.model
+        target_obj = _util.get_job_resource(target_selector, job_config)
+        target_def = target_obj.model
+
+        return cls.build_calculation_job(
+            job_config, result_spec, job_namespace, job_push_id,
+            target_selector, target_def, job_def)
+
     @classmethod
     def build_run_model_job(
             cls, job_config: config.JobConfig, result_spec: JobResultSpec,
@@ -380,6 +403,65 @@ class GraphBuilder:
 
         return GraphSection(nodes, inputs=inputs)
 
+    @classmethod
+    def build_runtime_outputs(cls, output_names: tp.List[str], job_namespace: NodeNamespace):
+
+        # TODO: Factor out common logic with regular job outputs (including static / dynamic)
+
+        nodes = {}
+        inputs = set()
+        outputs = list()
+
+        for output_name in output_names:
+
+            # Output data view must already exist in the namespace
+            data_view_id = NodeId.of(output_name, job_namespace, _data.DataView)
+            data_spec_id = NodeId.of(f"{output_name}:SPEC", job_namespace, _data.DataSpec)
+
+            data_key = output_name + ":DATA"
+            data_id = _util.new_object_id(meta.ObjectType.DATA)
+            storage_key = output_name + ":STORAGE"
+            storage_id = _util.new_object_id(meta.ObjectType.STORAGE)
+
+            data_spec_node = DynamicDataSpecNode(
+                data_spec_id, data_view_id,
+                data_id, storage_id,
+                prior_data_spec=None)
+
+            output_data_key = _util.object_key(data_id)
+            output_storage_key = _util.object_key(storage_id)
+
+            # Map one data item from each view, since outputs are single part/delta
+            data_item_id = NodeId(f"{output_name}:ITEM", job_namespace, _data.DataItem)
+            data_item_node = DataItemNode(data_item_id, data_view_id)
+
+            # Create a physical save operation for the data item
+            data_save_id = NodeId.of(f"{output_name}:SAVE", job_namespace, None)
+            data_save_node = SaveDataNode(data_save_id, data_spec_id, data_item_id)
+
+            data_result_id = NodeId.of(f"{output_name}:RESULT", job_namespace, ObjectBundle)
+            data_result_node = DataResultNode(
+                data_result_id, output_name,
+                data_item_id, data_spec_id, data_save_id,
+                output_data_key, output_storage_key)
+
+            nodes[data_spec_id] = data_spec_node
+            nodes[data_item_id] = data_item_node
+            nodes[data_save_id] = data_save_node
+            nodes[data_result_id] = data_result_node
+
+            # Job-level data view is an input to the save operation
+            inputs.add(data_view_id)
+            outputs.append(data_result_id)
+
+        runtime_outputs = JobOutputs(bundles=outputs)
+        runtime_outputs_id = NodeId.of("trac_runtime_outputs", job_namespace, JobOutputs)
+        runtime_outputs_node = RuntimeOutputsNode(runtime_outputs_id, runtime_outputs)
+
+        nodes[runtime_outputs_id] = runtime_outputs_node
+
+        return GraphSection(nodes, inputs=inputs, outputs={runtime_outputs_id})
+
     @classmethod
     def build_job_results(
             cls, job_config: cfg.JobConfig, job_namespace: NodeNamespace, result_spec: JobResultSpec,
@@ -396,7 +478,8 @@ class GraphBuilder:
 
             build_result_node = BuildJobResultNode(
                 build_result_id, job_config.jobId,
-                objects=objects, explicit_deps=explicit_deps)
+                outputs = JobOutputs(objects=objects),
+                explicit_deps=explicit_deps)
 
         elif bundles is not None:
 
@@ -404,7 +487,8 @@ class GraphBuilder:
 
             build_result_node = BuildJobResultNode(
                 build_result_id, job_config.jobId,
-                bundles=bundles, explicit_deps=explicit_deps)
+                outputs = JobOutputs(bundles=bundles),
+                explicit_deps=explicit_deps)
 
         else:
             raise _ex.EUnexpected()
@@ -459,7 +543,7 @@ class GraphBuilder:
             -> GraphSection:
 
         if model_or_flow.objectType == meta.ObjectType.MODEL:
-            return cls.build_model(namespace, model_or_flow.model, explicit_deps)
+            return cls.build_model(job_config, namespace, model_or_flow.model, explicit_deps)
 
         elif model_or_flow.objectType == meta.ObjectType.FLOW:
             return cls.build_flow(job_config, namespace, model_or_flow.flow)
@@ -469,11 +553,13 @@ class GraphBuilder:
 
     @classmethod
     def build_model(
-            cls, namespace: NodeNamespace,
+            cls, job_config: config.JobConfig, namespace: NodeNamespace,
             model_def: meta.ModelDefinition,
             explicit_deps: tp.Optional[tp.List[NodeId]] = None) \
             -> GraphSection:
 
+        cls.check_model_type(job_config, model_def)
+
         def param_id(node_name):
             return NodeId(node_name, namespace, meta.Value)
 
@@ -485,6 +571,14 @@ class GraphBuilder:
         input_ids = set(map(data_id, model_def.inputs))
         output_ids = set(map(data_id, model_def.outputs))
 
+        # Set up storage access for import / export data jobs
+        if job_config.job.jobType == meta.JobType.IMPORT_DATA:
+            storage_access = job_config.job.importData.storageAccess
+        elif job_config.job.jobType == meta.JobType.EXPORT_DATA:
+            storage_access = job_config.job.exportData.storageAccess
+        else:
+            storage_access = None
+
         # Create the model node
         # Always add the prior graph root ID as a dependency
         # This is to ensure dependencies are still pulled in for models with no inputs!
@@ -500,7 +594,8 @@ class GraphBuilder:
         model_node = RunModelNode(
             model_id, model_scope, model_def,
             frozenset(parameter_ids), frozenset(input_ids),
-            explicit_deps=explicit_deps, bundle=model_id.namespace)
+            explicit_deps=explicit_deps, bundle=model_id.namespace,
+            storage_access=storage_access)
 
         model_result_id = NodeId(f"{model_name}:RESULT", namespace)
         model_result_node = RunModelResultNode(model_result_id, model_id)
@@ -637,6 +732,7 @@ class GraphBuilder:
 
         # Explicit check for model compatibility - report an error now, do not try build_model()
         cls.check_model_compatibility(model_selector, model_obj.model, node_name, node)
+        cls.check_model_type(job_config, model_obj.model)
 
         return cls.build_model_or_flow_with_context(
             job_config, namespace, node_name, model_obj,
@@ -647,8 +743,8 @@ class GraphBuilder:
 
     @classmethod
     def check_model_compatibility(
-            cls, model_selector: meta.TagSelector, model_def: meta.ModelDefinition,
-            node_name: str, flow_node: meta.FlowNode):
+            cls, model_selector: meta.TagSelector,
+            model_def: meta.ModelDefinition, node_name: str, flow_node: meta.FlowNode):
 
         model_params = list(sorted(model_def.parameters.keys()))
         model_inputs = list(sorted(model_def.inputs.keys()))
@@ -662,6 +758,21 @@ class GraphBuilder:
             model_key = _util.object_key(model_selector)
             raise _ex.EJobValidation(f"Incompatible model for flow node [{node_name}] (Model: [{model_key}])")
 
+    @classmethod
+    def check_model_type(cls, job_config: config.JobConfig, model_def: meta.ModelDefinition):
+
+        if job_config.job.jobType == meta.JobType.IMPORT_DATA:
+            allowed_model_types = [meta.ModelType.DATA_IMPORT_MODEL]
+        elif job_config.job.jobType == meta.JobType.EXPORT_DATA:
+            allowed_model_types = [meta.ModelType.DATA_EXPORT_MODEL]
+        else:
+            allowed_model_types = [meta.ModelType.STANDARD_MODEL]
+
+        if model_def.modelType not in allowed_model_types:
+            job_type = job_config.job.jobType.name
+            model_type = model_def.modelType.name
+            raise _ex.EJobValidation(f"Job type [{job_type}] cannot use model type [{model_type}]")
+
     @staticmethod
     def build_context_push(
             namespace: NodeNamespace, input_mapping: tp.Dict[str, NodeId],
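Editorial illustration (not part of the diff): the new check_model_type guard pairs each job type with exactly one allowed model type. A stand-alone sketch of the same rule, assuming tracdap-runtime 0.6.5 for the metadata enums and exception class; allowed_model_types and check are hypothetical helper names:

import tracdap.rt.metadata as meta
import tracdap.rt.exceptions as ex

def allowed_model_types(job_type: meta.JobType):
    # Mirrors the branching in GraphBuilder.check_model_type above
    if job_type == meta.JobType.IMPORT_DATA:
        return [meta.ModelType.DATA_IMPORT_MODEL]
    if job_type == meta.JobType.EXPORT_DATA:
        return [meta.ModelType.DATA_EXPORT_MODEL]
    return [meta.ModelType.STANDARD_MODEL]

def check(job_type: meta.JobType, model_type: meta.ModelType):
    if model_type not in allowed_model_types(job_type):
        raise ex.EJobValidation(f"Job type [{job_type.name}] cannot use model type [{model_type.name}]")

check(meta.JobType.RUN_MODEL, meta.ModelType.STANDARD_MODEL)    # passes
check(meta.JobType.IMPORT_DATA, meta.ModelType.STANDARD_MODEL)  # raises EJobValidation
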
tracdap/rt/_impl/data.py CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import annotations
-
 import dataclasses as dc
 import typing as tp
 import datetime as dt
@@ -22,7 +20,16 @@ import platform
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pandas as pd
+
+try:
+    import pandas  # noqa
+except ModuleNotFoundError:
+    pandas = None
+
+try:
+    import polars  # noqa
+except ModuleNotFoundError:
+    polars = None
 
 import tracdap.rt.metadata as _meta
 import tracdap.rt.exceptions as _ex
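Editorial note: these guarded imports are what make pandas and polars optional dependencies of the runtime — each module name is bound to None when the import fails, and annotations that mention it are quoted so they are never evaluated at import time. A minimal sketch of the same pattern; frame_from_rows is a hypothetical helper, not part of the package:

import typing as tp

try:
    import pandas  # noqa
except ModuleNotFoundError:
    pandas = None  # pandas features are switched off when the import fails

def frame_from_rows(rows: tp.List[dict]) -> "pandas.DataFrame":
    # The quoted return annotation is never executed, so it is safe even when pandas is None
    if pandas is None:
        raise RuntimeError("pandas is required for frame_from_rows()")
    return pandas.DataFrame(rows)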
@@ -42,7 +49,7 @@
 class DataPartKey:
 
     @classmethod
-    def for_root(cls) -> DataPartKey:
+    def for_root(cls) -> "DataPartKey":
         return DataPartKey(opaque_key='part_root')
 
     opaque_key: str
 
@@ -55,14 +62,14 @@ class DataItem:
     table: tp.Optional[pa.Table] = None
     batches: tp.Optional[tp.List[pa.RecordBatch]] = None
 
-    pandas: tp.Optional[pd.DataFrame] = None
+    pandas: "tp.Optional[pandas.DataFrame]" = None
     pyspark: tp.Any = None
 
     def is_empty(self) -> bool:
         return self.table is None and (self.batches is None or len(self.batches) == 0)
 
     @staticmethod
-    def create_empty() -> DataItem:
+    def create_empty() -> "DataItem":
         return DataItem(pa.schema([]))
 
 
@@ -74,17 +81,21 @@ class DataView:
 
     parts: tp.Dict[DataPartKey, tp.List[DataItem]]
 
+    @staticmethod
+    def create_empty() -> "DataView":
+        return DataView(_meta.SchemaDefinition(), pa.schema([]), dict())
+
     @staticmethod
     def for_trac_schema(trac_schema: _meta.SchemaDefinition):
         arrow_schema = DataMapping.trac_to_arrow_schema(trac_schema)
         return DataView(trac_schema, arrow_schema, dict())
 
-    def is_empty(self) -> bool:
-        return self.parts is None or len(self.parts) == 0
+    def with_trac_schema(self, trac_schema: _meta.SchemaDefinition):
+        arrow_schema = DataMapping.trac_to_arrow_schema(trac_schema)
+        return DataView(trac_schema, arrow_schema, self.parts)
 
-    @staticmethod
-    def create_empty() -> DataView:
-        return DataView(_meta.SchemaDefinition(), pa.schema([]), dict())
+    def is_empty(self) -> bool:
+        return self.parts is None or not any(self.parts.values())
 
 
 class _DataInternal:
@@ -121,14 +132,14 @@ class DataMapping:
     }
 
     # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_VERSION_ELEMENTS = pd.__version__.split(".")
+    __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
     __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
     __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
 
     if __PANDAS_MAJOR_VERSION == 2:
 
-        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
 
     @classmethod
     def __pandas_datetime_type(cls, tz, unit):
@@ -136,41 +147,61 @@ class DataMapping:
                 return cls.__PANDAS_DATETIME_TYPE
             _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
             if tz is None:
-                return pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+                return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
             else:
-                return pd.DatetimeTZDtype(tz=tz, unit=_unit)
+                return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
 
-    # Minimum supported version for Pandas is 1.2, when pd.Float64Dtype was introduced
+    # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
     elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
 
-        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).dtype
-        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
+        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
 
         @classmethod
         def __pandas_datetime_type(cls, tz, unit):  # noqa
             if tz is None:
                 return cls.__PANDAS_DATETIME_TYPE
             else:
-                return pd.DatetimeTZDtype(tz=tz)
+                return pandas.DatetimeTZDtype(tz=tz)
 
     else:
-        raise _ex.EStartup(f"Pandas version not supported: [{pd.__version__}]")
+        raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
 
     # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
     __ARROW_TO_PANDAS_TYPE_MAPPING = {
-        pa.bool_(): pd.BooleanDtype(),
-        pa.int8(): pd.Int8Dtype(),
-        pa.int16(): pd.Int16Dtype(),
-        pa.int32(): pd.Int32Dtype(),
-        pa.int64(): pd.Int64Dtype(),
-        pa.uint8(): pd.UInt8Dtype(),
-        pa.uint16(): pd.UInt16Dtype(),
-        pa.uint32(): pd.UInt32Dtype(),
-        pa.uint64(): pd.UInt64Dtype(),
-        pa.float16(): pd.Float32Dtype(),
-        pa.float32(): pd.Float32Dtype(),
-        pa.float64(): pd.Float64Dtype(),
-        pa.utf8(): pd.StringDtype()
+        pa.bool_(): pandas.BooleanDtype(),
+        pa.int8(): pandas.Int8Dtype(),
+        pa.int16(): pandas.Int16Dtype(),
+        pa.int32(): pandas.Int32Dtype(),
+        pa.int64(): pandas.Int64Dtype(),
+        pa.uint8(): pandas.UInt8Dtype(),
+        pa.uint16(): pandas.UInt16Dtype(),
+        pa.uint32(): pandas.UInt32Dtype(),
+        pa.uint64(): pandas.UInt64Dtype(),
+        pa.float16(): pandas.Float32Dtype(),
+        pa.float32(): pandas.Float32Dtype(),
+        pa.float64(): pandas.Float64Dtype(),
+        pa.string(): pandas.StringDtype(),
+        pa.utf8(): pandas.StringDtype()
+    }
+
+    __ARROW_TO_TRAC_BASIC_TYPE_MAPPING = {
+        pa.bool_(): _meta.BasicType.BOOLEAN,
+        pa.int8(): _meta.BasicType.INTEGER,
+        pa.int16(): _meta.BasicType.INTEGER,
+        pa.int32(): _meta.BasicType.INTEGER,
+        pa.int64():_meta.BasicType.INTEGER,
+        pa.uint8(): _meta.BasicType.INTEGER,
+        pa.uint16(): _meta.BasicType.INTEGER,
+        pa.uint32(): _meta.BasicType.INTEGER,
+        pa.uint64(): _meta.BasicType.INTEGER,
+        pa.float16(): _meta.BasicType.FLOAT,
+        pa.float32(): _meta.BasicType.FLOAT,
+        pa.float64(): _meta.BasicType.FLOAT,
+        pa.string(): _meta.BasicType.STRING,
+        pa.utf8(): _meta.BasicType.STRING,
+        pa.date32(): _meta.BasicType.DATE,
+        pa.date64(): _meta.BasicType.DATE
     }
 
 
@@ -265,6 +296,47 @@ class DataMapping:
             cls.__TRAC_DECIMAL_PRECISION,
             cls.__TRAC_DECIMAL_SCALE)
 
+    @classmethod
+    def arrow_to_trac_schema(cls, arrow_schema: pa.Schema) -> _meta.SchemaDefinition:
+
+        trac_fields = list(
+            cls.arrow_to_trac_field(i, arrow_schema.field(i))
+            for (i, f) in enumerate(arrow_schema.names))
+
+        return _meta.SchemaDefinition(
+            schemaType=_meta.SchemaType.TABLE,
+            partType=_meta.PartType.PART_ROOT,
+            table=_meta.TableSchema(trac_fields))
+
+    @classmethod
+    def arrow_to_trac_field(cls, field_index: int, field: pa.Field) -> _meta.FieldSchema:
+
+        field_type = cls.arrow_to_trac_type(field.type)
+        label = field.metadata["label"] if field.metadata and "label" in field.metadata else field.name
+
+        return _meta.FieldSchema(
+            field.name, field_index, field_type,
+            label=label,
+            businessKey=False,
+            notNull=not field.nullable,
+            categorical=False)
+
+    @classmethod
+    def arrow_to_trac_type(cls, arrow_type: pa.DataType) -> _meta.BasicType:
+
+        mapped_basic_type = cls.__ARROW_TO_TRAC_BASIC_TYPE_MAPPING.get(arrow_type)  # noqa
+
+        if mapped_basic_type is not None:
+            return mapped_basic_type
+
+        if pa.types.is_decimal(arrow_type):
+            return _meta.BasicType.DECIMAL
+
+        if pa.types.is_timestamp(arrow_type):
+            return _meta.BasicType.DATETIME
+
+        raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
+
     @classmethod
     def pandas_date_type(cls):
         return cls.__PANDAS_DATE_TYPE
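Editorial illustration of the new Arrow-to-TRAC schema mapping (a sketch assuming tracdap-runtime 0.6.5 and pyarrow are installed; DataMapping lives in the internal module tracdap.rt._impl.data, so this is not public API):

import pyarrow as pa
import tracdap.rt._impl.data as _data

arrow_schema = pa.schema([
    pa.field("id", pa.int64(), nullable=False),  # maps to INTEGER, notNull=True
    pa.field("amount", pa.decimal128(38, 12)),   # decimal fallback -> DECIMAL
    pa.field("created", pa.timestamp("ms")),     # timestamp fallback -> DATETIME
])

trac_schema = _data.DataMapping.arrow_to_trac_schema(arrow_schema)
print(trac_schema.schemaType)  # SchemaType.TABLE, with PART_ROOT partitioning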
@@ -275,18 +347,31 @@ class DataMapping:
 
     @classmethod
     def view_to_pandas(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
-            temporal_objects_flag: bool) -> pd.DataFrame:
+            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
+            temporal_objects_flag: bool) -> "pandas.DataFrame":
 
         table = cls.view_to_arrow(view, part)
         return cls.arrow_to_pandas(table, schema, temporal_objects_flag)
 
     @classmethod
-    def pandas_to_item(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema]) -> DataItem:
+    def view_to_polars(
+            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema]):
+
+        table = cls.view_to_arrow(view, part)
+        return cls.arrow_to_polars(table, schema)
+
+    @classmethod
+    def pandas_to_item(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
 
         table = cls.pandas_to_arrow(df, schema)
         return DataItem(table.schema, table)
 
+    @classmethod
+    def polars_to_item(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
+
+        table = cls.polars_to_arrow(df, schema)
+        return DataItem(table.schema, table)
+
     @classmethod
     def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:
 
@@ -336,7 +421,7 @@ class DataMapping:
     @classmethod
     def arrow_to_pandas(
             cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None,
-            temporal_objects_flag: bool = False) -> pd.DataFrame:
+            temporal_objects_flag: bool = False) -> "pandas.DataFrame":
 
         if schema is not None:
             table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
@@ -361,7 +446,18 @@ class DataMapping:
             split_blocks=True)  # noqa
 
     @classmethod
-    def pandas_to_arrow(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+    def arrow_to_polars(
+            cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> "polars.DataFrame":
+
+        if schema is not None:
+            table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+        else:
+            DataConformance.check_duplicate_fields(table.schema.names, False)
+
+        return polars.from_arrow(table)
+
+    @classmethod
+    def pandas_to_arrow(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema] = None) -> pa.Table:
 
         # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
         # Calling Table.from_pandas with the supplied schema will very often reject data
@@ -403,6 +499,30 @@ class DataMapping:
         df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
         return DataConformance.conform_to_schema(table, schema, df_types)
 
+    @classmethod
+    def pandas_to_arrow_schema(cls, df: "pandas.DataFrame") -> pa.Schema:
+
+        return pa.Schema.from_pandas(df, preserve_index=False)  # noqa
+
+    @classmethod
+    def polars_to_arrow(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+        column_filter = DataConformance.column_filter(df.columns, schema)
+
+        filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
+        table = filtered_df.to_arrow()
+
+        if schema is None:
+            DataConformance.check_duplicate_fields(table.schema.names, False)
+            return table
+        else:
+            return DataConformance.conform_to_schema(table, schema, None)
+
+    @classmethod
+    def polars_to_arrow_schema(cls, df: "polars.DataFrame") -> pa.Schema:
+
+        return df.top_k(1).to_arrow().schema
+
 
 class DataConformance:
 
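Editorial illustration: a polars round trip through the new conversion methods (assuming polars is installed; again, DataMapping is internal API, shown only to make the conversion path concrete):

import polars as pl
import tracdap.rt._impl.data as _data

df = pl.DataFrame({"id": [1, 2, 3], "flag": [True, False, True]})

table = _data.DataMapping.polars_to_arrow(df)       # polars -> pa.Table
df_back = _data.DataMapping.arrow_to_polars(table)  # pa.Table -> polars

assert df_back.to_dicts() == df.to_dicts()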
@@ -719,21 +839,32 @@ class DataConformance:
     @classmethod
     def _coerce_string(cls, vector: pa.Array, field: pa.Field) -> pa.Array:
 
-        if pa.types.is_string(field.type):
-            if pa.types.is_string(vector.type):
-                return vector
+        try:
 
-        if pa.types.is_large_string(field.type):
-            if pa.types.is_large_string(vector.type):
-                return vector
-            # Allow up-casting string -> large_string
-            if pa.types.is_string(vector.type):
-                return pc.cast(vector, field.type)
+            if pa.types.is_string(field.type):
+                if pa.types.is_string(vector.type):
+                    return vector
+                # Try to down-cast large string -> string, will raise ArrowInvalid if data does not fit
+                if pa.types.is_large_string(vector.type):
+                    return pc.cast(vector, field.type, safe=True)
+
+            if pa.types.is_large_string(field.type):
+                if pa.types.is_large_string(vector.type):
+                    return vector
+                # Allow up-casting string -> large_string
+                if pa.types.is_string(vector.type):
+                    return pc.cast(vector, field.type)
 
-        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
-        cls.__log.error(error_message)
+            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message)
+
+        except pa.ArrowInvalid as e:
+
+            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message) from e
 
-        raise _ex.EDataConformance(error_message)
 
     @classmethod
     def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
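Editorial note: the rewritten _coerce_string funnels all paths through one try/except, relying on pc.cast(..., safe=True) raising pa.ArrowInvalid when a large_string column cannot be narrowed without data loss, and converting that into an EDataConformance error. The pyarrow mechanism is easier to demonstrate with a smaller type pair (an illustration of pyarrow behaviour only, not tracdap code):

import pyarrow as pa
import pyarrow.compute as pc

vector = pa.array([1.0, 2.5])

try:
    pc.cast(vector, pa.int64(), safe=True)  # 2.5 cannot be represented exactly as int64
except pa.ArrowInvalid as e:
    print(f"Safe cast rejected lossy conversion: {e}")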
@@ -751,7 +882,7 @@ class DataConformance:
         # For Pandas 2.x dates are still np.datetime64 but can be in s, ms, us or ns
         # This conversion will not apply to dates held in Pandas using the Python date object types
         if pandas_type is not None:
-            if pa.types.is_timestamp(vector.type) and pd.api.types.is_datetime64_any_dtype(pandas_type):
+            if pa.types.is_timestamp(vector.type) and pandas.api.types.is_datetime64_any_dtype(pandas_type):
                 return pc.cast(vector, field.type)
 
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py CHANGED
@@ -16,7 +16,7 @@ from tracdap.rt._impl.grpc.tracdap.metadata import type_pb2 as tracdap_dot_rt_do
 from tracdap.rt._impl.grpc.tracdap.metadata import object_id_pb2 as tracdap_dot_rt_dot___impl_dot_grpc_dot_tracdap_dot_metadata_dot_object__id__pb2
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1tracdap/rt/_impl/grpc/tracdap/metadata/data.proto\x12\x10tracdap.metadata\x1a\x31tracdap/rt/_impl/grpc/tracdap/metadata/type.proto\x1a\x36tracdap/rt/_impl/grpc/tracdap/metadata/object_id.proto\"\xe7\x01\n\x0b\x46ieldSchema\x12\x11\n\tfieldName\x18\x01 \x01(\t\x12\x12\n\nfieldOrder\x18\x02 \x01(\x11\x12.\n\tfieldType\x18\x03 \x01(\x0e\x32\x1b.tracdap.metadata.BasicType\x12\r\n\x05label\x18\x04 \x01(\t\x12\x13\n\x0b\x62usinessKey\x18\x05 \x01(\x08\x12\x13\n\x0b\x63\x61tegorical\x18\x06 \x01(\x08\x12\x14\n\x07notNull\x18\x08 \x01(\x08H\x00\x88\x01\x01\x12\x17\n\nformatCode\x18\x07 \x01(\tH\x01\x88\x01\x01\x42\n\n\x08_notNullB\r\n\x0b_formatCode\"<\n\x0bTableSchema\x12-\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x1d.tracdap.metadata.FieldSchema\"\xba\x01\n\x10SchemaDefinition\x12\x30\n\nschemaType\x18\x01 \x01(\x0e\x32\x1c.tracdap.metadata.SchemaType\x12,\n\x08partType\x18\x02 \x01(\x0e\x32\x1a.tracdap.metadata.PartType\x12.\n\x05table\x18\x03 \x01(\x0b\x32\x1d.tracdap.metadata.TableSchemaH\x00\x42\x16\n\x14schemaTypeDefinition\"\x81\x02\n\x07PartKey\x12\x11\n\topaqueKey\x18\x01 \x01(\t\x12,\n\x08partType\x18\x02 \x01(\x0e\x32\x1a.tracdap.metadata.PartType\x12+\n\npartValues\x18\x03 \x03(\x0b\x32\x17.tracdap.metadata.Value\x12\x32\n\x0cpartRangeMin\x18\x04 \x01(\x0b\x32\x17.tracdap.metadata.ValueH\x00\x88\x01\x01\x12\x32\n\x0cpartRangeMax\x18\x05 \x01(\x0b\x32\x17.tracdap.metadata.ValueH\x01\x88\x01\x01\x42\x0f\n\r_partRangeMinB\x0f\n\r_partRangeMax\"\xba\x04\n\x0e\x44\x61taDefinition\x12\x31\n\x08schemaId\x18\x01 \x01(\x0b\x32\x1d.tracdap.metadata.TagSelectorH\x00\x12\x34\n\x06schema\x18\x02 \x01(\x0b\x32\".tracdap.metadata.SchemaDefinitionH\x00\x12:\n\x05parts\x18\x03 \x03(\x0b\x32+.tracdap.metadata.DataDefinition.PartsEntry\x12\x30\n\tstorageId\x18\x04 \x01(\x0b\x32\x1d.tracdap.metadata.TagSelector\x1a-\n\x05\x44\x65lta\x12\x12\n\ndeltaIndex\x18\x01 \x01(\r\x12\x10\n\x08\x64\x61taItem\x18\x02 \x01(\t\x1aQ\n\x04Snap\x12\x11\n\tsnapIndex\x18\x01 \x01(\r\x12\x36\n\x06\x64\x65ltas\x18\x02 \x03(\x0b\x32&.tracdap.metadata.DataDefinition.Delta\x1ag\n\x04Part\x12*\n\x07partKey\x18\x01 \x01(\x0b\x32\x19.tracdap.metadata.PartKey\x12\x33\n\x04snap\x18\x02 \x01(\x0b\x32%.tracdap.metadata.DataDefinition.Snap\x1aS\n\nPartsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x34\n\x05value\x18\x02 \x01(\x0b\x32%.tracdap.metadata.DataDefinition.Part:\x02\x38\x01\x42\x11\n\x0fschemaSpecifier*0\n\nSchemaType\x12\x17\n\x13SCHEMA_TYPE_NOT_SET\x10\x00\x12\t\n\x05TABLE\x10\x01*?\n\x08PartType\x12\r\n\tPART_ROOT\x10\x00\x12\x11\n\rPART_BY_RANGE\x10\x01\x12\x11\n\rPART_BY_VALUE\x10\x02\x42\x1e\n\x1aorg.finos.tracdap.metadataP\x01\x62\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1tracdap/rt/_impl/grpc/tracdap/metadata/data.proto\x12\x10tracdap.metadata\x1a\x31tracdap/rt/_impl/grpc/tracdap/metadata/type.proto\x1a\x36tracdap/rt/_impl/grpc/tracdap/metadata/object_id.proto\"\xe7\x01\n\x0b\x46ieldSchema\x12\x11\n\tfieldName\x18\x01 \x01(\t\x12\x12\n\nfieldOrder\x18\x02 \x01(\x11\x12.\n\tfieldType\x18\x03 \x01(\x0e\x32\x1b.tracdap.metadata.BasicType\x12\r\n\x05label\x18\x04 \x01(\t\x12\x13\n\x0b\x62usinessKey\x18\x05 \x01(\x08\x12\x13\n\x0b\x63\x61tegorical\x18\x06 \x01(\x08\x12\x14\n\x07notNull\x18\x08 \x01(\x08H\x00\x88\x01\x01\x12\x17\n\nformatCode\x18\x07 \x01(\tH\x01\x88\x01\x01\x42\n\n\x08_notNullB\r\n\x0b_formatCode\"<\n\x0bTableSchema\x12-\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x1d.tracdap.metadata.FieldSchema\"\xb3\x01\n\x10SchemaDefinition\x12\x30\n\nschemaType\x18\x01 \x01(\x0e\x32\x1c.tracdap.metadata.SchemaType\x12,\n\x08partType\x18\x02 \x01(\x0e\x32\x1a.tracdap.metadata.PartType\x12.\n\x05table\x18\x03 \x01(\x0b\x32\x1d.tracdap.metadata.TableSchemaH\x00\x42\x0f\n\rschemaDetails\"\x81\x02\n\x07PartKey\x12\x11\n\topaqueKey\x18\x01 \x01(\t\x12,\n\x08partType\x18\x02 \x01(\x0e\x32\x1a.tracdap.metadata.PartType\x12+\n\npartValues\x18\x03 \x03(\x0b\x32\x17.tracdap.metadata.Value\x12\x32\n\x0cpartRangeMin\x18\x04 \x01(\x0b\x32\x17.tracdap.metadata.ValueH\x00\x88\x01\x01\x12\x32\n\x0cpartRangeMax\x18\x05 \x01(\x0b\x32\x17.tracdap.metadata.ValueH\x01\x88\x01\x01\x42\x0f\n\r_partRangeMinB\x0f\n\r_partRangeMax\"\xba\x04\n\x0e\x44\x61taDefinition\x12\x31\n\x08schemaId\x18\x01 \x01(\x0b\x32\x1d.tracdap.metadata.TagSelectorH\x00\x12\x34\n\x06schema\x18\x02 \x01(\x0b\x32\".tracdap.metadata.SchemaDefinitionH\x00\x12:\n\x05parts\x18\x03 \x03(\x0b\x32+.tracdap.metadata.DataDefinition.PartsEntry\x12\x30\n\tstorageId\x18\x04 \x01(\x0b\x32\x1d.tracdap.metadata.TagSelector\x1a-\n\x05\x44\x65lta\x12\x12\n\ndeltaIndex\x18\x01 \x01(\r\x12\x10\n\x08\x64\x61taItem\x18\x02 \x01(\t\x1aQ\n\x04Snap\x12\x11\n\tsnapIndex\x18\x01 \x01(\r\x12\x36\n\x06\x64\x65ltas\x18\x02 \x03(\x0b\x32&.tracdap.metadata.DataDefinition.Delta\x1ag\n\x04Part\x12*\n\x07partKey\x18\x01 \x01(\x0b\x32\x19.tracdap.metadata.PartKey\x12\x33\n\x04snap\x18\x02 \x01(\x0b\x32%.tracdap.metadata.DataDefinition.Snap\x1aS\n\nPartsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x34\n\x05value\x18\x02 \x01(\x0b\x32%.tracdap.metadata.DataDefinition.Part:\x02\x38\x01\x42\x11\n\x0fschemaSpecifier*0\n\nSchemaType\x12\x17\n\x13SCHEMA_TYPE_NOT_SET\x10\x00\x12\t\n\x05TABLE\x10\x01*?\n\x08PartType\x12\r\n\tPART_ROOT\x10\x00\x12\x11\n\rPART_BY_RANGE\x10\x01\x12\x11\n\rPART_BY_VALUE\x10\x02\x42\x1e\n\x1aorg.finos.tracdap.metadataP\x01\x62\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -26,26 +26,26 @@ if _descriptor._USE_C_DESCRIPTORS == False:
   _globals['DESCRIPTOR']._serialized_options = b'\n\032org.finos.tracdap.metadataP\001'
   _globals['_DATADEFINITION_PARTSENTRY']._options = None
   _globals['_DATADEFINITION_PARTSENTRY']._serialized_options = b'8\001'
-  _globals['_SCHEMATYPE']._serialized_start=1496
-  _globals['_SCHEMATYPE']._serialized_end=1544
-  _globals['_PARTTYPE']._serialized_start=1546
-  _globals['_PARTTYPE']._serialized_end=1609
+  _globals['_SCHEMATYPE']._serialized_start=1489
+  _globals['_SCHEMATYPE']._serialized_end=1537
+  _globals['_PARTTYPE']._serialized_start=1539
+  _globals['_PARTTYPE']._serialized_end=1602
   _globals['_FIELDSCHEMA']._serialized_start=179
   _globals['_FIELDSCHEMA']._serialized_end=410
   _globals['_TABLESCHEMA']._serialized_start=412
   _globals['_TABLESCHEMA']._serialized_end=472
   _globals['_SCHEMADEFINITION']._serialized_start=475
-  _globals['_SCHEMADEFINITION']._serialized_end=661
-  _globals['_PARTKEY']._serialized_start=664
-  _globals['_PARTKEY']._serialized_end=921
-  _globals['_DATADEFINITION']._serialized_start=924
-  _globals['_DATADEFINITION']._serialized_end=1494
-  _globals['_DATADEFINITION_DELTA']._serialized_start=1157
-  _globals['_DATADEFINITION_DELTA']._serialized_end=1202
-  _globals['_DATADEFINITION_SNAP']._serialized_start=1204
-  _globals['_DATADEFINITION_SNAP']._serialized_end=1285
-  _globals['_DATADEFINITION_PART']._serialized_start=1287
-  _globals['_DATADEFINITION_PART']._serialized_end=1390
-  _globals['_DATADEFINITION_PARTSENTRY']._serialized_start=1392
-  _globals['_DATADEFINITION_PARTSENTRY']._serialized_end=1475
+  _globals['_SCHEMADEFINITION']._serialized_end=654
+  _globals['_PARTKEY']._serialized_start=657
+  _globals['_PARTKEY']._serialized_end=914
+  _globals['_DATADEFINITION']._serialized_start=917
+  _globals['_DATADEFINITION']._serialized_end=1487
+  _globals['_DATADEFINITION_DELTA']._serialized_start=1150
+  _globals['_DATADEFINITION_DELTA']._serialized_end=1195
+  _globals['_DATADEFINITION_SNAP']._serialized_start=1197
+  _globals['_DATADEFINITION_SNAP']._serialized_end=1278
+  _globals['_DATADEFINITION_PART']._serialized_start=1280
+  _globals['_DATADEFINITION_PART']._serialized_end=1383
+  _globals['_DATADEFINITION_PARTSENTRY']._serialized_start=1385
+  _globals['_DATADEFINITION_PARTSENTRY']._serialized_end=1468
 # @@protoc_insertion_point(module_scope)