tracdap-runtime 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracdap/rt/_exec/context.py +572 -112
- tracdap/rt/_exec/dev_mode.py +166 -97
- tracdap/rt/_exec/engine.py +120 -9
- tracdap/rt/_exec/functions.py +137 -35
- tracdap/rt/_exec/graph.py +38 -13
- tracdap/rt/_exec/graph_builder.py +120 -9
- tracdap/rt/_impl/data.py +183 -52
- tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +18 -18
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +74 -30
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +120 -2
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +20 -18
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.pyi +22 -6
- tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +29 -0
- tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.pyi +16 -0
- tracdap/rt/_impl/models.py +8 -0
- tracdap/rt/_impl/static_api.py +42 -10
- tracdap/rt/_impl/storage.py +37 -25
- tracdap/rt/_impl/validation.py +113 -11
- tracdap/rt/_plugins/repo_git.py +1 -1
- tracdap/rt/_version.py +1 -1
- tracdap/rt/api/experimental.py +220 -0
- tracdap/rt/api/hook.py +6 -4
- tracdap/rt/api/model_api.py +98 -13
- tracdap/rt/api/static_api.py +14 -6
- tracdap/rt/config/__init__.py +2 -2
- tracdap/rt/config/common.py +23 -17
- tracdap/rt/config/job.py +2 -2
- tracdap/rt/config/platform.py +25 -25
- tracdap/rt/config/result.py +2 -2
- tracdap/rt/config/runtime.py +3 -3
- tracdap/rt/launch/cli.py +7 -4
- tracdap/rt/launch/launch.py +19 -3
- tracdap/rt/metadata/__init__.py +25 -20
- tracdap/rt/metadata/common.py +2 -2
- tracdap/rt/metadata/custom.py +3 -3
- tracdap/rt/metadata/data.py +12 -12
- tracdap/rt/metadata/file.py +6 -6
- tracdap/rt/metadata/flow.py +6 -6
- tracdap/rt/metadata/job.py +62 -8
- tracdap/rt/metadata/model.py +33 -11
- tracdap/rt/metadata/object_id.py +8 -8
- tracdap/rt/metadata/resource.py +24 -0
- tracdap/rt/metadata/search.py +5 -5
- tracdap/rt/metadata/stoarge.py +6 -6
- tracdap/rt/metadata/tag.py +1 -1
- tracdap/rt/metadata/tag_update.py +1 -1
- tracdap/rt/metadata/type.py +4 -4
- {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/METADATA +3 -1
- {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/RECORD +52 -48
- {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/LICENSE +0 -0
- {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/WHEEL +0 -0
- {tracdap_runtime-0.6.3.dist-info → tracdap_runtime-0.6.5.dist-info}/top_level.txt +0 -0
tracdap/rt/_exec/graph_builder.py  CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import annotations
-
 import tracdap.rt.config as config
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._impl.data as _data  # noqa
@@ -42,6 +40,9 @@ class GraphBuilder:
         if job_config.job.jobType == meta.JobType.RUN_FLOW:
             return cls.build_standard_job(job_config, result_spec, cls.build_run_flow_job)
 
+        if job_config.job.jobType in [meta.JobType.IMPORT_DATA, meta.JobType.EXPORT_DATA]:
+            return cls.build_standard_job(job_config, result_spec, cls.build_import_export_data_job)
+
         raise _ex.EConfigParse(f"Job type [{job_config.job.jobType}] is not supported yet")
 
     @classmethod
@@ -114,6 +115,28 @@ class GraphBuilder:
 
         return cls._join_sections(main_section, result_section)
 
+    @classmethod
+    def build_import_export_data_job(
+            cls, job_config: config.JobConfig, result_spec: JobResultSpec,
+            job_namespace: NodeNamespace, job_push_id: NodeId) \
+            -> GraphSection:
+
+        # TODO: These are processed as regular calculation jobs for now
+        # That might be ok, but is worth reviewing
+
+        if job_config.job.jobType == meta.JobType.IMPORT_DATA:
+            job_def = job_config.job.importData
+        else:
+            job_def = job_config.job.exportData
+
+        target_selector = job_def.model
+        target_obj = _util.get_job_resource(target_selector, job_config)
+        target_def = target_obj.model
+
+        return cls.build_calculation_job(
+            job_config, result_spec, job_namespace, job_push_id,
+            target_selector, target_def, job_def)
+
     @classmethod
     def build_run_model_job(
             cls, job_config: config.JobConfig, result_spec: JobResultSpec,
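Note: IMPORT_DATA and EXPORT_DATA jobs are deliberately routed through the existing calculation-job machinery, with the import/export model standing in as the calculation target. A minimal sketch of the new dispatch rule in build_job, using stand-in values for the metadata enums (the real members live in tracdap.rt.metadata):

    import enum

    class JobType(enum.Enum):
        # Stand-ins for the tracdap.rt.metadata.JobType members used above
        RUN_MODEL = 1
        RUN_FLOW = 2
        IMPORT_DATA = 3
        EXPORT_DATA = 4

    def select_builder(job_type: JobType) -> str:
        # Mirrors the branch added to GraphBuilder.build_job in this diff
        if job_type == JobType.RUN_FLOW:
            return "build_run_flow_job"
        if job_type in (JobType.IMPORT_DATA, JobType.EXPORT_DATA):
            return "build_import_export_data_job"
        raise ValueError(f"Job type [{job_type}] is not supported yet")

    assert select_builder(JobType.EXPORT_DATA) == "build_import_export_data_job"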
@@ -380,6 +403,65 @@ class GraphBuilder:
 
         return GraphSection(nodes, inputs=inputs)
 
+    @classmethod
+    def build_runtime_outputs(cls, output_names: tp.List[str], job_namespace: NodeNamespace):
+
+        # TODO: Factor out common logic with regular job outputs (including static / dynamic)
+
+        nodes = {}
+        inputs = set()
+        outputs = list()
+
+        for output_name in output_names:
+
+            # Output data view must already exist in the namespace
+            data_view_id = NodeId.of(output_name, job_namespace, _data.DataView)
+            data_spec_id = NodeId.of(f"{output_name}:SPEC", job_namespace, _data.DataSpec)
+
+            data_key = output_name + ":DATA"
+            data_id = _util.new_object_id(meta.ObjectType.DATA)
+            storage_key = output_name + ":STORAGE"
+            storage_id = _util.new_object_id(meta.ObjectType.STORAGE)
+
+            data_spec_node = DynamicDataSpecNode(
+                data_spec_id, data_view_id,
+                data_id, storage_id,
+                prior_data_spec=None)
+
+            output_data_key = _util.object_key(data_id)
+            output_storage_key = _util.object_key(storage_id)
+
+            # Map one data item from each view, since outputs are single part/delta
+            data_item_id = NodeId(f"{output_name}:ITEM", job_namespace, _data.DataItem)
+            data_item_node = DataItemNode(data_item_id, data_view_id)
+
+            # Create a physical save operation for the data item
+            data_save_id = NodeId.of(f"{output_name}:SAVE", job_namespace, None)
+            data_save_node = SaveDataNode(data_save_id, data_spec_id, data_item_id)
+
+            data_result_id = NodeId.of(f"{output_name}:RESULT", job_namespace, ObjectBundle)
+            data_result_node = DataResultNode(
+                data_result_id, output_name,
+                data_item_id, data_spec_id, data_save_id,
+                output_data_key, output_storage_key)
+
+            nodes[data_spec_id] = data_spec_node
+            nodes[data_item_id] = data_item_node
+            nodes[data_save_id] = data_save_node
+            nodes[data_result_id] = data_result_node
+
+            # Job-level data view is an input to the save operation
+            inputs.add(data_view_id)
+            outputs.append(data_result_id)
+
+        runtime_outputs = JobOutputs(bundles=outputs)
+        runtime_outputs_id = NodeId.of("trac_runtime_outputs", job_namespace, JobOutputs)
+        runtime_outputs_node = RuntimeOutputsNode(runtime_outputs_id, runtime_outputs)
+
+        nodes[runtime_outputs_id] = runtime_outputs_node
+
+        return GraphSection(nodes, inputs=inputs, outputs={runtime_outputs_id})
+
     @classmethod
     def build_job_results(
             cls, job_config: cfg.JobConfig, job_namespace: NodeNamespace, result_spec: JobResultSpec,
@@ -396,7 +478,8 @@ class GraphBuilder:
 
             build_result_node = BuildJobResultNode(
                 build_result_id, job_config.jobId,
-                objects=objects,
+                outputs = JobOutputs(objects=objects),
+                explicit_deps=explicit_deps)
 
         elif bundles is not None:
 
@@ -404,7 +487,8 @@ class GraphBuilder:
 
             build_result_node = BuildJobResultNode(
                 build_result_id, job_config.jobId,
-                bundles=bundles,
+                outputs = JobOutputs(bundles=bundles),
+                explicit_deps=explicit_deps)
 
         else:
             raise _ex.EUnexpected()
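Note: both result paths now funnel through a single JobOutputs container instead of passing separate objects/bundles arguments to BuildJobResultNode, and build_runtime_outputs above uses the same container. A sketch of the shape implied by its usage in this diff (the real definition lives in tracdap/rt/_exec/graph.py and is not shown here):

    import dataclasses as dc
    import typing as tp

    @dc.dataclass
    class JobOutputsSketch:
        # Implied by JobOutputs(objects=...) and JobOutputs(bundles=...) above;
        # field types are assumptions for illustration only
        objects: tp.Dict[str, str] = dc.field(default_factory=dict)
        bundles: tp.List[str] = dc.field(default_factory=list)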
@@ -459,7 +543,7 @@ class GraphBuilder:
             -> GraphSection:
 
         if model_or_flow.objectType == meta.ObjectType.MODEL:
-            return cls.build_model(namespace, model_or_flow.model, explicit_deps)
+            return cls.build_model(job_config, namespace, model_or_flow.model, explicit_deps)
 
         elif model_or_flow.objectType == meta.ObjectType.FLOW:
             return cls.build_flow(job_config, namespace, model_or_flow.flow)
@@ -469,11 +553,13 @@ class GraphBuilder:
 
     @classmethod
     def build_model(
-            cls, namespace: NodeNamespace,
+            cls, job_config: config.JobConfig, namespace: NodeNamespace,
             model_def: meta.ModelDefinition,
             explicit_deps: tp.Optional[tp.List[NodeId]] = None) \
             -> GraphSection:
 
+        cls.check_model_type(job_config, model_def)
+
         def param_id(node_name):
             return NodeId(node_name, namespace, meta.Value)
 
@@ -485,6 +571,14 @@ class GraphBuilder:
         input_ids = set(map(data_id, model_def.inputs))
         output_ids = set(map(data_id, model_def.outputs))
 
+        # Set up storage access for import / export data jobs
+        if job_config.job.jobType == meta.JobType.IMPORT_DATA:
+            storage_access = job_config.job.importData.storageAccess
+        elif job_config.job.jobType == meta.JobType.EXPORT_DATA:
+            storage_access = job_config.job.exportData.storageAccess
+        else:
+            storage_access = None
+
         # Create the model node
         # Always add the prior graph root ID as a dependency
         # This is to ensure dependencies are still pulled in for models with no inputs!
@@ -500,7 +594,8 @@ class GraphBuilder:
         model_node = RunModelNode(
             model_id, model_scope, model_def,
             frozenset(parameter_ids), frozenset(input_ids),
-            explicit_deps=explicit_deps, bundle=model_id.namespace)
+            explicit_deps=explicit_deps, bundle=model_id.namespace,
+            storage_access=storage_access)
 
         model_result_id = NodeId(f"{model_name}:RESULT", namespace)
         model_result_node = RunModelResultNode(model_result_id, model_id)
@@ -637,6 +732,7 @@ class GraphBuilder:
 
         # Explicit check for model compatibility - report an error now, do not try build_model()
         cls.check_model_compatibility(model_selector, model_obj.model, node_name, node)
+        cls.check_model_type(job_config, model_obj.model)
 
         return cls.build_model_or_flow_with_context(
             job_config, namespace, node_name, model_obj,
@@ -647,8 +743,8 @@ class GraphBuilder:
 
     @classmethod
     def check_model_compatibility(
-            cls, model_selector: meta.TagSelector,
-            node_name: str, flow_node: meta.FlowNode):
+            cls, model_selector: meta.TagSelector,
+            model_def: meta.ModelDefinition, node_name: str, flow_node: meta.FlowNode):
 
         model_params = list(sorted(model_def.parameters.keys()))
         model_inputs = list(sorted(model_def.inputs.keys()))
@@ -662,6 +758,21 @@ class GraphBuilder:
             model_key = _util.object_key(model_selector)
             raise _ex.EJobValidation(f"Incompatible model for flow node [{node_name}] (Model: [{model_key}])")
 
+    @classmethod
+    def check_model_type(cls, job_config: config.JobConfig, model_def: meta.ModelDefinition):
+
+        if job_config.job.jobType == meta.JobType.IMPORT_DATA:
+            allowed_model_types = [meta.ModelType.DATA_IMPORT_MODEL]
+        elif job_config.job.jobType == meta.JobType.EXPORT_DATA:
+            allowed_model_types = [meta.ModelType.DATA_EXPORT_MODEL]
+        else:
+            allowed_model_types = [meta.ModelType.STANDARD_MODEL]
+
+        if model_def.modelType not in allowed_model_types:
+            job_type = job_config.job.jobType.name
+            model_type = model_def.modelType.name
+            raise _ex.EJobValidation(f"Job type [{job_type}] cannot use model type [{model_type}]")
+
     @staticmethod
     def build_context_push(
             namespace: NodeNamespace, input_mapping: tp.Dict[str, NodeId],
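Note: check_model_type enforces a strict pairing between job types and model types, so a standard model can no longer be submitted as a data import or export. A standalone sketch of the rule (enum member names taken from the diff; the helper itself is illustrative, not runtime API):

    # Illustrative only: job type -> allowed model types, per check_model_type above
    ALLOWED_MODEL_TYPES = {
        "IMPORT_DATA": {"DATA_IMPORT_MODEL"},
        "EXPORT_DATA": {"DATA_EXPORT_MODEL"},
    }

    def allowed_model_types(job_type: str) -> set:
        # Every other job type falls back to STANDARD_MODEL
        return ALLOWED_MODEL_TYPES.get(job_type, {"STANDARD_MODEL"})

    assert allowed_model_types("IMPORT_DATA") == {"DATA_IMPORT_MODEL"}
    assert allowed_model_types("RUN_MODEL") == {"STANDARD_MODEL"}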
tracdap/rt/_impl/data.py  CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import annotations
-
 import dataclasses as dc
 import typing as tp
 import datetime as dt
@@ -22,7 +20,16 @@ import platform
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pandas as pd
+
+try:
+    import pandas  # noqa
+except ModuleNotFoundError:
+    pandas = None
+
+try:
+    import polars  # noqa
+except ModuleNotFoundError:
+    polars = None
 
 import tracdap.rt.metadata as _meta
 import tracdap.rt.exceptions as _ex
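Note: pandas becomes an optional dependency here and polars is supported for the first time, so the module has to import cleanly when either library (or both) is absent. Every conversion path must therefore guard on the module reference before using it. The guard pattern in miniature (require_pandas is a hypothetical helper, not part of this diff):

    try:
        import pandas  # noqa
    except ModuleNotFoundError:
        pandas = None  # the runtime still loads; pandas code paths are simply unavailable

    def require_pandas():
        # Hypothetical guard to call before any pandas conversion
        if pandas is None:
            raise RuntimeError("pandas is not installed in this environment")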
@@ -42,7 +49,7 @@ class DataSpec:
 class DataPartKey:
 
     @classmethod
-    def for_root(cls) -> DataPartKey:
+    def for_root(cls) -> "DataPartKey":
         return DataPartKey(opaque_key='part_root')
 
     opaque_key: str
@@ -55,14 +62,14 @@ class DataItem:
     table: tp.Optional[pa.Table] = None
     batches: tp.Optional[tp.List[pa.RecordBatch]] = None
 
-    pandas: tp.Optional[pd.DataFrame] = None
+    pandas: "tp.Optional[pandas.DataFrame]" = None
     pyspark: tp.Any = None
 
     def is_empty(self) -> bool:
         return self.table is None and (self.batches is None or len(self.batches) == 0)
 
     @staticmethod
-    def create_empty() -> DataItem:
+    def create_empty() -> "DataItem":
         return DataItem(pa.schema([]))
 
 
@@ -74,17 +81,21 @@ class DataView:
 
     parts: tp.Dict[DataPartKey, tp.List[DataItem]]
 
+    @staticmethod
+    def create_empty() -> "DataView":
+        return DataView(_meta.SchemaDefinition(), pa.schema([]), dict())
+
     @staticmethod
     def for_trac_schema(trac_schema: _meta.SchemaDefinition):
         arrow_schema = DataMapping.trac_to_arrow_schema(trac_schema)
         return DataView(trac_schema, arrow_schema, dict())
 
-    def create_empty() -> DataView:
-
+    def with_trac_schema(self, trac_schema: _meta.SchemaDefinition):
+        arrow_schema = DataMapping.trac_to_arrow_schema(trac_schema)
+        return DataView(trac_schema, arrow_schema, self.parts)
 
-
-
-        return DataView(_meta.SchemaDefinition(), pa.schema([]), dict())
+    def is_empty(self) -> bool:
+        return self.parts is None or not any(self.parts.values())
 
 
 class _DataInternal:
@@ -121,14 +132,14 @@ class DataMapping:
     }
 
     # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_VERSION_ELEMENTS = pd.__version__.split(".")
+    __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
     __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
     __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
 
     if __PANDAS_MAJOR_VERSION == 2:
 
-        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
+        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
 
     @classmethod
     def __pandas_datetime_type(cls, tz, unit):
@@ -136,41 +147,61 @@
             return cls.__PANDAS_DATETIME_TYPE
         _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
         if tz is None:
-            return pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+            return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
         else:
-            return pd.DatetimeTZDtype(tz=tz, unit=_unit)
+            return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
 
-    # Minimum supported version for Pandas is 1.2, when pd.Float64Dtype was introduced
+    # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
     elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
 
-        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).dtype
-        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
+        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
 
         @classmethod
         def __pandas_datetime_type(cls, tz, unit):  # noqa
             if tz is None:
                 return cls.__PANDAS_DATETIME_TYPE
             else:
-                return pd.DatetimeTZDtype(tz=tz)
+                return pandas.DatetimeTZDtype(tz=tz)
 
     else:
-        raise _ex.EStartup(f"Pandas version not supported: [{pd.__version__}]")
+        raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
 
     # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
     __ARROW_TO_PANDAS_TYPE_MAPPING = {
-        pa.bool_(): pd.BooleanDtype(),
-        pa.int8(): pd.Int8Dtype(),
-        pa.int16(): pd.Int16Dtype(),
-        pa.int32(): pd.Int32Dtype(),
-        pa.int64(): pd.Int64Dtype(),
-        pa.uint8(): pd.UInt8Dtype(),
-        pa.uint16(): pd.UInt16Dtype(),
-        pa.uint32(): pd.UInt32Dtype(),
-        pa.uint64(): pd.UInt64Dtype(),
-        pa.float16(): pd.Float32Dtype(),
-        pa.float32(): pd.Float32Dtype(),
-        pa.float64(): pd.Float64Dtype(),
-        pa.utf8(): pd.StringDtype()
+        pa.bool_(): pandas.BooleanDtype(),
+        pa.int8(): pandas.Int8Dtype(),
+        pa.int16(): pandas.Int16Dtype(),
+        pa.int32(): pandas.Int32Dtype(),
+        pa.int64(): pandas.Int64Dtype(),
+        pa.uint8(): pandas.UInt8Dtype(),
+        pa.uint16(): pandas.UInt16Dtype(),
+        pa.uint32(): pandas.UInt32Dtype(),
+        pa.uint64(): pandas.UInt64Dtype(),
+        pa.float16(): pandas.Float32Dtype(),
+        pa.float32(): pandas.Float32Dtype(),
+        pa.float64(): pandas.Float64Dtype(),
+        pa.string(): pandas.StringDtype(),
+        pa.utf8(): pandas.StringDtype()
+    }
+
+    __ARROW_TO_TRAC_BASIC_TYPE_MAPPING = {
+        pa.bool_(): _meta.BasicType.BOOLEAN,
+        pa.int8(): _meta.BasicType.INTEGER,
+        pa.int16(): _meta.BasicType.INTEGER,
+        pa.int32(): _meta.BasicType.INTEGER,
+        pa.int64():_meta.BasicType.INTEGER,
+        pa.uint8(): _meta.BasicType.INTEGER,
+        pa.uint16(): _meta.BasicType.INTEGER,
+        pa.uint32(): _meta.BasicType.INTEGER,
+        pa.uint64(): _meta.BasicType.INTEGER,
+        pa.float16(): _meta.BasicType.FLOAT,
+        pa.float32(): _meta.BasicType.FLOAT,
+        pa.float64(): _meta.BasicType.FLOAT,
+        pa.string(): _meta.BasicType.STRING,
+        pa.utf8(): _meta.BasicType.STRING,
+        pa.date32(): _meta.BasicType.DATE,
+        pa.date64(): _meta.BasicType.DATE
     }
 
     @staticmethod
@@ -265,6 +296,47 @@
             cls.__TRAC_DECIMAL_PRECISION,
             cls.__TRAC_DECIMAL_SCALE)
 
+    @classmethod
+    def arrow_to_trac_schema(cls, arrow_schema: pa.Schema) -> _meta.SchemaDefinition:
+
+        trac_fields = list(
+            cls.arrow_to_trac_field(i, arrow_schema.field(i))
+            for (i, f) in enumerate(arrow_schema.names))
+
+        return _meta.SchemaDefinition(
+            schemaType=_meta.SchemaType.TABLE,
+            partType=_meta.PartType.PART_ROOT,
+            table=_meta.TableSchema(trac_fields))
+
+    @classmethod
+    def arrow_to_trac_field(cls, field_index: int, field: pa.Field) -> _meta.FieldSchema:
+
+        field_type = cls.arrow_to_trac_type(field.type)
+        label = field.metadata["label"] if field.metadata and "label" in field.metadata else field.name
+
+        return _meta.FieldSchema(
+            field.name, field_index, field_type,
+            label=label,
+            businessKey=False,
+            notNull=not field.nullable,
+            categorical=False)
+
+    @classmethod
+    def arrow_to_trac_type(cls, arrow_type: pa.DataType) -> _meta.BasicType:
+
+        mapped_basic_type = cls.__ARROW_TO_TRAC_BASIC_TYPE_MAPPING.get(arrow_type)  # noqa
+
+        if mapped_basic_type is not None:
+            return mapped_basic_type
+
+        if pa.types.is_decimal(arrow_type):
+            return _meta.BasicType.DECIMAL
+
+        if pa.types.is_timestamp(arrow_type):
+            return _meta.BasicType.DATETIME
+
+        raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
+
     @classmethod
     def pandas_date_type(cls):
         return cls.__PANDAS_DATE_TYPE
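Note: arrow_to_trac_schema gives the runtime a way to infer a TRAC TABLE schema from live Arrow data, which is what dynamic outputs and the new import jobs rely on. A rough usage sketch against the internal API added above (internal module, so subject to change):

    import pyarrow as pa
    import tracdap.rt.metadata as meta
    from tracdap.rt._impl.data import DataMapping

    arrow_schema = pa.schema([
        pa.field("customer_id", pa.string(), nullable=False),
        pa.field("balance", pa.decimal128(38, 12)),
        pa.field("open_date", pa.date32())])

    trac_schema = DataMapping.arrow_to_trac_schema(arrow_schema)

    assert trac_schema.schemaType == meta.SchemaType.TABLE
    # notNull is derived from Arrow nullability in arrow_to_trac_field
    assert trac_schema.table.fields[0].notNull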
@@ -275,18 +347,31 @@
 
     @classmethod
     def view_to_pandas(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
-            temporal_objects_flag: bool) -> pd.DataFrame:
+            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
+            temporal_objects_flag: bool) -> "pandas.DataFrame":
 
         table = cls.view_to_arrow(view, part)
         return cls.arrow_to_pandas(table, schema, temporal_objects_flag)
 
     @classmethod
-    def pandas_to_item(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema]) -> DataItem:
+    def view_to_polars(
+            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema]):
+
+        table = cls.view_to_arrow(view, part)
+        return cls.arrow_to_polars(table, schema)
+
+    @classmethod
+    def pandas_to_item(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
 
         table = cls.pandas_to_arrow(df, schema)
         return DataItem(table.schema, table)
 
+    @classmethod
+    def polars_to_item(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
+
+        table = cls.polars_to_arrow(df, schema)
+        return DataItem(table.schema, table)
+
     @classmethod
     def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:
 
@@ -336,7 +421,7 @@
     @classmethod
     def arrow_to_pandas(
             cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None,
-            temporal_objects_flag: bool = False) -> pd.DataFrame:
+            temporal_objects_flag: bool = False) -> "pandas.DataFrame":
 
         if schema is not None:
             table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
@@ -361,7 +446,18 @@
             split_blocks=True)  # noqa
 
     @classmethod
-    def pandas_to_arrow(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+    def arrow_to_polars(
+            cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> "polars.DataFrame":
+
+        if schema is not None:
+            table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+        else:
+            DataConformance.check_duplicate_fields(table.schema.names, False)
+
+        return polars.from_arrow(table)
+
+    @classmethod
+    def pandas_to_arrow(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema] = None) -> pa.Table:
 
         # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
         # Calling Table.from_pandas with the supplied schema will very often reject data
@@ -403,6 +499,30 @@
         df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
         return DataConformance.conform_to_schema(table, schema, df_types)
 
+    @classmethod
+    def pandas_to_arrow_schema(cls, df: "pandas.DataFrame") -> pa.Schema:
+
+        return pa.Schema.from_pandas(df, preserve_index=False)  # noqa
+
+    @classmethod
+    def polars_to_arrow(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+        column_filter = DataConformance.column_filter(df.columns, schema)
+
+        filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
+        table = filtered_df.to_arrow()
+
+        if schema is None:
+            DataConformance.check_duplicate_fields(table.schema.names, False)
+            return table
+        else:
+            return DataConformance.conform_to_schema(table, schema, None)
+
+    @classmethod
+    def polars_to_arrow_schema(cls, df: "polars.DataFrame") -> pa.Schema:
+
+        return df.top_k(1).to_arrow().schema
+
 
 class DataConformance:
 
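Note: with arrow_to_polars and polars_to_arrow in place, polars frames round-trip through the same Arrow-based conformance path that pandas uses. A rough usage sketch (internal API, assuming polars is installed; column names and values are invented for illustration):

    import polars
    import pyarrow as pa
    from tracdap.rt._impl.data import DataMapping

    df = polars.DataFrame({"customer_id": ["A1", "B2"], "balance": [100.0, 250.5]})

    # polars -> Arrow, conformed to an explicit schema
    schema = pa.schema([("customer_id", pa.string()), ("balance", pa.float64())])
    table = DataMapping.polars_to_arrow(df, schema)

    # Arrow -> polars on the way back out
    df2 = DataMapping.arrow_to_polars(table)
    assert df2.columns == ["customer_id", "balance"]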
@@ -719,21 +839,32 @@
     @classmethod
     def _coerce_string(cls, vector: pa.Array, field: pa.Field) -> pa.Array:
 
-
-        if pa.types.is_string(vector.type):
-            return vector
+        try:
 
-
-
-
-
-
-
+            if pa.types.is_string(field.type):
+                if pa.types.is_string(vector.type):
+                    return vector
+                # Try to down-cast large string -> string, will raise ArrowInvalid if data does not fit
+                if pa.types.is_large_string(vector.type):
+                    return pc.cast(vector, field.type, safe=True)
+
+            if pa.types.is_large_string(field.type):
+                if pa.types.is_large_string(vector.type):
+                    return vector
+                # Allow up-casting string -> large_string
+                if pa.types.is_string(vector.type):
+                    return pc.cast(vector, field.type)
 
-
-
+            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message)
+
+        except pa.ArrowInvalid as e:
+
+            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message) from e
 
-        raise _ex.EDataConformance(error_message)
 
     @classmethod
     def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
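Note: _coerce_string now accepts Arrow's large_string alongside string in both directions: up-casting string -> large_string always succeeds, while the down-cast uses safe casting so data that cannot fit fails loudly as EDataConformance instead of being silently corrupted. The underlying pyarrow behaviour this relies on, in miniature:

    import pyarrow as pa
    import pyarrow.compute as pc

    v = pa.array(["alpha", "beta"], type=pa.large_string())

    # Down-cast large_string -> string; safe casting raises pa.ArrowInvalid
    # if the data cannot be represented in the narrower type
    small = pc.cast(v, pa.string(), safe=True)

    # Up-cast string -> large_string always succeeds
    large = pc.cast(small, pa.large_string())
    assert large.type == pa.large_string()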
@@ -751,7 +882,7 @@
         # For Pandas 2.x dates are still np.datetime64 but can be in s, ms, us or ns
         # This conversion will not apply to dates held in Pandas using the Python date object types
         if pandas_type is not None:
-            if pa.types.is_timestamp(vector.type) and pd.api.types.is_datetime64_any_dtype(pandas_type):
+            if pa.types.is_timestamp(vector.type) and pandas.api.types.is_datetime64_any_dtype(pandas_type):
                 return pc.cast(vector, field.type)
 
         error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py  CHANGED
@@ -16,7 +16,7 @@ from tracdap.rt._impl.grpc.tracdap.metadata import type_pb2 as tracdap_dot_rt_dot___impl_dot_grpc_dot_tracdap_dot_metadata_dot_type__pb2
 from tracdap.rt._impl.grpc.tracdap.metadata import object_id_pb2 as tracdap_dot_rt_dot___impl_dot_grpc_dot_tracdap_dot_metadata_dot_object__id__pb2
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1tracdap/rt/_impl/grpc/tracdap/metadata/data.proto\x12\x10tracdap.metadata\x1a\x31tracdap/rt/_impl/grpc/tracdap/metadata/type.proto\x1a\x36tracdap/rt/_impl/grpc/tracdap/metadata/object_id.proto\"\xe7\x01\n\x0b\x46ieldSchema\x12\x11\n\tfieldName\x18\x01 \x01(\t\x12\x12\n\nfieldOrder\x18\x02 \x01(\x11\x12.\n\tfieldType\x18\x03 \x01(\x0e\x32\x1b.tracdap.metadata.BasicType\x12\r\n\x05label\x18\x04 \x01(\t\x12\x13\n\x0b\x62usinessKey\x18\x05 \x01(\x08\x12\x13\n\x0b\x63\x61tegorical\x18\x06 \x01(\x08\x12\x14\n\x07notNull\x18\x08 \x01(\x08H\x00\x88\x01\x01\x12\x17\n\nformatCode\x18\x07 \x01(\tH\x01\x88\x01\x01\x42\n\n\x08_notNullB\r\n\x0b_formatCode\"<\n\x0bTableSchema\x12-\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x1d.tracdap.metadata.FieldSchema\"\
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1tracdap/rt/_impl/grpc/tracdap/metadata/data.proto\x12\x10tracdap.metadata\x1a\x31tracdap/rt/_impl/grpc/tracdap/metadata/type.proto\x1a\x36tracdap/rt/_impl/grpc/tracdap/metadata/object_id.proto\"\xe7\x01\n\x0b\x46ieldSchema\x12\x11\n\tfieldName\x18\x01 \x01(\t\x12\x12\n\nfieldOrder\x18\x02 \x01(\x11\x12.\n\tfieldType\x18\x03 \x01(\x0e\x32\x1b.tracdap.metadata.BasicType\x12\r\n\x05label\x18\x04 \x01(\t\x12\x13\n\x0b\x62usinessKey\x18\x05 \x01(\x08\x12\x13\n\x0b\x63\x61tegorical\x18\x06 \x01(\x08\x12\x14\n\x07notNull\x18\x08 \x01(\x08H\x00\x88\x01\x01\x12\x17\n\nformatCode\x18\x07 \x01(\tH\x01\x88\x01\x01\x42\n\n\x08_notNullB\r\n\x0b_formatCode\"<\n\x0bTableSchema\x12-\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x1d.tracdap.metadata.FieldSchema\"\xb3\x01\n\x10SchemaDefinition\x12\x30\n\nschemaType\x18\x01 \x01(\x0e\x32\x1c.tracdap.metadata.SchemaType\x12,\n\x08partType\x18\x02 \x01(\x0e\x32\x1a.tracdap.metadata.PartType\x12.\n\x05table\x18\x03 \x01(\x0b\x32\x1d.tracdap.metadata.TableSchemaH\x00\x42\x0f\n\rschemaDetails\"\x81\x02\n\x07PartKey\x12\x11\n\topaqueKey\x18\x01 \x01(\t\x12,\n\x08partType\x18\x02 \x01(\x0e\x32\x1a.tracdap.metadata.PartType\x12+\n\npartValues\x18\x03 \x03(\x0b\x32\x17.tracdap.metadata.Value\x12\x32\n\x0cpartRangeMin\x18\x04 \x01(\x0b\x32\x17.tracdap.metadata.ValueH\x00\x88\x01\x01\x12\x32\n\x0cpartRangeMax\x18\x05 \x01(\x0b\x32\x17.tracdap.metadata.ValueH\x01\x88\x01\x01\x42\x0f\n\r_partRangeMinB\x0f\n\r_partRangeMax\"\xba\x04\n\x0e\x44\x61taDefinition\x12\x31\n\x08schemaId\x18\x01 \x01(\x0b\x32\x1d.tracdap.metadata.TagSelectorH\x00\x12\x34\n\x06schema\x18\x02 \x01(\x0b\x32\".tracdap.metadata.SchemaDefinitionH\x00\x12:\n\x05parts\x18\x03 \x03(\x0b\x32+.tracdap.metadata.DataDefinition.PartsEntry\x12\x30\n\tstorageId\x18\x04 \x01(\x0b\x32\x1d.tracdap.metadata.TagSelector\x1a-\n\x05\x44\x65lta\x12\x12\n\ndeltaIndex\x18\x01 \x01(\r\x12\x10\n\x08\x64\x61taItem\x18\x02 \x01(\t\x1aQ\n\x04Snap\x12\x11\n\tsnapIndex\x18\x01 \x01(\r\x12\x36\n\x06\x64\x65ltas\x18\x02 \x03(\x0b\x32&.tracdap.metadata.DataDefinition.Delta\x1ag\n\x04Part\x12*\n\x07partKey\x18\x01 \x01(\x0b\x32\x19.tracdap.metadata.PartKey\x12\x33\n\x04snap\x18\x02 \x01(\x0b\x32%.tracdap.metadata.DataDefinition.Snap\x1aS\n\nPartsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x34\n\x05value\x18\x02 \x01(\x0b\x32%.tracdap.metadata.DataDefinition.Part:\x02\x38\x01\x42\x11\n\x0fschemaSpecifier*0\n\nSchemaType\x12\x17\n\x13SCHEMA_TYPE_NOT_SET\x10\x00\x12\t\n\x05TABLE\x10\x01*?\n\x08PartType\x12\r\n\tPART_ROOT\x10\x00\x12\x11\n\rPART_BY_RANGE\x10\x01\x12\x11\n\rPART_BY_VALUE\x10\x02\x42\x1e\n\x1aorg.finos.tracdap.metadataP\x01\x62\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -26,26 +26,26 @@ if _descriptor._USE_C_DESCRIPTORS == False:
     _globals['DESCRIPTOR']._serialized_options = b'\n\032org.finos.tracdap.metadataP\001'
     _globals['_DATADEFINITION_PARTSENTRY']._options = None
     _globals['_DATADEFINITION_PARTSENTRY']._serialized_options = b'8\001'
-    _globals['_SCHEMATYPE']._serialized_start=
-    _globals['_SCHEMATYPE']._serialized_end=
-    _globals['_PARTTYPE']._serialized_start=
-    _globals['_PARTTYPE']._serialized_end=
+    _globals['_SCHEMATYPE']._serialized_start=1489
+    _globals['_SCHEMATYPE']._serialized_end=1537
+    _globals['_PARTTYPE']._serialized_start=1539
+    _globals['_PARTTYPE']._serialized_end=1602
     _globals['_FIELDSCHEMA']._serialized_start=179
     _globals['_FIELDSCHEMA']._serialized_end=410
     _globals['_TABLESCHEMA']._serialized_start=412
    _globals['_TABLESCHEMA']._serialized_end=472
     _globals['_SCHEMADEFINITION']._serialized_start=475
-    _globals['_SCHEMADEFINITION']._serialized_end=
-    _globals['_PARTKEY']._serialized_start=
-    _globals['_PARTKEY']._serialized_end=
-    _globals['_DATADEFINITION']._serialized_start=
-    _globals['_DATADEFINITION']._serialized_end=
-    _globals['_DATADEFINITION_DELTA']._serialized_start=
-    _globals['_DATADEFINITION_DELTA']._serialized_end=
-    _globals['_DATADEFINITION_SNAP']._serialized_start=
-    _globals['_DATADEFINITION_SNAP']._serialized_end=
-    _globals['_DATADEFINITION_PART']._serialized_start=
-    _globals['_DATADEFINITION_PART']._serialized_end=
-    _globals['_DATADEFINITION_PARTSENTRY']._serialized_start=
-    _globals['_DATADEFINITION_PARTSENTRY']._serialized_end=
+    _globals['_SCHEMADEFINITION']._serialized_end=654
+    _globals['_PARTKEY']._serialized_start=657
+    _globals['_PARTKEY']._serialized_end=914
+    _globals['_DATADEFINITION']._serialized_start=917
+    _globals['_DATADEFINITION']._serialized_end=1487
+    _globals['_DATADEFINITION_DELTA']._serialized_start=1150
+    _globals['_DATADEFINITION_DELTA']._serialized_end=1195
+    _globals['_DATADEFINITION_SNAP']._serialized_start=1197
+    _globals['_DATADEFINITION_SNAP']._serialized_end=1278
+    _globals['_DATADEFINITION_PART']._serialized_start=1280
+    _globals['_DATADEFINITION_PART']._serialized_end=1383
+    _globals['_DATADEFINITION_PARTSENTRY']._serialized_start=1385
+    _globals['_DATADEFINITION_PARTSENTRY']._serialized_end=1468
 # @@protoc_insertion_point(module_scope)