tracdap-runtime 0.6.5-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff compares the content of two publicly available package versions, as released to their public registry. It is provided for informational purposes only and reflects the changes between the versions as published.
- tracdap/rt/__init__.py +6 -5
- tracdap/rt/_exec/actors.py +6 -5
- tracdap/rt/_exec/context.py +278 -110
- tracdap/rt/_exec/dev_mode.py +237 -143
- tracdap/rt/_exec/engine.py +223 -64
- tracdap/rt/_exec/functions.py +31 -6
- tracdap/rt/_exec/graph.py +15 -5
- tracdap/rt/_exec/graph_builder.py +301 -203
- tracdap/rt/_exec/runtime.py +13 -10
- tracdap/rt/_exec/server.py +6 -5
- tracdap/rt/_impl/__init__.py +6 -5
- tracdap/rt/_impl/config_parser.py +17 -9
- tracdap/rt/_impl/data.py +284 -172
- tracdap/rt/_impl/ext/__init__.py +14 -0
- tracdap/rt/_impl/ext/sql.py +117 -0
- tracdap/rt/_impl/ext/storage.py +58 -0
- tracdap/rt/_impl/grpc/__init__.py +6 -5
- tracdap/rt/_impl/grpc/codec.py +6 -5
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +62 -54
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +37 -2
- tracdap/rt/_impl/guard_rails.py +6 -5
- tracdap/rt/_impl/models.py +6 -5
- tracdap/rt/_impl/repos.py +6 -5
- tracdap/rt/_impl/schemas.py +6 -5
- tracdap/rt/_impl/shim.py +6 -5
- tracdap/rt/_impl/static_api.py +30 -16
- tracdap/rt/_impl/storage.py +8 -7
- tracdap/rt/_impl/type_system.py +6 -5
- tracdap/rt/_impl/util.py +16 -5
- tracdap/rt/_impl/validation.py +72 -18
- tracdap/rt/_plugins/__init__.py +6 -5
- tracdap/rt/_plugins/_helpers.py +6 -5
- tracdap/rt/_plugins/config_local.py +6 -5
- tracdap/rt/_plugins/format_arrow.py +6 -5
- tracdap/rt/_plugins/format_csv.py +6 -5
- tracdap/rt/_plugins/format_parquet.py +6 -5
- tracdap/rt/_plugins/repo_git.py +6 -5
- tracdap/rt/_plugins/repo_local.py +6 -5
- tracdap/rt/_plugins/repo_pypi.py +6 -5
- tracdap/rt/_plugins/storage_aws.py +6 -5
- tracdap/rt/_plugins/storage_azure.py +6 -5
- tracdap/rt/_plugins/storage_gcp.py +6 -5
- tracdap/rt/_plugins/storage_local.py +6 -5
- tracdap/rt/_plugins/storage_sql.py +418 -0
- tracdap/rt/_plugins/storage_sql_dialects.py +118 -0
- tracdap/rt/_version.py +7 -6
- tracdap/rt/api/__init__.py +23 -5
- tracdap/rt/api/experimental.py +85 -37
- tracdap/rt/api/hook.py +16 -5
- tracdap/rt/api/model_api.py +110 -90
- tracdap/rt/api/static_api.py +142 -100
- tracdap/rt/config/common.py +26 -27
- tracdap/rt/config/job.py +5 -6
- tracdap/rt/config/platform.py +41 -42
- tracdap/rt/config/result.py +5 -6
- tracdap/rt/config/runtime.py +6 -7
- tracdap/rt/exceptions.py +13 -7
- tracdap/rt/ext/__init__.py +6 -5
- tracdap/rt/ext/config.py +6 -5
- tracdap/rt/ext/embed.py +6 -5
- tracdap/rt/ext/plugins.py +6 -5
- tracdap/rt/ext/repos.py +6 -5
- tracdap/rt/ext/storage.py +6 -5
- tracdap/rt/launch/__init__.py +10 -5
- tracdap/rt/launch/__main__.py +6 -5
- tracdap/rt/launch/cli.py +6 -5
- tracdap/rt/launch/launch.py +38 -15
- tracdap/rt/metadata/__init__.py +4 -0
- tracdap/rt/metadata/common.py +2 -3
- tracdap/rt/metadata/custom.py +3 -4
- tracdap/rt/metadata/data.py +30 -31
- tracdap/rt/metadata/file.py +6 -7
- tracdap/rt/metadata/flow.py +22 -23
- tracdap/rt/metadata/job.py +89 -45
- tracdap/rt/metadata/model.py +26 -27
- tracdap/rt/metadata/object.py +11 -12
- tracdap/rt/metadata/object_id.py +23 -24
- tracdap/rt/metadata/resource.py +0 -1
- tracdap/rt/metadata/search.py +15 -16
- tracdap/rt/metadata/stoarge.py +22 -23
- tracdap/rt/metadata/tag.py +8 -9
- tracdap/rt/metadata/tag_update.py +11 -12
- tracdap/rt/metadata/type.py +38 -38
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/LICENSE +1 -1
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/METADATA +4 -2
- tracdap_runtime-0.7.0.dist-info/RECORD +121 -0
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/WHEEL +1 -1
- tracdap_runtime-0.6.5.dist-info/RECORD +0 -116
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/data.py
CHANGED
@@ -1,8 +1,9 @@
-#
-#
-#
-#
-#
+# Licensed to the Fintech Open Source Foundation (FINOS) under one or
+# more contributor license agreements. See the NOTICE file distributed
+# with this work for additional information regarding copyright ownership.
+# FINOS licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with the
+# License. You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
@@ -12,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import abc
 import dataclasses as dc
 import typing as tp
 import datetime as dt
@@ -31,6 +33,7 @@ try:
 except ModuleNotFoundError:
     polars = None

+import tracdap.rt.api.experimental as _api
 import tracdap.rt.metadata as _meta
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._impl.util as _util
@@ -116,73 +119,19 @@ class DataMapping:

     # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

-    __TRAC_DECIMAL_PRECISION = 38
-    __TRAC_DECIMAL_SCALE = 12
-    __TRAC_TIMESTAMP_UNIT = "ms"
-    __TRAC_TIMESTAMP_ZONE = None
+    DEFAULT_DECIMAL_PRECISION = 38
+    DEFAULT_DECIMAL_SCALE = 12
+    DEFAULT_TIMESTAMP_UNIT = "ms"
+    DEFAULT_TIMESTAMP_ZONE = None

     __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
         _meta.BasicType.BOOLEAN: pa.bool_(),
         _meta.BasicType.INTEGER: pa.int64(),
         _meta.BasicType.FLOAT: pa.float64(),
-        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
+        _meta.BasicType.DECIMAL: pa.decimal128(DEFAULT_DECIMAL_PRECISION, DEFAULT_DECIMAL_SCALE),
         _meta.BasicType.STRING: pa.utf8(),
         _meta.BasicType.DATE: pa.date32(),
-        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
-    }
-
-    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
-    __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
-    __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
-
-    if __PANDAS_MAJOR_VERSION == 2:
-
-        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-
-        @classmethod
-        def __pandas_datetime_type(cls, tz, unit):
-            if tz is None and unit is None:
-                return cls.__PANDAS_DATETIME_TYPE
-            _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
-            if tz is None:
-                return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
-            else:
-                return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
-
-    # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
-    elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
-
-        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
-        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
-
-        @classmethod
-        def __pandas_datetime_type(cls, tz, unit):  # noqa
-            if tz is None:
-                return cls.__PANDAS_DATETIME_TYPE
-            else:
-                return pandas.DatetimeTZDtype(tz=tz)
-
-    else:
-        raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
-
-    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
-    __ARROW_TO_PANDAS_TYPE_MAPPING = {
-        pa.bool_(): pandas.BooleanDtype(),
-        pa.int8(): pandas.Int8Dtype(),
-        pa.int16(): pandas.Int16Dtype(),
-        pa.int32(): pandas.Int32Dtype(),
-        pa.int64(): pandas.Int64Dtype(),
-        pa.uint8(): pandas.UInt8Dtype(),
-        pa.uint16(): pandas.UInt16Dtype(),
-        pa.uint32(): pandas.UInt32Dtype(),
-        pa.uint64(): pandas.UInt64Dtype(),
-        pa.float16(): pandas.Float32Dtype(),
-        pa.float32(): pandas.Float32Dtype(),
-        pa.float64(): pandas.Float64Dtype(),
-        pa.string(): pandas.StringDtype(),
-        pa.utf8(): pandas.StringDtype()
+        _meta.BasicType.DATETIME: pa.timestamp(DEFAULT_TIMESTAMP_UNIT, DEFAULT_TIMESTAMP_ZONE)
     }

     __ARROW_TO_TRAC_BASIC_TYPE_MAPPING = {
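As an aside, the renamed defaults above translate directly to Arrow types. A minimal illustrative sketch (not package code; values taken from the constants in the hunk above):

```python
import pyarrow as pa

# Arrow types implied by the new DataMapping defaults (sketch only)
decimal_type = pa.decimal128(38, 12)  # DEFAULT_DECIMAL_PRECISION, DEFAULT_DECIMAL_SCALE
datetime_type = pa.timestamp("ms")    # DEFAULT_TIMESTAMP_UNIT, with DEFAULT_TIMESTAMP_ZONE = None
```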
@@ -243,7 +192,7 @@ class DataMapping:
         return pa.float64()

         if python_type == decimal.Decimal:
-            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)
+            return pa.decimal128(cls.DEFAULT_DECIMAL_PRECISION, cls.DEFAULT_DECIMAL_SCALE)

         if python_type == str:
             return pa.utf8()
@@ -252,7 +201,7 @@
         return pa.date32()

         if python_type == dt.datetime:
-            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)
+            return pa.timestamp(cls.DEFAULT_TIMESTAMP_UNIT, cls.DEFAULT_TIMESTAMP_ZONE)

         raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")

@@ -293,8 +242,8 @@ class DataMapping:
     def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

         return pa.decimal128(
-            cls.__TRAC_DECIMAL_PRECISION,
-            cls.__TRAC_DECIMAL_SCALE,)
+            cls.DEFAULT_DECIMAL_PRECISION,
+            cls.DEFAULT_DECIMAL_SCALE,)

     @classmethod
     def arrow_to_trac_schema(cls, arrow_schema: pa.Schema) -> _meta.SchemaDefinition:
@@ -337,41 +286,6 @@ class DataMapping:

         raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")

-    @classmethod
-    def pandas_date_type(cls):
-        return cls.__PANDAS_DATE_TYPE
-
-    @classmethod
-    def pandas_datetime_type(cls, tz=None, unit=None):
-        return cls.__pandas_datetime_type(tz, unit)
-
-    @classmethod
-    def view_to_pandas(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
-            temporal_objects_flag: bool) -> "pandas.DataFrame":
-
-        table = cls.view_to_arrow(view, part)
-        return cls.arrow_to_pandas(table, schema, temporal_objects_flag)
-
-    @classmethod
-    def view_to_polars(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema]):
-
-        table = cls.view_to_arrow(view, part)
-        return cls.arrow_to_polars(table, schema)
-
-    @classmethod
-    def pandas_to_item(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
-
-        table = cls.pandas_to_arrow(df, schema)
-        return DataItem(table.schema, table)
-
-    @classmethod
-    def polars_to_item(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
-
-        table = cls.polars_to_arrow(df, schema)
-        return DataItem(table.schema, table)
-
     @classmethod
     def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:

@@ -420,108 +334,306 @@ class DataMapping:

     @classmethod
     def arrow_to_pandas(
-            cls, table: pa.Table,
+            cls, table: pa.Table,
+            schema: tp.Optional[pa.Schema] = None,
             temporal_objects_flag: bool = False) -> "pandas.DataFrame":

-        else:
-            DataConformance.check_duplicate_fields(table.schema.names, False)
+        # This is a legacy internal method and should be removed
+        # DataMapping is no longer responsible for individual data APIs

-        #
-        return table.to_pandas(
+        # Maintained temporarily for compatibility with existing deployments

+        converter = PandasArrowConverter(_api.PANDAS, use_temporal_objects=temporal_objects_flag)
+        return converter.from_internal(table, schema)

+    @classmethod
+    def pandas_to_arrow(
+            cls, df: "pandas.DataFrame",
+            schema: tp.Optional[pa.Schema] = None) -> pa.Table:

+        # This is a legacy internal method and should be removed
+        # DataMapping is no longer responsible for individual data APIs

-            # This is a significant performance win for very wide datasets
-            split_blocks=True)  # noqa
+        # Maintained temporarily for compatibility with existing deployments

-            cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> "polars.DataFrame":
+        converter = PandasArrowConverter(_api.PANDAS)
+        return converter.to_internal(df, schema)

-        if schema is not None:
-            table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
-        else:
-            DataConformance.check_duplicate_fields(table.schema.names, False)

-        return polars.from_arrow(table)

+T_DATA_API = tp.TypeVar("T_DATA_API")
+T_INTERNAL_DATA = tp.TypeVar("T_INTERNAL_DATA")
+T_INTERNAL_SCHEMA = tp.TypeVar("T_INTERNAL_SCHEMA")

-        # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
-        # Calling Table.from_pandas with the supplied schema will very often reject data
-        # Instead, we convert the dataframe as-is and then apply type conversion in a second step
-        # This allows us to apply specific coercion rules for each data type

-        # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):

+    # Available per-framework args, to enable framework-specific type-checking in public APIs
+    # These should (for a purist point of view) be in the individual converter classes
+    # For now there are only a few converters, they are all defined here so this is OK
+    __FRAMEWORK_ARGS = {
+        _api.PANDAS: {"use_temporal_objects": tp.Optional[bool]},
+        _api.POLARS: {}
+    }

+    @classmethod
+    def get_framework(cls, dataset: _api.DATA_API) -> _api.DataFramework[_api.DATA_API]:

+        if pandas is not None and isinstance(dataset, pandas.DataFrame):
+            return _api.PANDAS

-        # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+        if polars is not None and isinstance(dataset, polars.DataFrame):
+            return _api.POLARS

+        data_api_type = f"{type(dataset).__module__}.{type(dataset).__name__}"
+        raise _ex.EPluginNotAvailable(f"No data framework available for type [{data_api_type}]")

+    @classmethod
+    def get_framework_args(cls, framework: _api.DataFramework[_api.DATA_API]) -> tp.Dict[str, type]:

+        return cls.__FRAMEWORK_ARGS.get(framework) or {}

-        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+    @classmethod
+    def for_framework(cls, framework: _api.DataFramework[_api.DATA_API], **framework_args) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":

-        if
+        if framework == _api.PANDAS:
+            if pandas is not None:
+                return PandasArrowConverter(framework, **framework_args)
+            else:
+                raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")

+        if framework == _api.POLARS:
+            if polars is not None:
+                return PolarsArrowConverter(framework)
+            else:
+                raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")

-        df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
-        return DataConformance.conform_to_schema(table, schema, df_types)
+        raise _ex.EPluginNotAvailable(f"Data framework [{framework}] is not recognized")

     @classmethod
-    def
+    def for_dataset(cls, dataset: _api.DATA_API) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":

-        return
+        return cls.for_framework(cls.get_framework(dataset))

     @classmethod
-    def
+    def noop(cls) -> "DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]":
+        return NoopConverter()
+
+    def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+        self.framework = framework
+
+    @abc.abstractmethod
+    def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+        pass
+
+    @abc.abstractmethod
+    def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+        pass
+
+    @abc.abstractmethod
+    def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+        pass
+
+
+class NoopConverter(DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
+
+    def __init__(self):
+        super().__init__(_api.DataFramework("internal", None))  # noqa
+
+    def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+        return dataset
+
+    def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+        return dataset
+
+    def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+        raise _ex.EUnexpected()  # A real converter should be selected before use
+
+
+# Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+if pandas is not None:

+    class PandasArrowConverter(DataConverter[pandas.DataFrame, pa.Table, pa.Schema]):

+        # Check the Pandas dtypes for handling floats are available before setting up the type mapping
+        __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
+        __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+        __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+        if __PANDAS_MAJOR_VERSION == 2:
+
+            __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+            __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+
+            @classmethod
+            def __pandas_datetime_type(cls, tz, unit):
+                if tz is None and unit is None:
+                    return cls.__PANDAS_DATETIME_TYPE
+                _unit = unit if unit is not None else DataMapping.DEFAULT_TIMESTAMP_UNIT
+                if tz is None:
+                    return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+                else:
+                    return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
+
+        # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
+        elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+            __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
+            __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+            @classmethod
+            def __pandas_datetime_type(cls, tz, unit):  # noqa
+                if tz is None:
+                    return cls.__PANDAS_DATETIME_TYPE
+                else:
+                    return pandas.DatetimeTZDtype(tz=tz)

-        if schema is None:
-            DataConformance.check_duplicate_fields(table.schema.names, False)
-            return table
         else:
+            raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
+
+        # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
+        __ARROW_TO_PANDAS_TYPE_MAPPING = {
+            pa.bool_(): pandas.BooleanDtype(),
+            pa.int8(): pandas.Int8Dtype(),
+            pa.int16(): pandas.Int16Dtype(),
+            pa.int32(): pandas.Int32Dtype(),
+            pa.int64(): pandas.Int64Dtype(),
+            pa.uint8(): pandas.UInt8Dtype(),
+            pa.uint16(): pandas.UInt16Dtype(),
+            pa.uint32(): pandas.UInt32Dtype(),
+            pa.uint64(): pandas.UInt64Dtype(),
+            pa.float16(): pandas.Float32Dtype(),
+            pa.float32(): pandas.Float32Dtype(),
+            pa.float64(): pandas.Float64Dtype(),
+            pa.string(): pandas.StringDtype(),
+            pa.utf8(): pandas.StringDtype()
+        }
+
+        __DEFAULT_TEMPORAL_OBJECTS = False
+
+        # Expose date type for testing
+        @classmethod
+        def pandas_date_type(cls):
+            return cls.__PANDAS_DATE_TYPE

+        # Expose datetime type for testing
+        @classmethod
+        def pandas_datetime_type(cls, tz=None, unit=None):
+            return cls.__pandas_datetime_type(tz, unit)
+
+        def __init__(self, framework: _api.DataFramework[T_DATA_API], use_temporal_objects: tp.Optional[bool] = None):
+            super().__init__(framework)
+            if use_temporal_objects is None:
+                self.__temporal_objects_flag = self.__DEFAULT_TEMPORAL_OBJECTS
+            else:
+                self.__temporal_objects_flag = use_temporal_objects
+
+        def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> pandas.DataFrame:
+
+            if schema is not None:
+                table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+            else:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+
+            # Use Arrow's built-in function to convert to Pandas
+            return table.to_pandas(
+
+                # Mapping for arrow -> pandas types for core types
+                types_mapper=self.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
+
+                # Use Python objects for dates and times if temporal_objects_flag is set
+                date_as_object=self.__temporal_objects_flag,  # noqa
+                timestamp_as_object=self.__temporal_objects_flag,  # noqa
+
+                # Do not bring any Arrow metadata into Pandas dataframe
+                ignore_metadata=True,  # noqa
+
+                # Do not consolidate memory across columns when preparing the Pandas vectors
+                # This is a significant performance win for very wide datasets
+                split_blocks=True)  # noqa
+
+        def to_internal(self, df: pandas.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+            # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
+            # Calling Table.from_pandas with the supplied schema will very often reject data
+            # Instead, we convert the dataframe as-is and then apply type conversion in a second step
+            # This allows us to apply specific coercion rules for each data type
+
+            # As an optimisation, the column filter means columns will not be converted if they are not needed
+            # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+
+            column_filter = DataConformance.column_filter(df.columns, schema)  # noqa
+
+            if len(df) > 0:
+
+                table = pa.Table.from_pandas(df, columns=column_filter, preserve_index=False)  # noqa
+
+            # Special case handling for converting an empty dataframe
+            # These must flow through the pipe with valid schemas, like any other dataset
+            # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+
+            else:
+
+                empty_df = df.filter(column_filter) if column_filter else df
+                empty_schema = pa.Schema.from_pandas(empty_df, preserve_index=False)  # noqa
+
+                table = pa.Table.from_batches(list(), empty_schema)  # noqa
+
+            # If there is no explict schema, give back the table exactly as it was received from Pandas
+            # There could be an option here to infer and coerce for TRAC standard types
+            # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+
+            if schema is None:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+                return table
+
+            # If a schema has been supplied, apply data conformance
+            # If column filtering has been applied, we also need to filter the pandas dtypes used for hinting
+
+            else:
+                df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
+                return DataConformance.conform_to_schema(table, schema, df_types)
+
+        def infer_schema(self, dataset: pandas.DataFrame) -> _meta.SchemaDefinition:
+
+            arrow_schema = pa.Schema.from_pandas(dataset, preserve_index=False)  # noqa
+            return DataMapping.arrow_to_trac_schema(arrow_schema)
+
+
+# Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+if polars is not None:
+
+    class PolarsArrowConverter(DataConverter[polars.DataFrame, pa.Table, pa.Schema]):
+
+        def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+            super().__init__(framework)
+
+        def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> polars.DataFrame:
+
+            if schema is not None:
+                table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+            else:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+
+            return polars.from_arrow(table)
+
+        def to_internal(self, df: polars.DataFrame, schema: tp.Optional[pa.Schema] = None,) -> pa.Table:
+
+            column_filter = DataConformance.column_filter(df.columns, schema)
+
+            filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
+            table = filtered_df.to_arrow()
+
+            if schema is None:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+                return table
+            else:
+                return DataConformance.conform_to_schema(table, schema, None)
+
+        def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+
+            arrow_schema = dataset.top_k(1).to_arrow().schema
+            return DataMapping.arrow_to_trac_schema(arrow_schema)


 class DataConformance:
@@ -652,7 +764,7 @@ class DataConformance:
         # Columns not defined in the schema will not be included in the conformed output
         if warn_extra_columns and table.num_columns > len(schema.types):

-            schema_columns = set(map(
+            schema_columns = set(map(lambda c: c.lower(), schema.names))
             extra_columns = [
                 f"[{col}]"
                 for col in table.schema.names
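Taken together, the data.py changes move per-framework conversion out of DataMapping and into the new DataConverter hierarchy, with pandas and polars handled by optional converter subclasses. The sketch below shows how the new pieces fit together; it is based only on the definitions visible in this diff, and these are internal (`_impl`) modules rather than a stable public API:

```python
# Hedged usage sketch for the new DataConverter API defined in the diff above
# (assumptions: internal tracdap modules, pandas installed; not a stable interface)
import pandas as pd
import tracdap.rt._impl.data as _data
import tracdap.rt.api.experimental as trac_api

df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})

# Select a converter by dataset type (pandas.DataFrame -> the PANDAS framework)
converter = _data.DataConverter.for_dataset(df)

# Convert to the internal representation (pyarrow.Table) and back again
table = converter.to_internal(df)
round_trip = converter.from_internal(table)

# Infer a TRAC SchemaDefinition from the dataset
trac_schema = converter.infer_schema(df)

# The legacy DataMapping entry points are now thin wrappers over the converters,
# "maintained temporarily for compatibility with existing deployments"
df_legacy = _data.DataMapping.arrow_to_pandas(table)
df_new = _data.DataConverter.for_framework(trac_api.PANDAS).from_internal(table)
assert df_legacy.equals(df_new)
```

Note that for_framework and for_dataset raise EPluginNotAvailable when the corresponding optional package is missing, which is what lets the module load with either pandas or polars absent.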
tracdap/rt/_impl/ext/__init__.py
ADDED
@@ -0,0 +1,14 @@
+# Licensed to the Fintech Open Source Foundation (FINOS) under one or
+# more contributor license agreements. See the NOTICE file distributed
+# with this work for additional information regarding copyright ownership.
+# FINOS licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with the
+# License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.