tracdap-runtime 0.6.5-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff compares the content of two publicly available package versions, as released to their public registry. It is provided for informational purposes only and reflects the changes between the versions as published.
- tracdap/rt/__init__.py +6 -5
- tracdap/rt/_exec/actors.py +6 -5
- tracdap/rt/_exec/context.py +278 -110
- tracdap/rt/_exec/dev_mode.py +237 -143
- tracdap/rt/_exec/engine.py +223 -64
- tracdap/rt/_exec/functions.py +31 -6
- tracdap/rt/_exec/graph.py +15 -5
- tracdap/rt/_exec/graph_builder.py +301 -203
- tracdap/rt/_exec/runtime.py +13 -10
- tracdap/rt/_exec/server.py +6 -5
- tracdap/rt/_impl/__init__.py +6 -5
- tracdap/rt/_impl/config_parser.py +17 -9
- tracdap/rt/_impl/data.py +284 -172
- tracdap/rt/_impl/ext/__init__.py +14 -0
- tracdap/rt/_impl/ext/sql.py +117 -0
- tracdap/rt/_impl/ext/storage.py +58 -0
- tracdap/rt/_impl/grpc/__init__.py +6 -5
- tracdap/rt/_impl/grpc/codec.py +6 -5
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +62 -54
- tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +37 -2
- tracdap/rt/_impl/guard_rails.py +6 -5
- tracdap/rt/_impl/models.py +6 -5
- tracdap/rt/_impl/repos.py +6 -5
- tracdap/rt/_impl/schemas.py +6 -5
- tracdap/rt/_impl/shim.py +6 -5
- tracdap/rt/_impl/static_api.py +30 -16
- tracdap/rt/_impl/storage.py +8 -7
- tracdap/rt/_impl/type_system.py +6 -5
- tracdap/rt/_impl/util.py +16 -5
- tracdap/rt/_impl/validation.py +72 -18
- tracdap/rt/_plugins/__init__.py +6 -5
- tracdap/rt/_plugins/_helpers.py +6 -5
- tracdap/rt/_plugins/config_local.py +6 -5
- tracdap/rt/_plugins/format_arrow.py +6 -5
- tracdap/rt/_plugins/format_csv.py +6 -5
- tracdap/rt/_plugins/format_parquet.py +6 -5
- tracdap/rt/_plugins/repo_git.py +6 -5
- tracdap/rt/_plugins/repo_local.py +6 -5
- tracdap/rt/_plugins/repo_pypi.py +6 -5
- tracdap/rt/_plugins/storage_aws.py +6 -5
- tracdap/rt/_plugins/storage_azure.py +6 -5
- tracdap/rt/_plugins/storage_gcp.py +6 -5
- tracdap/rt/_plugins/storage_local.py +6 -5
- tracdap/rt/_plugins/storage_sql.py +418 -0
- tracdap/rt/_plugins/storage_sql_dialects.py +118 -0
- tracdap/rt/_version.py +7 -6
- tracdap/rt/api/__init__.py +23 -5
- tracdap/rt/api/experimental.py +85 -37
- tracdap/rt/api/hook.py +16 -5
- tracdap/rt/api/model_api.py +110 -90
- tracdap/rt/api/static_api.py +142 -100
- tracdap/rt/config/common.py +26 -27
- tracdap/rt/config/job.py +5 -6
- tracdap/rt/config/platform.py +41 -42
- tracdap/rt/config/result.py +5 -6
- tracdap/rt/config/runtime.py +6 -7
- tracdap/rt/exceptions.py +13 -7
- tracdap/rt/ext/__init__.py +6 -5
- tracdap/rt/ext/config.py +6 -5
- tracdap/rt/ext/embed.py +6 -5
- tracdap/rt/ext/plugins.py +6 -5
- tracdap/rt/ext/repos.py +6 -5
- tracdap/rt/ext/storage.py +6 -5
- tracdap/rt/launch/__init__.py +10 -5
- tracdap/rt/launch/__main__.py +6 -5
- tracdap/rt/launch/cli.py +6 -5
- tracdap/rt/launch/launch.py +38 -15
- tracdap/rt/metadata/__init__.py +4 -0
- tracdap/rt/metadata/common.py +2 -3
- tracdap/rt/metadata/custom.py +3 -4
- tracdap/rt/metadata/data.py +30 -31
- tracdap/rt/metadata/file.py +6 -7
- tracdap/rt/metadata/flow.py +22 -23
- tracdap/rt/metadata/job.py +89 -45
- tracdap/rt/metadata/model.py +26 -27
- tracdap/rt/metadata/object.py +11 -12
- tracdap/rt/metadata/object_id.py +23 -24
- tracdap/rt/metadata/resource.py +0 -1
- tracdap/rt/metadata/search.py +15 -16
- tracdap/rt/metadata/stoarge.py +22 -23
- tracdap/rt/metadata/tag.py +8 -9
- tracdap/rt/metadata/tag_update.py +11 -12
- tracdap/rt/metadata/type.py +38 -38
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/LICENSE +1 -1
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/METADATA +4 -2
- tracdap_runtime-0.7.0.dist-info/RECORD +121 -0
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/WHEEL +1 -1
- tracdap_runtime-0.6.5.dist-info/RECORD +0 -116
- {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0.dist-info}/top_level.txt +0 -0
tracdap/rt/_impl/data.py
CHANGED
@@ -1,8 +1,9 @@
-#
-#
-#
-#
-#
+# Licensed to the Fintech Open Source Foundation (FINOS) under one or
+# more contributor license agreements. See the NOTICE file distributed
+# with this work for additional information regarding copyright ownership.
+# FINOS licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with the
+# License. You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
@@ -12,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import abc
 import dataclasses as dc
 import typing as tp
 import datetime as dt
@@ -31,6 +33,7 @@ try:
 except ModuleNotFoundError:
     polars = None

+import tracdap.rt.api.experimental as _api
 import tracdap.rt.metadata as _meta
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._impl.util as _util
@@ -116,73 +119,19 @@ class DataMapping:

     # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

-    __TRAC_DECIMAL_PRECISION = 38
-    __TRAC_DECIMAL_SCALE = 12
-    __TRAC_TIMESTAMP_UNIT = "ms"
-    __TRAC_TIMESTAMP_ZONE = None
+    DEFAULT_DECIMAL_PRECISION = 38
+    DEFAULT_DECIMAL_SCALE = 12
+    DEFAULT_TIMESTAMP_UNIT = "ms"
+    DEFAULT_TIMESTAMP_ZONE = None

     __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
         _meta.BasicType.BOOLEAN: pa.bool_(),
         _meta.BasicType.INTEGER: pa.int64(),
         _meta.BasicType.FLOAT: pa.float64(),
-        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
+        _meta.BasicType.DECIMAL: pa.decimal128(DEFAULT_DECIMAL_PRECISION, DEFAULT_DECIMAL_SCALE),
         _meta.BasicType.STRING: pa.utf8(),
         _meta.BasicType.DATE: pa.date32(),
-        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
-    }
-
-    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
-    __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
-    __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
-
-    if __PANDAS_MAJOR_VERSION == 2:
-
-        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-
-        @classmethod
-        def __pandas_datetime_type(cls, tz, unit):
-            if tz is None and unit is None:
-                return cls.__PANDAS_DATETIME_TYPE
-            _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
-            if tz is None:
-                return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
-            else:
-                return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
-
-    # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
-    elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
-
-        __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
-        __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
-
-        @classmethod
-        def __pandas_datetime_type(cls, tz, unit):  # noqa
-            if tz is None:
-                return cls.__PANDAS_DATETIME_TYPE
-            else:
-                return pandas.DatetimeTZDtype(tz=tz)
-
-    else:
-        raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
-
-    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
-    __ARROW_TO_PANDAS_TYPE_MAPPING = {
-        pa.bool_(): pandas.BooleanDtype(),
-        pa.int8(): pandas.Int8Dtype(),
-        pa.int16(): pandas.Int16Dtype(),
-        pa.int32(): pandas.Int32Dtype(),
-        pa.int64(): pandas.Int64Dtype(),
-        pa.uint8(): pandas.UInt8Dtype(),
-        pa.uint16(): pandas.UInt16Dtype(),
-        pa.uint32(): pandas.UInt32Dtype(),
-        pa.uint64(): pandas.UInt64Dtype(),
-        pa.float16(): pandas.Float32Dtype(),
-        pa.float32(): pandas.Float32Dtype(),
-        pa.float64(): pandas.Float64Dtype(),
-        pa.string(): pandas.StringDtype(),
-        pa.utf8(): pandas.StringDtype()
+        _meta.BasicType.DATETIME: pa.timestamp(DEFAULT_TIMESTAMP_UNIT, DEFAULT_TIMESTAMP_ZONE)
     }

     __ARROW_TO_TRAC_BASIC_TYPE_MAPPING = {
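As an aside, the renamed defaults above translate directly to Arrow types. A minimal illustrative sketch (not package code; values taken from the constants in the hunk above):

```python
import pyarrow as pa

# Arrow types implied by the new DataMapping defaults (sketch only)
decimal_type = pa.decimal128(38, 12)  # DEFAULT_DECIMAL_PRECISION, DEFAULT_DECIMAL_SCALE
datetime_type = pa.timestamp("ms")    # DEFAULT_TIMESTAMP_UNIT, with DEFAULT_TIMESTAMP_ZONE = None
```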
@@ -243,7 +192,7 @@ class DataMapping:
         return pa.float64()

         if python_type == decimal.Decimal:
-            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)
+            return pa.decimal128(cls.DEFAULT_DECIMAL_PRECISION, cls.DEFAULT_DECIMAL_SCALE)

         if python_type == str:
             return pa.utf8()
@@ -252,7 +201,7 @@
         return pa.date32()

         if python_type == dt.datetime:
-            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)
+            return pa.timestamp(cls.DEFAULT_TIMESTAMP_UNIT, cls.DEFAULT_TIMESTAMP_ZONE)

         raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")

@@ -293,8 +242,8 @@ class DataMapping:
     def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

         return pa.decimal128(
-            cls.__TRAC_DECIMAL_PRECISION,
-            cls.__TRAC_DECIMAL_SCALE,)
+            cls.DEFAULT_DECIMAL_PRECISION,
+            cls.DEFAULT_DECIMAL_SCALE,)

     @classmethod
     def arrow_to_trac_schema(cls, arrow_schema: pa.Schema) -> _meta.SchemaDefinition:
@@ -337,41 +286,6 @@ class DataMapping:

         raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")

-    @classmethod
-    def pandas_date_type(cls):
-        return cls.__PANDAS_DATE_TYPE
-
-    @classmethod
-    def pandas_datetime_type(cls, tz=None, unit=None):
-        return cls.__pandas_datetime_type(tz, unit)
-
-    @classmethod
-    def view_to_pandas(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
-            temporal_objects_flag: bool) -> "pandas.DataFrame":
-
-        table = cls.view_to_arrow(view, part)
-        return cls.arrow_to_pandas(table, schema, temporal_objects_flag)
-
-    @classmethod
-    def view_to_polars(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema]):
-
-        table = cls.view_to_arrow(view, part)
-        return cls.arrow_to_polars(table, schema)
-
-    @classmethod
-    def pandas_to_item(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
-
-        table = cls.pandas_to_arrow(df, schema)
-        return DataItem(table.schema, table)
-
-    @classmethod
-    def polars_to_item(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
-
-        table = cls.polars_to_arrow(df, schema)
-        return DataItem(table.schema, table)
-
     @classmethod
     def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:

@@ -420,108 +334,306 @@ class DataMapping:

     @classmethod
     def arrow_to_pandas(
-            cls, table: pa.Table,
+            cls, table: pa.Table,
+            schema: tp.Optional[pa.Schema] = None,
             temporal_objects_flag: bool = False) -> "pandas.DataFrame":

-        else:
-            DataConformance.check_duplicate_fields(table.schema.names, False)
+        # This is a legacy internal method and should be removed
+        # DataMapping is no longer responsible for individual data APIs

-        #
-        return table.to_pandas(
+        # Maintained temporarily for compatibility with existing deployments

+        converter = PandasArrowConverter(_api.PANDAS, use_temporal_objects=temporal_objects_flag)
+        return converter.from_internal(table, schema)

+    @classmethod
+    def pandas_to_arrow(
+            cls, df: "pandas.DataFrame",
+            schema: tp.Optional[pa.Schema] = None) -> pa.Table:

+        # This is a legacy internal method and should be removed
+        # DataMapping is no longer responsible for individual data APIs

-            # This is a significant performance win for very wide datasets
-            split_blocks=True)  # noqa
+        # Maintained temporarily for compatibility with existing deployments

-            cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> "polars.DataFrame":
+        converter = PandasArrowConverter(_api.PANDAS)
+        return converter.to_internal(df, schema)

-        if schema is not None:
-            table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
-        else:
-            DataConformance.check_duplicate_fields(table.schema.names, False)

-        return polars.from_arrow(table)

+T_DATA_API = tp.TypeVar("T_DATA_API")
+T_INTERNAL_DATA = tp.TypeVar("T_INTERNAL_DATA")
+T_INTERNAL_SCHEMA = tp.TypeVar("T_INTERNAL_SCHEMA")

-        # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
-        # Calling Table.from_pandas with the supplied schema will very often reject data
-        # Instead, we convert the dataframe as-is and then apply type conversion in a second step
-        # This allows us to apply specific coercion rules for each data type

-        # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):

+    # Available per-framework args, to enable framework-specific type-checking in public APIs
+    # These should (for a purist point of view) be in the individual converter classes
+    # For now there are only a few converters, they are all defined here so this is OK
+    __FRAMEWORK_ARGS = {
+        _api.PANDAS: {"use_temporal_objects": tp.Optional[bool]},
+        _api.POLARS: {}
+    }

+    @classmethod
+    def get_framework(cls, dataset: _api.DATA_API) -> _api.DataFramework[_api.DATA_API]:

+        if pandas is not None and isinstance(dataset, pandas.DataFrame):
+            return _api.PANDAS

-        # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+        if polars is not None and isinstance(dataset, polars.DataFrame):
+            return _api.POLARS

+        data_api_type = f"{type(dataset).__module__}.{type(dataset).__name__}"
+        raise _ex.EPluginNotAvailable(f"No data framework available for type [{data_api_type}]")

+    @classmethod
+    def get_framework_args(cls, framework: _api.DataFramework[_api.DATA_API]) -> tp.Dict[str, type]:

+        return cls.__FRAMEWORK_ARGS.get(framework) or {}

-        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+    @classmethod
+    def for_framework(cls, framework: _api.DataFramework[_api.DATA_API], **framework_args) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":

-        if
+        if framework == _api.PANDAS:
+            if pandas is not None:
+                return PandasArrowConverter(framework, **framework_args)
+            else:
+                raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")

+        if framework == _api.POLARS:
+            if polars is not None:
+                return PolarsArrowConverter(framework)
+            else:
+                raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")

-        df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
-        return DataConformance.conform_to_schema(table, schema, df_types)
+        raise _ex.EPluginNotAvailable(f"Data framework [{framework}] is not recognized")

     @classmethod
-    def
+    def for_dataset(cls, dataset: _api.DATA_API) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":

-        return
+        return cls.for_framework(cls.get_framework(dataset))

     @classmethod
-    def
+    def noop(cls) -> "DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]":
+        return NoopConverter()
+
+    def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+        self.framework = framework
+
+    @abc.abstractmethod
+    def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+        pass
+
+    @abc.abstractmethod
+    def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+        pass
+
+    @abc.abstractmethod
+    def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+        pass
+
+
+class NoopConverter(DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
+
+    def __init__(self):
+        super().__init__(_api.DataFramework("internal", None))  # noqa
+
+    def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+        return dataset
+
+    def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+        return dataset
+
+    def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+        raise _ex.EUnexpected()  # A real converter should be selected before use
+
+
+# Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+if pandas is not None:

+    class PandasArrowConverter(DataConverter[pandas.DataFrame, pa.Table, pa.Schema]):

+        # Check the Pandas dtypes for handling floats are available before setting up the type mapping
+        __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
+        __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+        __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+        if __PANDAS_MAJOR_VERSION == 2:
+
+            __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+            __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+
+            @classmethod
+            def __pandas_datetime_type(cls, tz, unit):
+                if tz is None and unit is None:
+                    return cls.__PANDAS_DATETIME_TYPE
+                _unit = unit if unit is not None else DataMapping.DEFAULT_TIMESTAMP_UNIT
+                if tz is None:
+                    return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+                else:
+                    return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
+
+        # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
+        elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+            __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
+            __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+            @classmethod
+            def __pandas_datetime_type(cls, tz, unit):  # noqa
+                if tz is None:
+                    return cls.__PANDAS_DATETIME_TYPE
+                else:
+                    return pandas.DatetimeTZDtype(tz=tz)

-        if schema is None:
-            DataConformance.check_duplicate_fields(table.schema.names, False)
-            return table
         else:
+            raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
+
+        # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
+        __ARROW_TO_PANDAS_TYPE_MAPPING = {
+            pa.bool_(): pandas.BooleanDtype(),
+            pa.int8(): pandas.Int8Dtype(),
+            pa.int16(): pandas.Int16Dtype(),
+            pa.int32(): pandas.Int32Dtype(),
+            pa.int64(): pandas.Int64Dtype(),
+            pa.uint8(): pandas.UInt8Dtype(),
+            pa.uint16(): pandas.UInt16Dtype(),
+            pa.uint32(): pandas.UInt32Dtype(),
+            pa.uint64(): pandas.UInt64Dtype(),
+            pa.float16(): pandas.Float32Dtype(),
+            pa.float32(): pandas.Float32Dtype(),
+            pa.float64(): pandas.Float64Dtype(),
+            pa.string(): pandas.StringDtype(),
+            pa.utf8(): pandas.StringDtype()
+        }
+
+        __DEFAULT_TEMPORAL_OBJECTS = False
+
+        # Expose date type for testing
+        @classmethod
+        def pandas_date_type(cls):
+            return cls.__PANDAS_DATE_TYPE

+        # Expose datetime type for testing
+        @classmethod
+        def pandas_datetime_type(cls, tz=None, unit=None):
+            return cls.__pandas_datetime_type(tz, unit)
+
+        def __init__(self, framework: _api.DataFramework[T_DATA_API], use_temporal_objects: tp.Optional[bool] = None):
+            super().__init__(framework)
+            if use_temporal_objects is None:
+                self.__temporal_objects_flag = self.__DEFAULT_TEMPORAL_OBJECTS
+            else:
+                self.__temporal_objects_flag = use_temporal_objects
+
+        def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> pandas.DataFrame:
+
+            if schema is not None:
+                table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+            else:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+
+            # Use Arrow's built-in function to convert to Pandas
+            return table.to_pandas(
+
+                # Mapping for arrow -> pandas types for core types
+                types_mapper=self.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
+
+                # Use Python objects for dates and times if temporal_objects_flag is set
+                date_as_object=self.__temporal_objects_flag,  # noqa
+                timestamp_as_object=self.__temporal_objects_flag,  # noqa
+
+                # Do not bring any Arrow metadata into Pandas dataframe
+                ignore_metadata=True,  # noqa
+
+                # Do not consolidate memory across columns when preparing the Pandas vectors
+                # This is a significant performance win for very wide datasets
+                split_blocks=True)  # noqa
+
+        def to_internal(self, df: pandas.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+            # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
+            # Calling Table.from_pandas with the supplied schema will very often reject data
+            # Instead, we convert the dataframe as-is and then apply type conversion in a second step
+            # This allows us to apply specific coercion rules for each data type
+
+            # As an optimisation, the column filter means columns will not be converted if they are not needed
+            # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+
+            column_filter = DataConformance.column_filter(df.columns, schema)  # noqa
+
+            if len(df) > 0:
+
+                table = pa.Table.from_pandas(df, columns=column_filter, preserve_index=False)  # noqa
+
+            # Special case handling for converting an empty dataframe
+            # These must flow through the pipe with valid schemas, like any other dataset
+            # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+
+            else:
+
+                empty_df = df.filter(column_filter) if column_filter else df
+                empty_schema = pa.Schema.from_pandas(empty_df, preserve_index=False)  # noqa
+
+                table = pa.Table.from_batches(list(), empty_schema)  # noqa
+
+            # If there is no explict schema, give back the table exactly as it was received from Pandas
+            # There could be an option here to infer and coerce for TRAC standard types
+            # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+
+            if schema is None:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+                return table
+
+            # If a schema has been supplied, apply data conformance
+            # If column filtering has been applied, we also need to filter the pandas dtypes used for hinting
+
+            else:
+                df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
+                return DataConformance.conform_to_schema(table, schema, df_types)
+
+        def infer_schema(self, dataset: pandas.DataFrame) -> _meta.SchemaDefinition:
+
+            arrow_schema = pa.Schema.from_pandas(dataset, preserve_index=False)  # noqa
+            return DataMapping.arrow_to_trac_schema(arrow_schema)
+
+
+# Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+if polars is not None:
+
+    class PolarsArrowConverter(DataConverter[polars.DataFrame, pa.Table, pa.Schema]):
+
+        def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+            super().__init__(framework)
+
+        def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> polars.DataFrame:
+
+            if schema is not None:
+                table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+            else:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+
+            return polars.from_arrow(table)
+
+        def to_internal(self, df: polars.DataFrame, schema: tp.Optional[pa.Schema] = None,) -> pa.Table:
+
+            column_filter = DataConformance.column_filter(df.columns, schema)
+
+            filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
+            table = filtered_df.to_arrow()
+
+            if schema is None:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+                return table
+            else:
+                return DataConformance.conform_to_schema(table, schema, None)
+
+        def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+
+            arrow_schema = dataset.top_k(1).to_arrow().schema
+            return DataMapping.arrow_to_trac_schema(arrow_schema)


 class DataConformance:
@@ -652,7 +764,7 @@ class DataConformance:
         # Columns not defined in the schema will not be included in the conformed output
         if warn_extra_columns and table.num_columns > len(schema.types):

-            schema_columns = set(map(
+            schema_columns = set(map(lambda c: c.lower(), schema.names))
             extra_columns = [
                 f"[{col}]"
                 for col in table.schema.names
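Taken together, the data.py changes move per-framework conversion out of DataMapping and into the new DataConverter hierarchy, with pandas and polars handled by optional converter subclasses. The sketch below shows how the new pieces fit together; it is based only on the definitions visible in this diff, and these are internal (`_impl`) modules rather than a stable public API:

```python
# Hedged usage sketch for the new DataConverter API defined in the diff above
# (assumptions: internal tracdap modules, pandas installed; not a stable interface)
import pandas as pd
import tracdap.rt._impl.data as _data
import tracdap.rt.api.experimental as trac_api

df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})

# Select a converter by dataset type (pandas.DataFrame -> the PANDAS framework)
converter = _data.DataConverter.for_dataset(df)

# Convert to the internal representation (pyarrow.Table) and back again
table = converter.to_internal(df)
round_trip = converter.from_internal(table)

# Infer a TRAC SchemaDefinition from the dataset
trac_schema = converter.infer_schema(df)

# The legacy DataMapping entry points are now thin wrappers over the converters,
# "maintained temporarily for compatibility with existing deployments"
df_legacy = _data.DataMapping.arrow_to_pandas(table)
df_new = _data.DataConverter.for_framework(trac_api.PANDAS).from_internal(table)
assert df_legacy.equals(df_new)
```

Note that for_framework and for_dataset raise EPluginNotAvailable when the corresponding optional package is missing, which is what lets the module load with either pandas or polars absent.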
tracdap/rt/_impl/ext/__init__.py
ADDED
@@ -0,0 +1,14 @@
+# Licensed to the Fintech Open Source Foundation (FINOS) under one or
+# more contributor license agreements. See the NOTICE file distributed
+# with this work for additional information regarding copyright ownership.
+# FINOS licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with the
+# License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.