tracdap-runtime 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (41)
  1. tracdap/rt/_exec/context.py +556 -36
  2. tracdap/rt/_exec/dev_mode.py +320 -198
  3. tracdap/rt/_exec/engine.py +331 -62
  4. tracdap/rt/_exec/functions.py +151 -22
  5. tracdap/rt/_exec/graph.py +47 -13
  6. tracdap/rt/_exec/graph_builder.py +383 -175
  7. tracdap/rt/_exec/runtime.py +7 -5
  8. tracdap/rt/_impl/config_parser.py +11 -4
  9. tracdap/rt/_impl/data.py +329 -152
  10. tracdap/rt/_impl/ext/__init__.py +13 -0
  11. tracdap/rt/_impl/ext/sql.py +116 -0
  12. tracdap/rt/_impl/ext/storage.py +57 -0
  13. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +82 -30
  14. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +155 -2
  15. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +12 -10
  16. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.pyi +14 -2
  17. tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.py +29 -0
  18. tracdap/rt/_impl/grpc/tracdap/metadata/resource_pb2.pyi +16 -0
  19. tracdap/rt/_impl/models.py +8 -0
  20. tracdap/rt/_impl/static_api.py +29 -0
  21. tracdap/rt/_impl/storage.py +39 -27
  22. tracdap/rt/_impl/util.py +10 -0
  23. tracdap/rt/_impl/validation.py +140 -18
  24. tracdap/rt/_plugins/repo_git.py +1 -1
  25. tracdap/rt/_plugins/storage_sql.py +417 -0
  26. tracdap/rt/_plugins/storage_sql_dialects.py +117 -0
  27. tracdap/rt/_version.py +1 -1
  28. tracdap/rt/api/experimental.py +267 -0
  29. tracdap/rt/api/hook.py +14 -0
  30. tracdap/rt/api/model_api.py +48 -6
  31. tracdap/rt/config/__init__.py +2 -2
  32. tracdap/rt/config/common.py +6 -0
  33. tracdap/rt/metadata/__init__.py +29 -20
  34. tracdap/rt/metadata/job.py +99 -0
  35. tracdap/rt/metadata/model.py +18 -0
  36. tracdap/rt/metadata/resource.py +24 -0
  37. {tracdap_runtime-0.6.4.dist-info → tracdap_runtime-0.6.6.dist-info}/METADATA +5 -1
  38. {tracdap_runtime-0.6.4.dist-info → tracdap_runtime-0.6.6.dist-info}/RECORD +41 -32
  39. {tracdap_runtime-0.6.4.dist-info → tracdap_runtime-0.6.6.dist-info}/WHEEL +1 -1
  40. {tracdap_runtime-0.6.4.dist-info → tracdap_runtime-0.6.6.dist-info}/LICENSE +0 -0
  41. {tracdap_runtime-0.6.4.dist-info → tracdap_runtime-0.6.6.dist-info}/top_level.txt +0 -0
@@ -154,7 +154,6 @@ class TracRuntime:
         _plugins.PluginManager.register_plugin_package(plugin_package)
 
         _static_api.StaticApiImpl.register_impl()
-        _guard.PythonGuardRails.protect_dangerous_functions()
 
         # Load sys config (or use embedded), config errors are detected before start()
         # Job config can also be checked before start() by using load_job_config()
@@ -201,6 +200,11 @@ class TracRuntime:
         self._models = _models.ModelLoader(self._sys_config, self._scratch_dir)
         self._storage = _storage.StorageManager(self._sys_config)
 
+        # Enable protection after the initial setup of the runtime is complete
+        # Storage plugins in particular are likely to tigger protected imports
+        # Once the runtime is up, no more plugins should be loaded
+        _guard.PythonGuardRails.protect_dangerous_functions()
+
         self._engine = _engine.TracEngine(
             self._sys_config, self._models, self._storage,
             notify_callback=self._engine_callback)
@@ -329,10 +333,8 @@ class TracRuntime:
             config_file_name="job")
 
         if self._dev_mode:
-            job_config = _dev_mode.DevModeTranslator.translate_job_config(
-                self._sys_config, job_config,
-                self._scratch_dir, self._config_mgr,
-                model_class)
+            translator = _dev_mode.DevModeTranslator(self._sys_config, self._config_mgr, self._scratch_dir)
+            job_config = translator.translate_job_config(job_config, model_class)
 
         return job_config
 
@@ -341,10 +341,17 @@ class ConfigParser(tp.Generic[_T]):
 
             if isinstance(raw_value, tp.Dict):
                 return self._parse_simple_class(location, raw_value, annotation)
-            elif self._is_dev_mode_location(location) and type(raw_value) in ConfigParser.__primitive_types:
-                return self._parse_primitive(location, raw_value, type(raw_value))
-            else:
-                return self._error(location, f"Expected type {annotation.__name__}, got '{str(raw_value)}'")
+
+            if self._is_dev_mode_location(location):
+                if type(raw_value) in ConfigParser.__primitive_types:
+                    return self._parse_primitive(location, raw_value, type(raw_value))
+                if isinstance(raw_value, list):
+                    if len(raw_value) == 0:
+                        return []
+                    list_type = type(raw_value[0])
+                    return list(map(lambda x: self._parse_primitive(location, x, list_type), raw_value))
+
+            return self._error(location, f"Expected type {annotation.__name__}, got '{str(raw_value)}'")
 
         if isinstance(annotation, self.__generic_metaclass):
             return self._parse_generic_class(location, raw_value, annotation)  # noqa
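For context, the new dev-mode branch above accepts a plain list of primitive values where a structured value was previously required, parsing every element with the type of the first element. A minimal standalone sketch of that behaviour, using hypothetical values and a simplified stand-in for _parse_primitive:

    # Standalone sketch only: hypothetical values, _parse_primitive reduced to a constructor call
    raw_value = [0.25, 0.5, 0.75]   # e.g. a list supplied directly in a dev-mode config location

    if isinstance(raw_value, list):
        if len(raw_value) == 0:
            parsed = []
        else:
            list_type = type(raw_value[0])                          # float in this example
            parsed = list(map(lambda x: list_type(x), raw_value))   # stand-in for _parse_primitive

    print(parsed)   # [0.25, 0.5, 0.75]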
tracdap/rt/_impl/data.py CHANGED
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import annotations
-
+import abc
 import dataclasses as dc
 import typing as tp
 import datetime as dt
@@ -22,8 +21,18 @@ import platform
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pandas as pd
 
+try:
+    import pandas  # noqa
+except ModuleNotFoundError:
+    pandas = None
+
+try:
+    import polars  # noqa
+except ModuleNotFoundError:
+    polars = None
+
+import tracdap.rt.api.experimental as _api
 import tracdap.rt.metadata as _meta
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._impl.util as _util
@@ -42,7 +51,7 @@ class DataSpec:
 class DataPartKey:
 
     @classmethod
-    def for_root(cls) -> DataPartKey:
+    def for_root(cls) -> "DataPartKey":
         return DataPartKey(opaque_key='part_root')
 
     opaque_key: str
@@ -55,14 +64,14 @@ class DataItem:
     table: tp.Optional[pa.Table] = None
     batches: tp.Optional[tp.List[pa.RecordBatch]] = None
 
-    pandas: tp.Optional[pd.DataFrame] = None
+    pandas: "tp.Optional[pandas.DataFrame]" = None
     pyspark: tp.Any = None
 
     def is_empty(self) -> bool:
         return self.table is None and (self.batches is None or len(self.batches) == 0)
 
     @staticmethod
-    def create_empty() -> DataItem:
+    def create_empty() -> "DataItem":
        return DataItem(pa.schema([]))
 
 
@@ -75,7 +84,7 @@ class DataView:
     parts: tp.Dict[DataPartKey, tp.List[DataItem]]
 
     @staticmethod
-    def create_empty() -> DataView:
+    def create_empty() -> "DataView":
         return DataView(_meta.SchemaDefinition(), pa.schema([]), dict())
 
     @staticmethod
@@ -109,73 +118,19 @@ class DataMapping:
 
     # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data
 
-    __TRAC_DECIMAL_PRECISION = 38
-    __TRAC_DECIMAL_SCALE = 12
-    __TRAC_TIMESTAMP_UNIT = "ms"
-    __TRAC_TIMESTAMP_ZONE = None
+    DEFAULT_DECIMAL_PRECISION = 38
+    DEFAULT_DECIMAL_SCALE = 12
+    DEFAULT_TIMESTAMP_UNIT = "ms"
+    DEFAULT_TIMESTAMP_ZONE = None
 
     __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
         _meta.BasicType.BOOLEAN: pa.bool_(),
         _meta.BasicType.INTEGER: pa.int64(),
         _meta.BasicType.FLOAT: pa.float64(),
-        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
+        _meta.BasicType.DECIMAL: pa.decimal128(DEFAULT_DECIMAL_PRECISION, DEFAULT_DECIMAL_SCALE),
         _meta.BasicType.STRING: pa.utf8(),
         _meta.BasicType.DATE: pa.date32(),
-        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
-    }
-
-    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
-    __PANDAS_VERSION_ELEMENTS = pd.__version__.split(".")
-    __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
-    __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
-
-    if __PANDAS_MAJOR_VERSION == 2:
-
-        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-
-        @classmethod
-        def __pandas_datetime_type(cls, tz, unit):
-            if tz is None and unit is None:
-                return cls.__PANDAS_DATETIME_TYPE
-            _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
-            if tz is None:
-                return pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
-            else:
-                return pd.DatetimeTZDtype(tz=tz, unit=_unit)
-
-    # Minimum supported version for Pandas is 1.2, when pd.Float64Dtype was introduced
-    elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
-
-        __PANDAS_DATE_TYPE = pd.to_datetime([dt.date(2000, 1, 1)]).dtype
-        __PANDAS_DATETIME_TYPE = pd.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
-
-        @classmethod
-        def __pandas_datetime_type(cls, tz, unit):  # noqa
-            if tz is None:
-                return cls.__PANDAS_DATETIME_TYPE
-            else:
-                return pd.DatetimeTZDtype(tz=tz)
-
-    else:
-        raise _ex.EStartup(f"Pandas version not supported: [{pd.__version__}]")
-
-    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
-    __ARROW_TO_PANDAS_TYPE_MAPPING = {
-        pa.bool_(): pd.BooleanDtype(),
-        pa.int8(): pd.Int8Dtype(),
-        pa.int16(): pd.Int16Dtype(),
-        pa.int32(): pd.Int32Dtype(),
-        pa.int64(): pd.Int64Dtype(),
-        pa.uint8(): pd.UInt8Dtype(),
-        pa.uint16(): pd.UInt16Dtype(),
-        pa.uint32(): pd.UInt32Dtype(),
-        pa.uint64(): pd.UInt64Dtype(),
-        pa.float16(): pd.Float32Dtype(),
-        pa.float32(): pd.Float32Dtype(),
-        pa.float64(): pd.Float64Dtype(),
-        pa.string(): pd.StringDtype(),
-        pa.utf8(): pd.StringDtype()
+        _meta.BasicType.DATETIME: pa.timestamp(DEFAULT_TIMESTAMP_UNIT, DEFAULT_TIMESTAMP_ZONE)
     }
 
     __ARROW_TO_TRAC_BASIC_TYPE_MAPPING = {
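The four type defaults above were previously private (__TRAC_* names) and are now public class attributes on DataMapping. A small sketch of how they translate into Arrow types (assumes pyarrow and this internal module are importable; illustrative only, not part of the diff):

    import pyarrow as pa
    import tracdap.rt._impl.data as _data   # internal module, not a public API

    decimal_type = pa.decimal128(
        _data.DataMapping.DEFAULT_DECIMAL_PRECISION,   # 38
        _data.DataMapping.DEFAULT_DECIMAL_SCALE)       # 12
    timestamp_type = pa.timestamp(
        _data.DataMapping.DEFAULT_TIMESTAMP_UNIT,      # "ms"
        _data.DataMapping.DEFAULT_TIMESTAMP_ZONE)      # None, i.e. a zone-less timestamp

    print(decimal_type)     # decimal128(38, 12)
    print(timestamp_type)   # timestamp[ms]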
@@ -236,7 +191,7 @@ class DataMapping:
             return pa.float64()
 
         if python_type == decimal.Decimal:
-            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)
+            return pa.decimal128(cls.DEFAULT_DECIMAL_PRECISION, cls.DEFAULT_DECIMAL_SCALE)
 
         if python_type == str:
             return pa.utf8()
@@ -245,7 +200,7 @@
             return pa.date32()
 
         if python_type == dt.datetime:
-            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)
+            return pa.timestamp(cls.DEFAULT_TIMESTAMP_UNIT, cls.DEFAULT_TIMESTAMP_ZONE)
 
         raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")
 
@@ -286,8 +241,8 @@
     def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:
 
         return pa.decimal128(
-            cls.__TRAC_DECIMAL_PRECISION,
-            cls.__TRAC_DECIMAL_SCALE)
+            cls.DEFAULT_DECIMAL_PRECISION,
+            cls.DEFAULT_DECIMAL_SCALE,)
 
     @classmethod
     def arrow_to_trac_schema(cls, arrow_schema: pa.Schema) -> _meta.SchemaDefinition:
@@ -330,28 +285,6 @@ class DataMapping:
 
         raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")
 
-    @classmethod
-    def pandas_date_type(cls):
-        return cls.__PANDAS_DATE_TYPE
-
-    @classmethod
-    def pandas_datetime_type(cls, tz=None, unit=None):
-        return cls.__pandas_datetime_type(tz, unit)
-
-    @classmethod
-    def view_to_pandas(
-            cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
-            temporal_objects_flag: bool) -> pd.DataFrame:
-
-        table = cls.view_to_arrow(view, part)
-        return cls.arrow_to_pandas(table, schema, temporal_objects_flag)
-
-    @classmethod
-    def pandas_to_item(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema]) -> DataItem:
-
-        table = cls.pandas_to_arrow(df, schema)
-        return DataItem(table.schema, table)
-
     @classmethod
     def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:
 
@@ -400,73 +333,306 @@ class DataMapping:
 
     @classmethod
     def arrow_to_pandas(
-            cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None,
-            temporal_objects_flag: bool = False) -> pd.DataFrame:
+            cls, table: pa.Table,
+            schema: tp.Optional[pa.Schema] = None,
+            temporal_objects_flag: bool = False) -> "pandas.DataFrame":
 
-        if schema is not None:
-            table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
-        else:
-            DataConformance.check_duplicate_fields(table.schema.names, False)
+        # This is a legacy internal method and should be removed
+        # DataMapping is no longer responsible for individual data APIs
+
+        # Maintained temporarily for compatibility with existing deployments
+
+        converter = PandasArrowConverter(_api.PANDAS, use_temporal_objects=temporal_objects_flag)
+        return converter.from_internal(table, schema)
+
+    @classmethod
+    def pandas_to_arrow(
+            cls, df: "pandas.DataFrame",
+            schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+        # This is a legacy internal method and should be removed
+        # DataMapping is no longer responsible for individual data APIs
+
+        # Maintained temporarily for compatibility with existing deployments
 
-        # Use Arrow's built-in function to convert to Pandas
-        return table.to_pandas(
+        converter = PandasArrowConverter(_api.PANDAS)
+        return converter.to_internal(df, schema)
 
-            # Mapping for arrow -> pandas types for core types
-            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
 
-            # Use Python objects for dates and times if temporal_objects_flag is set
-            date_as_object=temporal_objects_flag,  # noqa
-            timestamp_as_object=temporal_objects_flag,  # noqa
 
-            # Do not bring any Arrow metadata into Pandas dataframe
-            ignore_metadata=True,  # noqa
+T_DATA_API = tp.TypeVar("T_DATA_API")
+T_INTERNAL_DATA = tp.TypeVar("T_INTERNAL_DATA")
+T_INTERNAL_SCHEMA = tp.TypeVar("T_INTERNAL_SCHEMA")
 
-            # Do not consolidate memory across columns when preparing the Pandas vectors
-            # This is a significant performance win for very wide datasets
-            split_blocks=True)  # noqa
+
+class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
+
+    # Available per-framework args, to enable framework-specific type-checking in public APIs
+    # These should (for a purist point of view) be in the individual converter classes
+    # For now there are only a few converters, they are all defined here so this is OK
+    __FRAMEWORK_ARGS = {
+        _api.PANDAS: {"use_temporal_objects": tp.Optional[bool]},
+        _api.POLARS: {}
+    }
 
     @classmethod
-    def pandas_to_arrow(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+    def get_framework(cls, dataset: _api.DATA_API) -> _api.DataFramework[_api.DATA_API]:
+
+        if pandas is not None and isinstance(dataset, pandas.DataFrame):
+            return _api.PANDAS
 
-        # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
-        # Calling Table.from_pandas with the supplied schema will very often reject data
-        # Instead, we convert the dataframe as-is and then apply type conversion in a second step
-        # This allows us to apply specific coercion rules for each data type
+        if polars is not None and isinstance(dataset, polars.DataFrame):
+            return _api.POLARS
 
-        # As an optimisation, the column filter means columns will not be converted if they are not needed
-        # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+        data_api_type = f"{type(dataset).__module__}.{type(dataset).__name__}"
+        raise _ex.EPluginNotAvailable(f"No data framework available for type [{data_api_type}]")
 
-        column_filter = DataConformance.column_filter(df.columns, schema)  # noqa
+    @classmethod
+    def get_framework_args(cls, framework: _api.DataFramework[_api.DATA_API]) -> tp.Dict[str, type]:
 
-        if len(df) > 0:
+        return cls.__FRAMEWORK_ARGS.get(framework) or {}
 
-            table = pa.Table.from_pandas(df, columns=column_filter, preserve_index=False)  # noqa
+    @classmethod
+    def for_framework(cls, framework: _api.DataFramework[_api.DATA_API], **framework_args) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":
 
-        # Special case handling for converting an empty dataframe
-        # These must flow through the pipe with valid schemas, like any other dataset
-        # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+        if framework == _api.PANDAS:
+            if pandas is not None:
+                return PandasArrowConverter(framework, **framework_args)
+            else:
+                raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")
 
-        else:
+        if framework == _api.POLARS:
+            if polars is not None:
+                return PolarsArrowConverter(framework)
+            else:
+                raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")
+
+        raise _ex.EPluginNotAvailable(f"Data framework [{framework}] is not recognized")
+
+    @classmethod
+    def for_dataset(cls, dataset: _api.DATA_API) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":
+
+        return cls.for_framework(cls.get_framework(dataset))
+
+    @classmethod
+    def noop(cls) -> "DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]":
+        return NoopConverter()
+
+    def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+        self.framework = framework
 
-            empty_df = df.filter(column_filter) if column_filter else df
-            empty_schema = pa.Schema.from_pandas(empty_df, preserve_index=False)  # noqa
+    @abc.abstractmethod
+    def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+        pass
 
-            table = pa.Table.from_batches(list(), empty_schema)  # noqa
+    @abc.abstractmethod
+    def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+        pass
 
-        # If there is no explict schema, give back the table exactly as it was received from Pandas
-        # There could be an option here to infer and coerce for TRAC standard types
-        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+    @abc.abstractmethod
+    def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+        pass
 
-        if schema is None:
-            DataConformance.check_duplicate_fields(table.schema.names, False)
-            return table
 
-        # If a schema has been supplied, apply data conformance
-        # If column filtering has been applied, we also need to filter the pandas dtypes used for hinting
+class NoopConverter(DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
+
+    def __init__(self):
+        super().__init__(_api.DataFramework("internal", None))  # noqa
+
+    def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+        return dataset
+
+    def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+        return dataset
+
+    def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+        raise _ex.EUnexpected()  # A real converter should be selected before use
+
+
+# Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+if pandas is not None:
+
+    class PandasArrowConverter(DataConverter[pandas.DataFrame, pa.Table, pa.Schema]):
+
+        # Check the Pandas dtypes for handling floats are available before setting up the type mapping
+        __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
+        __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+        __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+        if __PANDAS_MAJOR_VERSION == 2:
+
+            __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+            __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+
+            @classmethod
+            def __pandas_datetime_type(cls, tz, unit):
+                if tz is None and unit is None:
+                    return cls.__PANDAS_DATETIME_TYPE
+                _unit = unit if unit is not None else DataMapping.DEFAULT_TIMESTAMP_UNIT
+                if tz is None:
+                    return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+                else:
+                    return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
+
+        # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
+        elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+            __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
+            __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+            @classmethod
+            def __pandas_datetime_type(cls, tz, unit):  # noqa
+                if tz is None:
+                    return cls.__PANDAS_DATETIME_TYPE
+                else:
+                    return pandas.DatetimeTZDtype(tz=tz)
 
         else:
-            df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
-            return DataConformance.conform_to_schema(table, schema, df_types)
+            raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
+
+        # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
+        __ARROW_TO_PANDAS_TYPE_MAPPING = {
+            pa.bool_(): pandas.BooleanDtype(),
+            pa.int8(): pandas.Int8Dtype(),
+            pa.int16(): pandas.Int16Dtype(),
+            pa.int32(): pandas.Int32Dtype(),
+            pa.int64(): pandas.Int64Dtype(),
+            pa.uint8(): pandas.UInt8Dtype(),
+            pa.uint16(): pandas.UInt16Dtype(),
+            pa.uint32(): pandas.UInt32Dtype(),
+            pa.uint64(): pandas.UInt64Dtype(),
+            pa.float16(): pandas.Float32Dtype(),
+            pa.float32(): pandas.Float32Dtype(),
+            pa.float64(): pandas.Float64Dtype(),
+            pa.string(): pandas.StringDtype(),
+            pa.utf8(): pandas.StringDtype()
+        }
+
+        __DEFAULT_TEMPORAL_OBJECTS = False
+
+        # Expose date type for testing
+        @classmethod
+        def pandas_date_type(cls):
+            return cls.__PANDAS_DATE_TYPE
+
+        # Expose datetime type for testing
+        @classmethod
+        def pandas_datetime_type(cls, tz=None, unit=None):
+            return cls.__pandas_datetime_type(tz, unit)
+
+        def __init__(self, framework: _api.DataFramework[T_DATA_API], use_temporal_objects: tp.Optional[bool] = None):
+            super().__init__(framework)
+            if use_temporal_objects is None:
+                self.__temporal_objects_flag = self.__DEFAULT_TEMPORAL_OBJECTS
+            else:
+                self.__temporal_objects_flag = use_temporal_objects
+
+        def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> pandas.DataFrame:
+
+            if schema is not None:
+                table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+            else:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+
+            # Use Arrow's built-in function to convert to Pandas
+            return table.to_pandas(
+
+                # Mapping for arrow -> pandas types for core types
+                types_mapper=self.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
+
+                # Use Python objects for dates and times if temporal_objects_flag is set
+                date_as_object=self.__temporal_objects_flag,  # noqa
+                timestamp_as_object=self.__temporal_objects_flag,  # noqa
+
+                # Do not bring any Arrow metadata into Pandas dataframe
+                ignore_metadata=True,  # noqa
+
+                # Do not consolidate memory across columns when preparing the Pandas vectors
+                # This is a significant performance win for very wide datasets
+                split_blocks=True)  # noqa
+
+        def to_internal(self, df: pandas.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+            # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
+            # Calling Table.from_pandas with the supplied schema will very often reject data
+            # Instead, we convert the dataframe as-is and then apply type conversion in a second step
+            # This allows us to apply specific coercion rules for each data type
+
+            # As an optimisation, the column filter means columns will not be converted if they are not needed
+            # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+
+            column_filter = DataConformance.column_filter(df.columns, schema)  # noqa
+
+            if len(df) > 0:
+
+                table = pa.Table.from_pandas(df, columns=column_filter, preserve_index=False)  # noqa
+
+            # Special case handling for converting an empty dataframe
+            # These must flow through the pipe with valid schemas, like any other dataset
+            # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+
+            else:
+
+                empty_df = df.filter(column_filter) if column_filter else df
+                empty_schema = pa.Schema.from_pandas(empty_df, preserve_index=False)  # noqa
+
+                table = pa.Table.from_batches(list(), empty_schema)  # noqa
+
+            # If there is no explict schema, give back the table exactly as it was received from Pandas
+            # There could be an option here to infer and coerce for TRAC standard types
+            # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+
+            if schema is None:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+                return table
+
+            # If a schema has been supplied, apply data conformance
+            # If column filtering has been applied, we also need to filter the pandas dtypes used for hinting
+
+            else:
+                df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
+                return DataConformance.conform_to_schema(table, schema, df_types)
+
+        def infer_schema(self, dataset: pandas.DataFrame) -> _meta.SchemaDefinition:
+
+            arrow_schema = pa.Schema.from_pandas(dataset, preserve_index=False)  # noqa
+            return DataMapping.arrow_to_trac_schema(arrow_schema)
+
+
+# Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+if polars is not None:
+
+    class PolarsArrowConverter(DataConverter[polars.DataFrame, pa.Table, pa.Schema]):
+
+        def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+            super().__init__(framework)
+
+        def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> polars.DataFrame:
+
+            if schema is not None:
+                table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+            else:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+
+            return polars.from_arrow(table)
+
+        def to_internal(self, df: polars.DataFrame, schema: tp.Optional[pa.Schema] = None,) -> pa.Table:
+
+            column_filter = DataConformance.column_filter(df.columns, schema)
+
+            filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
+            table = filtered_df.to_arrow()
+
+            if schema is None:
+                DataConformance.check_duplicate_fields(table.schema.names, False)
+                return table
+            else:
+                return DataConformance.conform_to_schema(table, schema, None)
+
+        def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+
+            arrow_schema = dataset.top_k(1).to_arrow().schema
+            return DataMapping.arrow_to_trac_schema(arrow_schema)
 
 
 class DataConformance:
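Based on the signatures introduced above, a usage sketch for the new converter layer might look like the following. DataConverter lives in the internal module tracdap.rt._impl.data, so this illustrates the internal API rather than a supported public interface; it assumes pandas is installed and pyarrow.Table is the internal representation, as the class declarations suggest.

    import pandas
    import tracdap.rt._impl.data as _data   # internal module, illustration only

    df = pandas.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})

    converter = _data.DataConverter.for_dataset(df)   # selects PandasArrowConverter
    table = converter.to_internal(df)                 # pandas.DataFrame -> pyarrow.Table
    trac_schema = converter.infer_schema(df)          # TRAC SchemaDefinition
    round_trip = converter.from_internal(table)       # pyarrow.Table -> pandas.DataFrame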
@@ -597,7 +763,7 @@ class DataConformance:
         # Columns not defined in the schema will not be included in the conformed output
         if warn_extra_columns and table.num_columns > len(schema.types):
 
-            schema_columns = set(map(str.lower, schema.names))
+            schema_columns = set(map(lambda c: c.lower(), schema.names))
             extra_columns = [
                 f"[{col}]"
                 for col in table.schema.names
@@ -784,21 +950,32 @@
     @classmethod
     def _coerce_string(cls, vector: pa.Array, field: pa.Field) -> pa.Array:
 
-        if pa.types.is_string(field.type):
-            if pa.types.is_string(vector.type):
-                return vector
+        try:
 
-        if pa.types.is_large_string(field.type):
-            if pa.types.is_large_string(vector.type):
-                return vector
-            # Allow up-casting string -> large_string
-            if pa.types.is_string(vector.type):
-                return pc.cast(vector, field.type)
+            if pa.types.is_string(field.type):
+                if pa.types.is_string(vector.type):
+                    return vector
+                # Try to down-cast large string -> string, will raise ArrowInvalid if data does not fit
+                if pa.types.is_large_string(vector.type):
+                    return pc.cast(vector, field.type, safe=True)
+
+            if pa.types.is_large_string(field.type):
+                if pa.types.is_large_string(vector.type):
+                    return vector
+                # Allow up-casting string -> large_string
+                if pa.types.is_string(vector.type):
+                    return pc.cast(vector, field.type)
 
-        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
-        cls.__log.error(error_message)
+            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message)
+
+        except pa.ArrowInvalid as e:
+
+            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR, vector, field, e)
+            cls.__log.error(error_message)
+            raise _ex.EDataConformance(error_message) from e
 
-        raise _ex.EDataConformance(error_message)
 
     @classmethod
     def _coerce_date(cls, vector: pa.Array, field: pa.Field, pandas_type=None) -> pa.Array:
816
993
  # For Pandas 2.x dates are still np.datetime64 but can be in s, ms, us or ns
817
994
  # This conversion will not apply to dates held in Pandas using the Python date object types
818
995
  if pandas_type is not None:
819
- if pa.types.is_timestamp(vector.type) and pd.api.types.is_datetime64_any_dtype(pandas_type):
996
+ if pa.types.is_timestamp(vector.type) and pandas.api.types.is_datetime64_any_dtype(pandas_type):
820
997
  return pc.cast(vector, field.type)
821
998
 
822
999
  error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector, field)