kumoai 2.14.0.dev202512151351__cp313-cp313-macosx_11_0_arm64.whl → 2.15.0.dev202601121731__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +23 -26
- kumoai/_version.py +1 -1
- kumoai/client/client.py +6 -0
- kumoai/client/jobs.py +26 -0
- kumoai/connector/utils.py +21 -7
- kumoai/experimental/rfm/__init__.py +51 -24
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/graph_store.py +37 -46
- kumoai/experimental/rfm/backend/local/sampler.py +0 -3
- kumoai/experimental/rfm/backend/local/table.py +24 -30
- kumoai/experimental/rfm/backend/snow/sampler.py +197 -90
- kumoai/experimental/rfm/backend/snow/table.py +159 -52
- kumoai/experimental/rfm/backend/sqlite/__init__.py +2 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +199 -99
- kumoai/experimental/rfm/backend/sqlite/table.py +103 -45
- kumoai/experimental/rfm/base/__init__.py +6 -1
- kumoai/experimental/rfm/base/column.py +96 -10
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/mapper.py +69 -0
- kumoai/experimental/rfm/base/sampler.py +28 -18
- kumoai/experimental/rfm/base/source.py +1 -1
- kumoai/experimental/rfm/base/sql_sampler.py +342 -13
- kumoai/experimental/rfm/base/table.py +374 -208
- kumoai/experimental/rfm/base/utils.py +27 -0
- kumoai/experimental/rfm/graph.py +335 -180
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +7 -4
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +5 -4
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +29 -31
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +600 -360
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/experimental/rfm/task_table.py +292 -0
- kumoai/pquery/training_table.py +16 -2
- kumoai/testing/snow.py +3 -3
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/utils/__init__.py +1 -2
- kumoai/utils/display.py +87 -0
- kumoai/utils/progress_logger.py +190 -12
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/METADATA +3 -2
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/RECORD +48 -40
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/WHEEL +0 -0
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/backend/local/table.py

@@ -1,15 +1,15 @@
-import
-from typing import List, Optional, cast
+from typing import Sequence, cast
 
 import pandas as pd
+from kumoapi.model_plan import MissingType
 
 from kumoai.experimental.rfm.base import (
+    ColumnSpec,
     DataBackend,
     SourceColumn,
     SourceForeignKey,
     Table,
 )
-from kumoai.experimental.rfm.infer import infer_dtype
 
 
 class LocalTable(Table):

@@ -57,9 +57,9 @@ class LocalTable(Table):
         self,
         df: pd.DataFrame,
         name: str,
-        primary_key:
-        time_column:
-        end_time_column:
+        primary_key: MissingType | str | None = MissingType.VALUE,
+        time_column: str | None = None,
+        end_time_column: str | None = None,
     ) -> None:
 
         if df.empty:

@@ -75,7 +75,6 @@ class LocalTable(Table):
 
         super().__init__(
             name=name,
-            columns=list(df.columns),
             primary_key=primary_key,
             time_column=time_column,
             end_time_column=end_time_column,

@@ -85,35 +84,30 @@ class LocalTable(Table):
     def backend(self) -> DataBackend:
         return cast(DataBackend, DataBackend.LOCAL)
 
-    def _get_source_columns(self) ->
-
-
-
-
-                dtype = infer_dtype(ser)
-            except Exception:
-                warnings.warn(f"Data type inference for column '{column}' in "
-                              f"table '{self.name}' failed. Consider changing "
-                              f"the data type of the column to use it within "
-                              f"this table.")
-                continue
-
-            source_column = SourceColumn(
-                name=column,
-                dtype=dtype,
+    def _get_source_columns(self) -> list[SourceColumn]:
+        return [
+            SourceColumn(
+                name=column_name,
+                dtype=None,
                 is_primary_key=False,
                 is_unique_key=False,
                 is_nullable=True,
-            )
-
+            ) for column_name in self._data.columns
+        ]
 
-
-
-    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
+    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
         return []
 
-    def
+    def _get_source_sample_df(self) -> pd.DataFrame:
         return self._data
 
-    def
+    def _get_expr_sample_df(
+        self,
+        columns: Sequence[ColumnSpec],
+    ) -> pd.DataFrame:
+        raise RuntimeError(f"Column expressions are not supported in "
+                           f"'{self.__class__.__name__}'. Please apply your "
+                           f"expressions on the `pd.DataFrame` directly.")
+
+    def _get_num_rows(self) -> int | None:
         return len(self._data)
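The hunks above change the `LocalTable` constructor so that `primary_key` defaults to `MissingType.VALUE` while `time_column` and `end_time_column` default to `None`, and the explicit `columns=` argument is dropped. A minimal usage sketch based only on the signature shown above (the import path is an assumption and may differ in the released package):

    import pandas as pd
    from kumoai.experimental.rfm.backend.local.table import LocalTable  # assumed path

    df = pd.DataFrame({
        "user_id": [1, 2, 3],
        "signup_time": pd.to_datetime(["2024-01-01", "2024-02-01", "2024-03-01"]),
    })

    # `primary_key` may now be omitted entirely; `time_column` and
    # `end_time_column` are optional and default to `None`.
    table = LocalTable(df, name="users", primary_key="user_id",
                       time_column="signup_time")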
kumoai/experimental/rfm/backend/snow/sampler.py

@@ -1,4 +1,6 @@
 import json
+from collections.abc import Iterator
+from contextlib import contextmanager
 from typing import TYPE_CHECKING
 
 import numpy as np

@@ -6,15 +8,23 @@ import pandas as pd
 import pyarrow as pa
 from kumoapi.pquery import ValidatedPredictiveQuery
 
-from kumoai.experimental.rfm.backend.snow import SnowTable
-from kumoai.experimental.rfm.base import SQLSampler
+from kumoai.experimental.rfm.backend.snow import Connection, SnowTable
+from kumoai.experimental.rfm.base import SQLSampler, Table
 from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
-from kumoai.utils import ProgressLogger
+from kumoai.utils import ProgressLogger
 
 if TYPE_CHECKING:
     from kumoai.experimental.rfm import Graph
 
 
+@contextmanager
+def paramstyle(connection: Connection, style: str = 'qmark') -> Iterator[None]:
+    _style = connection._paramstyle
+    connection._paramstyle = style
+    yield
+    connection._paramstyle = _style
+
+
 class SnowSampler(SQLSampler):
     def __init__(
         self,
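The new `paramstyle` helper temporarily switches the Snowflake connection to qmark-style (`?`) bind parameters and restores the previous style on exit, which is what lets the samplers below pass positional parameters to `cursor.execute`. A minimal sketch of the behaviour with a stand-in connection object (no real Snowflake session is assumed):

    from collections.abc import Iterator
    from contextlib import contextmanager

    class FakeConnection:
        # Stand-in that models only the attribute the helper touches.
        _paramstyle = 'pyformat'

    @contextmanager
    def paramstyle(connection, style: str = 'qmark') -> Iterator[None]:
        _style = connection._paramstyle
        connection._paramstyle = style
        yield
        connection._paramstyle = _style

    conn = FakeConnection()
    with paramstyle(conn):
        assert conn._paramstyle == 'qmark'   # `?` placeholders accepted here
    assert conn._paramstyle == 'pyformat'    # previous style restored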
@@ -23,16 +33,9 @@ class SnowSampler(SQLSampler):
     ) -> None:
         super().__init__(graph=graph, verbose=verbose)
 
-        self._fqn_dict: dict[str, str] = {}
         for table in graph.tables.values():
             assert isinstance(table, SnowTable)
             self._connection = table._connection
-            self._fqn_dict[table.name] = table.fqn
-
-    @property
-    def fqn_dict(self) -> dict[str, str]:
-        r"""The fully-qualified quoted names for all tables in the graph."""
-        return self._fqn_dict
 
     def _get_min_max_time_dict(
         self,

@@ -40,24 +43,25 @@ class SnowSampler(SQLSampler):
     ) -> dict[str, tuple[pd.Timestamp, pd.Timestamp]]:
         selects: list[str] = []
         for table_name in table_names:
-
+            column = self.time_column_dict[table_name]
+            column_ref = self.table_column_ref_dict[table_name][column]
             select = (f"SELECT\n"
-                      f"
-                      f" MIN({
-                      f" MAX({
-                      f"FROM {self.
+                      f" ? as table_name,\n"
+                      f" MIN({column_ref}) as min_date,\n"
+                      f" MAX({column_ref}) as max_date\n"
+                      f"FROM {self.source_name_dict[table_name]}")
             selects.append(select)
         sql = "\nUNION ALL\n".join(selects)
 
         out_dict: dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {}
-        with self._connection.cursor() as cursor:
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, table_names)
             rows = cursor.fetchall()
-
-
-
-
-
+            for table_name, _min, _max in rows:
+                out_dict[table_name] = (
+                    pd.Timestamp.max if _min is None else pd.Timestamp(_min),
+                    pd.Timestamp.min if _max is None else pd.Timestamp(_max),
+                )
 
         return out_dict
 
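For orientation, `_get_min_max_time_dict` now assembles one `SELECT` per table, joined by `UNION ALL`, with each table name bound as a `?` parameter under the qmark paramstyle. An illustrative shape of the generated SQL for two hypothetical tables (all identifiers below are placeholders, not names from the package):

    # Illustrative only: the SQL produced for tables "users" and "orders".
    sql = (
        "SELECT\n"
        "  ? as table_name,\n"
        '  MIN("SIGNUP_TIME") as min_date,\n'
        '  MAX("SIGNUP_TIME") as max_date\n'
        'FROM "DB"."SCHEMA"."USERS"\n'
        "UNION ALL\n"
        "SELECT\n"
        "  ? as table_name,\n"
        '  MIN("ORDER_TIME") as min_date,\n'
        '  MAX("ORDER_TIME") as max_date\n'
        'FROM "DB"."SCHEMA"."ORDERS"'
    )
    params = ("users", "orders")  # one bind value per UNION ALL branch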
@@ -71,17 +75,27 @@ class SnowSampler(SQLSampler):
         # NOTE Snowflake does support `SEED` only as part of `SYSTEM` sampling.
         num_rows = min(num_rows, 1_000_000) # Snowflake's upper limit.
 
+        source_table = self.source_table_dict[table_name]
         filters: list[str] = []
-        primary_key = self.primary_key_dict[table_name]
-        if self.source_table_dict[table_name][primary_key].is_nullable:
-            filters.append(f" {quote_ident(primary_key)} IS NOT NULL")
-        time_column = self.time_column_dict.get(table_name)
-        if (time_column is not None and
-                self.source_table_dict[table_name][time_column].is_nullable):
-            filters.append(f" {quote_ident(time_column)} IS NOT NULL")
 
-
-
+        key = self.primary_key_dict[table_name]
+        if key not in source_table or source_table[key].is_nullable:
+            key_ref = self.table_column_ref_dict[table_name][key]
+            filters.append(f" {key_ref} IS NOT NULL")
+
+        column = self.time_column_dict.get(table_name)
+        if column is None:
+            pass
+        elif column not in source_table or source_table[column].is_nullable:
+            column_ref = self.table_column_ref_dict[table_name][column]
+            filters.append(f" {column_ref} IS NOT NULL")
+
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+        sql = (f"SELECT {', '.join(projections)}\n"
+               f"FROM {self.source_name_dict[table_name]}\n"
               f"SAMPLE ROW ({num_rows} ROWS)")
         if len(filters) > 0:
             sql += f"\nWHERE{' AND'.join(filters)}"

@@ -91,7 +105,11 @@ class SnowSampler(SQLSampler):
             cursor.execute(sql)
             table = cursor.fetch_arrow_all()
 
-        return
+        return Table._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        )
 
     def _sample_target(
         self,
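The random-sampling hunk above now builds the per-table query from the shared projection and column-reference dictionaries. Roughly, the resulting Snowflake SQL takes the following shape (identifiers are illustrative placeholders; the `WHERE` clause is only appended when the key or time column is nullable or missing from the source table):

    # Illustrative shape of the sampling SQL for a hypothetical "USERS" table.
    num_rows = 10_000
    sql = (
        f'SELECT "USER_ID", "SIGNUP_TIME", "AGE"\n'
        f'FROM "DB"."SCHEMA"."USERS"\n'
        f"SAMPLE ROW ({num_rows} ROWS)"
        f'\nWHERE "USER_ID" IS NOT NULL AND "SIGNUP_TIME" IS NOT NULL'
    )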
@@ -126,11 +144,11 @@ class SnowSampler(SQLSampler):
             query.entity_table: np.arange(len(entity_df)),
         }
         for edge_type, (min_offset, max_offset) in time_offset_dict.items():
-            table_name,
+            table_name, foreign_key, _ = edge_type
             feat_dict[table_name], batch_dict[table_name] = self._by_time(
                 table_name=table_name,
-
-
+                foreign_key=foreign_key,
+                index=entity_df[self.primary_key_dict[query.entity_table]],
                 anchor_time=time,
                 min_offset=min_offset,
                 max_offset=max_offset,
@@ -161,104 +179,193 @@ class SnowSampler(SQLSampler):
     def _by_pkey(
         self,
         table_name: str,
-
+        index: pd.Series,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
+        key = self.primary_key_dict[table_name]
+        key_ref = self.table_column_ref_dict[table_name][key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
 
-
-        source_table = self.source_table_dict[table_name]
-
-        payload = json.dumps(list(pkey))
+        payload = json.dumps(list(index))
 
         sql = ("WITH TMP as (\n"
                " SELECT\n"
-               " f.index as
-        if
-            sql += " f.value::NUMBER as
-        elif
-            sql += " f.value::FLOAT as
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][key].is_int():
+            sql += " f.value::NUMBER as __KUMO_ID__\n"
+        elif self.table_dtype_dict[table_name][key].is_float():
+            sql += " f.value::FLOAT as __KUMO_ID__\n"
         else:
-            sql += " f.value::VARCHAR as
-        sql += (f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(
+            sql += " f.value::VARCHAR as __KUMO_ID__\n"
+        sql += (f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
                 f")\n"
-                f"SELECT
-                f"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
                 f"FROM TMP\n"
-                f"JOIN {self.
-                f" ON
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__")
 
-        with self._connection.cursor() as cursor:
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, (payload, ))
             table = cursor.fetch_arrow_all()
 
         # Remove any duplicated primary keys in post-processing:
-        tmp = table.append_column('
-        gb = tmp.group_by('
-        table = table.take(gb['
+        tmp = table.append_column('__KUMO_ID__', pa.array(range(len(table))))
+        gb = tmp.group_by('__KUMO_BATCH__').aggregate([('__KUMO_ID__', 'min')])
+        table = table.take(gb['__KUMO_ID___min'])
 
-        batch = table['
-
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)
 
-        return
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
+
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict.get(table_name)
+
+        if time_column is not None and anchor_time is not None:
+            anchor_time = anchor_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+            payload = json.dumps(list(zip(index, anchor_time)))
+        else:
+            payload = json.dumps(list(zip(index)))
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+
+        sql = ("WITH TMP as (\n"
+               " SELECT\n"
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][foreign_key].is_int():
+            sql += " f.value[0]::NUMBER as __KUMO_ID__"
+        elif self.table_dtype_dict[table_name][foreign_key].is_float():
+            sql += " f.value[0]::FLOAT as __KUMO_ID__"
+        else:
+            sql += " f.value[0]::VARCHAR as __KUMO_ID__"
+        if time_column is not None and anchor_time is not None:
+            sql += (",\n"
+                    " f.value[1]::TIMESTAMP_NTZ as __KUMO_TIME__")
+        sql += (f"\n"
+                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
+                f")\n"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
+                f"FROM TMP\n"
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__\n")
+        if time_column is not None and anchor_time is not None:
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += f" AND {time_ref} <= TMP.__KUMO_TIME__\n"
+        sql += ("QUALIFY ROW_NUMBER() OVER (\n"
+                " PARTITION BY TMP.__KUMO_BATCH__\n")
+        if time_column is not None:
+            sql += f" ORDER BY {time_ref} DESC\n"
+        else:
+            sql += f" ORDER BY {key_ref}\n"
+        sql += f") <= {num_neighbors}"
+
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
+            cursor.execute(sql, (payload, ))
+            table = cursor.fetch_arrow_all()
+
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)
+
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
 
     # Helper Methods ##########################################################
 
     def _by_time(
         self,
         table_name: str,
-
-
+        foreign_key: str,
+        index: pd.Series,
         anchor_time: pd.Series,
         min_offset: pd.DateOffset | None,
         max_offset: pd.DateOffset,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict[table_name]
 
         end_time = anchor_time + max_offset
         end_time = end_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+        start_time: pd.Series | None = None
         if min_offset is not None:
             start_time = anchor_time + min_offset
             start_time = start_time.dt.strftime("%Y-%m-%d %H:%M:%S")
-            payload = json.dumps(list(zip(
+            payload = json.dumps(list(zip(index, end_time, start_time)))
         else:
-            payload = json.dumps(list(zip(
-
-
-
-
-
+            payload = json.dumps(list(zip(index, end_time)))
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        time_ref = self.table_column_ref_dict[table_name][time_column]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
         sql = ("WITH TMP as (\n"
                " SELECT\n"
-               " f.index as
-        if
-            sql += " f.value[0]::NUMBER as
-        elif
-            sql += " f.value[0]::FLOAT as
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][foreign_key].is_int():
+            sql += " f.value[0]::NUMBER as __KUMO_ID__,\n"
+        elif self.table_dtype_dict[table_name][foreign_key].is_float():
+            sql += " f.value[0]::FLOAT as __KUMO_ID__,\n"
         else:
-            sql += " f.value[0]::VARCHAR as
-        sql += " f.value[1]::TIMESTAMP_NTZ as
+            sql += " f.value[0]::VARCHAR as __KUMO_ID__,\n"
+        sql += " f.value[1]::TIMESTAMP_NTZ as __KUMO_END_TIME__"
         if min_offset is not None:
-            sql += ",\n f.value[2]::TIMESTAMP_NTZ as
+            sql += ",\n f.value[2]::TIMESTAMP_NTZ as __KUMO_START_TIME__"
         sql += (f"\n"
-                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(
+                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
                 f")\n"
-                f"SELECT
-                f"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
                 f"FROM TMP\n"
-                f"JOIN {self.
-                f" ON
-                f" AND
-        if
-            sql += f"
-
-
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__\n"
+                f" AND {time_ref} <= TMP.__KUMO_END_TIME__\n")
+        if start_time is not None:
+            sql += f"AND {time_ref} > TMP.__KUMO_START_TIME__\n"
+        # Add global time bounds to enable partition pruning:
+        sql += f"WHERE {time_ref} <= '{end_time.max()}'"
+        if start_time is not None:
+            sql += f"\nAND {time_ref} > '{start_time.min()}'"
+
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, (payload, ))
             table = cursor.fetch_arrow_all()
 
-        batch = table['
-
-
-        return self._sanitize(table_name, table), batch
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)
 
-
-
+        return Table._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
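All three sampling paths above (`_by_pkey`, `_by_fkey`, `_by_time`) share the same pattern: the Python side serializes the entity keys (plus optional time bounds) into a single JSON array, binds it as one `?` parameter, and Snowflake's `FLATTEN(INPUT => PARSE_JSON(?))` expands it back into one row per entity, where `f.index` becomes the batch position and `f.value[0]`/`f.value[1]` carry the key and time bound. A minimal sketch of the payload side only, with made-up keys and offsets (no Snowflake connection involved):

    import json
    import pandas as pd

    # Hypothetical entity keys and anchor times, mirroring how `_by_time`
    # builds its payload before binding it to the single `?` placeholder.
    index = [101, 102, 103]
    anchor_time = pd.to_datetime(pd.Series(["2024-05-01", "2024-05-02", "2024-05-03"]))
    end_time = (anchor_time + pd.DateOffset(days=7)).dt.strftime("%Y-%m-%d %H:%M:%S")

    payload = json.dumps(list(zip(index, end_time)))
    # -> '[[101, "2024-05-08 00:00:00"], [102, "2024-05-09 00:00:00"], ...]'
    # Server-side, FLATTEN(PARSE_JSON(?)) yields rows of
    # (f.index, f.value[0]::NUMBER, f.value[1]::TIMESTAMP_NTZ) to join against.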