kumoai 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl → 2.14.0.dev202601081732__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +23 -26
- kumoai/_version.py +1 -1
- kumoai/client/client.py +6 -0
- kumoai/client/jobs.py +26 -0
- kumoai/connector/utils.py +21 -7
- kumoai/experimental/rfm/__init__.py +24 -22
- kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
- kumoai/experimental/rfm/backend/local/sampler.py +0 -3
- kumoai/experimental/rfm/backend/local/table.py +24 -25
- kumoai/experimental/rfm/backend/snow/sampler.py +184 -70
- kumoai/experimental/rfm/backend/snow/table.py +137 -64
- kumoai/experimental/rfm/backend/sqlite/sampler.py +191 -86
- kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
- kumoai/experimental/rfm/base/__init__.py +6 -9
- kumoai/experimental/rfm/base/column.py +95 -11
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +26 -17
- kumoai/experimental/rfm/base/source.py +1 -1
- kumoai/experimental/rfm/base/sql_sampler.py +182 -19
- kumoai/experimental/rfm/base/table.py +275 -109
- kumoai/experimental/rfm/graph.py +115 -107
- kumoai/experimental/rfm/infer/dtype.py +4 -1
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +530 -304
- kumoai/experimental/rfm/task_table.py +292 -0
- kumoai/kumolib.cp313-win_amd64.pyd +0 -0
- kumoai/pquery/training_table.py +16 -2
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/utils/display.py +87 -0
- kumoai/utils/progress_logger.py +13 -1
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/METADATA +1 -1
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/RECORD +36 -33
- kumoai/experimental/rfm/base/column_expression.py +0 -50
- kumoai/experimental/rfm/base/sql_table.py +0 -229
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/WHEEL +0 -0
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/backend/snow/sampler.py

@@ -1,16 +1,20 @@
 import json
 from collections.abc import Iterator
 from contextlib import contextmanager
+from typing import TYPE_CHECKING

 import numpy as np
 import pandas as pd
 import pyarrow as pa
 from kumoapi.pquery import ValidatedPredictiveQuery

-from kumoai.experimental.rfm.backend.snow import Connection
-from kumoai.experimental.rfm.base import SQLSampler
+from kumoai.experimental.rfm.backend.snow import Connection, SnowTable
+from kumoai.experimental.rfm.base import SQLSampler, Table
 from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
-from kumoai.utils import
+from kumoai.utils import ProgressLogger
+
+if TYPE_CHECKING:
+    from kumoai.experimental.rfm import Graph


 @contextmanager
@@ -22,18 +26,30 @@ def paramstyle(connection: Connection, style: str = 'qmark') -> Iterator[None]:


 class SnowSampler(SQLSampler):
+    def __init__(
+        self,
+        graph: 'Graph',
+        verbose: bool | ProgressLogger = True,
+    ) -> None:
+        super().__init__(graph=graph, verbose=verbose)
+
+        for table in graph.tables.values():
+            assert isinstance(table, SnowTable)
+        self._connection = table._connection
+
     def _get_min_max_time_dict(
         self,
         table_names: list[str],
     ) -> dict[str, tuple[pd.Timestamp, pd.Timestamp]]:
         selects: list[str] = []
         for table_name in table_names:
-
+            column = self.time_column_dict[table_name]
+            column_ref = self.table_column_ref_dict[table_name][column]
             select = (f"SELECT\n"
                       f" ? as table_name,\n"
-                      f" MIN({
-                      f" MAX({
-                      f"FROM {self.
+                      f" MIN({column_ref}) as min_date,\n"
+                      f" MAX({column_ref}) as max_date\n"
+                      f"FROM {self.source_name_dict[table_name]}")
             selects.append(select)
         sql = "\nUNION ALL\n".join(selects)

@@ -59,17 +75,27 @@ class SnowSampler(SQLSampler):
         # NOTE Snowflake does support `SEED` only as part of `SYSTEM` sampling.
         num_rows = min(num_rows, 1_000_000)  # Snowflake's upper limit.

+        source_table = self.source_table_dict[table_name]
         filters: list[str] = []
-        primary_key = self.primary_key_dict[table_name]
-        if self.source_table_dict[table_name][primary_key].is_nullable:
-            filters.append(f" {quote_ident(primary_key)} IS NOT NULL")
-        time_column = self.time_column_dict.get(table_name)
-        if (time_column is not None and
-                self.source_table_dict[table_name][time_column].is_nullable):
-            filters.append(f" {quote_ident(time_column)} IS NOT NULL")

-
-
+        key = self.primary_key_dict[table_name]
+        if key not in source_table or source_table[key].is_nullable:
+            key_ref = self.table_column_ref_dict[table_name][key]
+            filters.append(f" {key_ref} IS NOT NULL")
+
+        column = self.time_column_dict.get(table_name)
+        if column is None:
+            pass
+        elif column not in source_table or source_table[column].is_nullable:
+            column_ref = self.table_column_ref_dict[table_name][column]
+            filters.append(f" {column_ref} IS NOT NULL")
+
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+        sql = (f"SELECT {', '.join(projections)}\n"
+               f"FROM {self.source_name_dict[table_name]}\n"
                f"SAMPLE ROW ({num_rows} ROWS)")
         if len(filters) > 0:
             sql += f"\nWHERE{' AND'.join(filters)}"
@@ -79,7 +105,11 @@ class SnowSampler(SQLSampler):
             cursor.execute(sql)
             table = cursor.fetch_arrow_all()

-        return
+        return Table._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        )

     def _sample_target(
         self,
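Note on the sampling hunks above: the rewritten path assembles a plain Snowflake query around the SAMPLE ROW (<n> ROWS) clause and only appends IS NOT NULL guards for key/time columns that are nullable or missing from the source schema. A minimal standalone sketch of that string assembly follows; the table and column identifiers are hypothetical stand-ins for the sampler's source_name_dict/table_column_ref_dict lookups.

# Standalone sketch of the sampling query assembled above; table and column
# identifiers are hypothetical, and no Snowflake connection is needed to run it.
num_rows = min(5_000_000, 1_000_000)  # Snowflake caps row-based sampling at 1M rows.
projections = ['"USER_ID"', '"SIGNUP_TS"', '"COUNTRY"']  # assumed projected columns
filters = [' "USER_ID" IS NOT NULL', ' "SIGNUP_TS" IS NOT NULL']  # nullable columns only

sql = (f"SELECT {', '.join(projections)}\n"
       f"FROM MY_DB.MY_SCHEMA.USERS\n"
       f"SAMPLE ROW ({num_rows} ROWS)")
if len(filters) > 0:
    sql += f"\nWHERE{' AND'.join(filters)}"

print(sql)
# SELECT "USER_ID", "SIGNUP_TS", "COUNTRY"
# FROM MY_DB.MY_SCHEMA.USERS
# SAMPLE ROW (1000000 ROWS)
# WHERE "USER_ID" IS NOT NULL AND "SIGNUP_TS" IS NOT NULL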
@@ -114,11 +144,11 @@ class SnowSampler(SQLSampler):
             query.entity_table: np.arange(len(entity_df)),
         }
         for edge_type, (min_offset, max_offset) in time_offset_dict.items():
-            table_name,
+            table_name, foreign_key, _ = edge_type
             feat_dict[table_name], batch_dict[table_name] = self._by_time(
                 table_name=table_name,
-
-
+                foreign_key=foreign_key,
+                index=entity_df[self.primary_key_dict[query.entity_table]],
                 anchor_time=time,
                 min_offset=min_offset,
                 max_offset=max_offset,
@@ -149,104 +179,188 @@ class SnowSampler(SQLSampler):
     def _by_pkey(
         self,
         table_name: str,
-
+        index: pd.Series,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
+        key = self.primary_key_dict[table_name]
+        key_ref = self.table_column_ref_dict[table_name][key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]

-
-        source_table = self.source_table_dict[table_name]
-
-        payload = json.dumps(list(pkey))
+        payload = json.dumps(list(index))

         sql = ("WITH TMP as (\n"
                " SELECT\n"
-               " f.index as
-        if
-            sql += " f.value::NUMBER as
-        elif
-            sql += " f.value::FLOAT as
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][key].is_int():
+            sql += " f.value::NUMBER as __KUMO_ID__\n"
+        elif self.table_dtype_dict[table_name][key].is_float():
+            sql += " f.value::FLOAT as __KUMO_ID__\n"
         else:
-            sql += " f.value::VARCHAR as
+            sql += " f.value::VARCHAR as __KUMO_ID__\n"
         sql += (f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
                 f")\n"
-                f"SELECT
-                f"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
                 f"FROM TMP\n"
-                f"JOIN {self.
-                f" ON
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__")

         with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, (payload, ))
             table = cursor.fetch_arrow_all()

         # Remove any duplicated primary keys in post-processing:
-        tmp = table.append_column('
-        gb = tmp.group_by('
-        table = table.take(gb['
+        tmp = table.append_column('__KUMO_ID__', pa.array(range(len(table))))
+        gb = tmp.group_by('__KUMO_BATCH__').aggregate([('__KUMO_ID__', 'min')])
+        table = table.take(gb['__KUMO_ID___min'])
+
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)
+
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
+
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict.get(table_name)
+
+        if time_column is not None and anchor_time is not None:
+            anchor_time = anchor_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+            payload = json.dumps(list(zip(index, anchor_time)))
+        else:
+            payload = json.dumps(list(zip(index)))

-
-
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]

-
+        sql = ("WITH TMP as (\n"
+               " SELECT\n"
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][foreign_key].is_int():
+            sql += " f.value[0]::NUMBER as __KUMO_ID__"
+        elif self.table_dtype_dict[table_name][foreign_key].is_float():
+            sql += " f.value[0]::FLOAT as __KUMO_ID__"
+        else:
+            sql += " f.value[0]::VARCHAR as __KUMO_ID__"
+        if time_column is not None and anchor_time is not None:
+            sql += (",\n"
+                    " f.value[1]::TIMESTAMP_NTZ as __KUMO_TIME__")
+        sql += (f"\n"
+                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
+                f")\n"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
+                f"FROM TMP\n"
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__\n")
+        if time_column is not None and anchor_time is not None:
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += f" AND {time_ref} <= TMP.__KUMO_TIME__\n"
+        sql += ("QUALIFY ROW_NUMBER() OVER (\n"
+                " PARTITION BY TMP.__KUMO_BATCH__\n")
+        if time_column is not None:
+            sql += f" ORDER BY {time_ref} DESC\n"
+        else:
+            sql += f" ORDER BY {key_ref}\n"
+        sql += f") <= {num_neighbors}"
+
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
+            cursor.execute(sql, (payload, ))
+            table = cursor.fetch_arrow_all()
+
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)
+
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch

     # Helper Methods ##########################################################

     def _by_time(
         self,
         table_name: str,
-
-
+        foreign_key: str,
+        index: pd.Series,
         anchor_time: pd.Series,
         min_offset: pd.DateOffset | None,
         max_offset: pd.DateOffset,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict[table_name]

         end_time = anchor_time + max_offset
         end_time = end_time.dt.strftime("%Y-%m-%d %H:%M:%S")
         if min_offset is not None:
             start_time = anchor_time + min_offset
             start_time = start_time.dt.strftime("%Y-%m-%d %H:%M:%S")
-            payload = json.dumps(list(zip(
+            payload = json.dumps(list(zip(index, end_time, start_time)))
         else:
-            payload = json.dumps(list(zip(
-
-
-
-
-
+            payload = json.dumps(list(zip(index, end_time)))
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        time_ref = self.table_column_ref_dict[table_name][time_column]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
         sql = ("WITH TMP as (\n"
                " SELECT\n"
-               " f.index as
-        if
-            sql += " f.value[0]::NUMBER as
-        elif
-            sql += " f.value[0]::FLOAT as
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][foreign_key].is_int():
+            sql += " f.value[0]::NUMBER as __KUMO_ID__,\n"
+        elif self.table_dtype_dict[table_name][foreign_key].is_float():
+            sql += " f.value[0]::FLOAT as __KUMO_ID__,\n"
         else:
-            sql += " f.value[0]::VARCHAR as
-        sql += " f.value[1]::TIMESTAMP_NTZ as
+            sql += " f.value[0]::VARCHAR as __KUMO_ID__,\n"
+        sql += " f.value[1]::TIMESTAMP_NTZ as __KUMO_END_TIME__"
         if min_offset is not None:
-            sql += ",\n f.value[2]::TIMESTAMP_NTZ as
+            sql += ",\n f.value[2]::TIMESTAMP_NTZ as __KUMO_START_TIME__"
         sql += (f"\n"
                 f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
                 f")\n"
-                f"SELECT
-                f"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
                 f"FROM TMP\n"
-                f"JOIN {self.
-                f" ON
-                f" AND
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__\n"
+                f" AND {time_ref} <= TMP.__KUMO_END_TIME__")
         if min_offset is not None:
-            sql += f"\n AND
+            sql += f"\n AND {time_ref} > TMP.__KUMO_START_TIME__"

         with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, (payload, ))
             table = cursor.fetch_arrow_all()

-        batch = table['
-
-
-        return self._sanitize(table_name, table), batch
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)

-
-
+        return Table._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
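The _by_pkey, _by_fkey, and _by_time methods added above share a single lookup pattern: the client-side keys (and optional anchor timestamps) are serialized into one JSON array bound as a single '?' parameter, exploded server-side via FLATTEN(INPUT => PARSE_JSON(?)), joined back to the source table, and, for neighbor sampling, capped per entity with QUALIFY ROW_NUMBER(). A condensed standalone sketch of that query shape follows, with hypothetical table and column names in place of the source_name_dict, table_column_ref_dict, and table_dtype_dict lookups.

import json

# Hypothetical inputs: one lookup key plus one anchor timestamp per entity.
index = [17, 42, 99]
anchor_time = ['2024-01-01 00:00:00'] * len(index)
payload = json.dumps(list(zip(index, anchor_time)))  # bound as the single '?' below

num_neighbors = 8
projections = ['"USER_ID"', '"EVENT_TS"', '"AMOUNT"']  # assumed projected columns
sql = (
    'WITH TMP as (\n'
    '  SELECT\n'
    '    f.index as __KUMO_BATCH__,\n'
    '    f.value[0]::NUMBER as __KUMO_ID__,\n'
    '    f.value[1]::TIMESTAMP_NTZ as __KUMO_TIME__\n'
    '  FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n'
    ')\n'
    f"SELECT TMP.__KUMO_BATCH__ as __KUMO_BATCH__, {', '.join(projections)}\n"
    'FROM TMP\n'
    'JOIN MY_DB.MY_SCHEMA.EVENTS\n'
    '  ON "USER_ID" = TMP.__KUMO_ID__\n'
    '  AND "EVENT_TS" <= TMP.__KUMO_TIME__\n'
    'QUALIFY ROW_NUMBER() OVER (\n'
    '  PARTITION BY TMP.__KUMO_BATCH__\n'
    '  ORDER BY "EVENT_TS" DESC\n'
    f') <= {num_neighbors}'
)

print(sql)
# With a qmark-style snowflake-connector cursor, this would run roughly as
# cursor.execute(sql, (payload, )) followed by cursor.fetch_arrow_all();
# f.index then identifies which input entity each returned row belongs to.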
kumoai/experimental/rfm/backend/snow/table.py

@@ -1,4 +1,5 @@
 import re
+from collections import Counter
 from collections.abc import Sequence
 from typing import cast

@@ -8,28 +9,27 @@ from kumoapi.typing import Dtype

 from kumoai.experimental.rfm.backend.snow import Connection
 from kumoai.experimental.rfm.base import (
-
-
+    ColumnSpec,
+    ColumnSpecType,
     DataBackend,
     SourceColumn,
     SourceForeignKey,
-
+    Table,
 )
 from kumoai.utils import quote_ident


-class SnowTable(
+class SnowTable(Table):
     r"""A table backed by a :class:`sqlite` database.

     Args:
         connection: The connection to a :class:`snowflake` database.
-        name: The
-        source_name: The
-            ``
+        name: The name of this table.
+        source_name: The source name of this table. If set to ``None``,
+            ``name`` is being used.
         database: The database.
         schema: The schema.
-        columns: The selected
-        column_expressions: The logical columns of this table.
+        columns: The selected columns of this table.
         primary_key: The name of the primary key of this table, if it exists.
         time_column: The name of the time column of this table, if it exists.
         end_time_column: The name of the end time column of this table, if it
@@ -42,14 +42,21 @@ class SnowTable(SQLTable):
         source_name: str | None = None,
         database: str | None = None,
         schema: str | None = None,
-        columns: Sequence[
-        column_expressions: Sequence[ColumnExpressionType] | None = None,
+        columns: Sequence[ColumnSpecType] | None = None,
         primary_key: MissingType | str | None = MissingType.VALUE,
         time_column: str | None = None,
         end_time_column: str | None = None,
     ) -> None:

-        if database is
+        if database is None or schema is None:
+            with connection.cursor() as cursor:
+                cursor.execute("SELECT CURRENT_DATABASE(), CURRENT_SCHEMA()")
+                result = cursor.fetchone()
+                database = database or result[0]
+                assert database is not None
+                schema = schema or result[1]
+
+        if schema is None:
             raise ValueError(f"Unspecified 'schema' for table "
                              f"'{source_name or name}' in database "
                              f"'{database}'")
@@ -62,37 +69,22 @@ class SnowTable(SQLTable):
             name=name,
             source_name=source_name,
             columns=columns,
-            column_expressions=column_expressions,
             primary_key=primary_key,
             time_column=time_column,
             end_time_column=end_time_column,
         )

-    @staticmethod
-    def to_dtype(snowflake_dtype: str | None) -> Dtype | None:
-        if snowflake_dtype is None:
-            return None
-        snowflake_dtype = snowflake_dtype.strip().upper()
-        # TODO 'NUMBER(...)' is not always an integer!
-        if snowflake_dtype.startswith('NUMBER'):
-            return Dtype.int
-        elif snowflake_dtype.startswith('VARCHAR'):
-            return Dtype.string
-        elif snowflake_dtype == 'FLOAT':
-            return Dtype.float
-        elif snowflake_dtype == 'BOOLEAN':
-            return Dtype.bool
-        elif re.search('DATE|TIMESTAMP', snowflake_dtype):
-            return Dtype.date
-        return None
-
     @property
-    def
-
+    def source_name(self) -> str:
+        names: list[str] = []
+        if self._database is not None:
+            names.append(self._database)
+        if self._schema is not None:
+            names.append(self._schema)
+        return '.'.join(names + [self._source_name])

     @property
-    def
-        r"""The fully-qualified quoted table name."""
+    def _quoted_source_name(self) -> str:
         names: list[str] = []
         if self._database is not None:
             names.append(quote_ident(self._database))
@@ -100,32 +92,26 @@ class SnowTable(SQLTable):
             names.append(quote_ident(self._schema))
         return '.'.join(names + [quote_ident(self._source_name)])

+    @property
+    def backend(self) -> DataBackend:
+        return cast(DataBackend, DataBackend.SNOWFLAKE)
+
     def _get_source_columns(self) -> list[SourceColumn]:
         source_columns: list[SourceColumn] = []
         with self._connection.cursor() as cursor:
             try:
-                sql = f"DESCRIBE TABLE {self.
+                sql = f"DESCRIBE TABLE {self._quoted_source_name}"
                 cursor.execute(sql)
             except Exception as e:
-
-
-                names.append(self._database)
-                if self._schema is not None:
-                    names.append(self._schema)
-                source_name = '.'.join(names + [self._source_name])
-                raise ValueError(f"Table '{source_name}' does not exist in "
-                                 f"the remote data backend") from e
+                raise ValueError(f"Table '{self.source_name}' does not exist "
+                                 f"in the remote data backend") from e

             for row in cursor.fetchall():
-                column,
-
-                dtype = self.to_dtype(type)
-                if dtype is None:
-                    continue
+                column, dtype, _, null, _, is_pkey, is_unique, *_ = row

                 source_column = SourceColumn(
                     name=column,
-                    dtype=dtype,
+                    dtype=self._to_dtype(dtype),
                     is_primary_key=is_pkey.strip().upper() == 'Y',
                     is_unique_key=is_unique.strip().upper() == 'Y',
                     is_nullable=null.strip().upper() == 'Y',
@@ -135,35 +121,122 @@ class SnowTable(SQLTable):
         return source_columns

     def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
-
+        source_foreign_keys: list[SourceForeignKey] = []
         with self._connection.cursor() as cursor:
-            sql = f"SHOW IMPORTED KEYS IN TABLE {self.
+            sql = f"SHOW IMPORTED KEYS IN TABLE {self._quoted_source_name}"
             cursor.execute(sql)
-
-
-
-
+            rows = cursor.fetchall()
+            counts = Counter(row[13] for row in rows)
+            for row in rows:
+                if counts[row[13]] == 1:
+                    source_foreign_key = SourceForeignKey(
+                        name=row[8],
+                        dst_table=f'{row[1]}.{row[2]}.{row[3]}',
+                        primary_key=row[4],
+                    )
+                    source_foreign_keys.append(source_foreign_key)
+        return source_foreign_keys

     def _get_source_sample_df(self) -> pd.DataFrame:
         with self._connection.cursor() as cursor:
             columns = [quote_ident(col) for col in self._source_column_dict]
-            sql = f"SELECT {', '.join(columns)}
+            sql = (f"SELECT {', '.join(columns)} "
+                   f"FROM {self._quoted_source_name} "
+                   f"LIMIT {self._NUM_SAMPLE_ROWS}")
             cursor.execute(sql)
             table = cursor.fetch_arrow_all()
-
+
+        if table is None:
+            raise RuntimeError(f"Table '{self.source_name}' is empty")
+
+        return self._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict={
+                column.name: column.dtype
+                for column in self._source_column_dict.values()
+            },
+            stype_dict=None,
+        )

     def _get_num_rows(self) -> int | None:
         return None

-    def
+    def _get_expr_sample_df(
         self,
-
+        columns: Sequence[ColumnSpec],
     ) -> pd.DataFrame:
         with self._connection.cursor() as cursor:
-
-                f"{
+            projections = [
+                f"{column.expr} AS {quote_ident(column.name)}"
+                for column in columns
             ]
-            sql = f"SELECT {', '.join(
+            sql = (f"SELECT {', '.join(projections)} "
+                   f"FROM {self._quoted_source_name} "
+                   f"LIMIT {self._NUM_SAMPLE_ROWS}")
             cursor.execute(sql)
             table = cursor.fetch_arrow_all()
-
+
+        if table is None:
+            raise RuntimeError(f"Table '{self.source_name}' is empty")
+
+        return self._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict={column.name: column.dtype
+                        for column in columns},
+            stype_dict=None,
+        )
+
+    @staticmethod
+    def _to_dtype(dtype: str | None) -> Dtype | None:
+        if dtype is None:
+            return None
+        dtype = dtype.strip().upper()
+        if dtype.startswith('NUMBER'):
+            try:  # Parse `scale` from 'NUMBER(precision, scale)':
+                scale = int(dtype.split(',')[-1].split(')')[0])
+                return Dtype.int if scale == 0 else Dtype.float
+            except Exception:
+                return Dtype.float
+        if dtype == 'FLOAT':
+            return Dtype.float
+        if dtype.startswith('VARCHAR'):
+            return Dtype.string
+        if dtype.startswith('BINARY'):
+            return Dtype.binary
+        if dtype == 'BOOLEAN':
+            return Dtype.bool
+        if dtype.startswith('DATE') or dtype.startswith('TIMESTAMP'):
+            return Dtype.date
+        if dtype.startswith('TIME'):
+            return Dtype.time
+        if dtype.startswith('VECTOR'):
+            try:  # Parse element data type from 'VECTOR(dtype, dimension)':
+                dtype = dtype.split(',')[0].split('(')[1].strip()
+                if dtype == 'INT':
+                    return Dtype.intlist
+                elif dtype == 'FLOAT':
+                    return Dtype.floatlist
+            except Exception:
+                pass
+            return Dtype.unsupported
+        if dtype.startswith('ARRAY'):
+            try:  # Parse element data type from 'ARRAY(dtype)':
+                dtype = dtype.split('(', maxsplit=1)[1]
+                dtype = dtype.rsplit(')', maxsplit=1)[0]
+                _dtype = SnowTable._to_dtype(dtype)
+                if _dtype is not None and _dtype.is_int():
+                    return Dtype.intlist
+                elif _dtype is not None and _dtype.is_float():
+                    return Dtype.floatlist
+                elif _dtype is not None and _dtype.is_string():
+                    return Dtype.stringlist
+            except Exception:
+                pass
+            return Dtype.unsupported
+        # Unsupported data types:
+        if re.search(
+                'DECFLOAT|VARIANT|OBJECT|MAP|FILE|GEOGRAPHY|GEOMETRY',
+                dtype,
+        ):
+            return Dtype.unsupported
+        return None