kumoai 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl → 2.14.0.dev202601081732__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +23 -26
- kumoai/_version.py +1 -1
- kumoai/client/client.py +6 -0
- kumoai/client/jobs.py +26 -0
- kumoai/connector/utils.py +21 -7
- kumoai/experimental/rfm/__init__.py +24 -22
- kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
- kumoai/experimental/rfm/backend/local/sampler.py +0 -3
- kumoai/experimental/rfm/backend/local/table.py +24 -25
- kumoai/experimental/rfm/backend/snow/sampler.py +184 -70
- kumoai/experimental/rfm/backend/snow/table.py +137 -64
- kumoai/experimental/rfm/backend/sqlite/sampler.py +191 -86
- kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
- kumoai/experimental/rfm/base/__init__.py +6 -9
- kumoai/experimental/rfm/base/column.py +95 -11
- kumoai/experimental/rfm/base/expression.py +44 -0
- kumoai/experimental/rfm/base/sampler.py +26 -17
- kumoai/experimental/rfm/base/source.py +1 -1
- kumoai/experimental/rfm/base/sql_sampler.py +182 -19
- kumoai/experimental/rfm/base/table.py +275 -109
- kumoai/experimental/rfm/graph.py +115 -107
- kumoai/experimental/rfm/infer/dtype.py +4 -1
- kumoai/experimental/rfm/infer/multicategorical.py +1 -1
- kumoai/experimental/rfm/relbench.py +76 -0
- kumoai/experimental/rfm/rfm.py +530 -304
- kumoai/experimental/rfm/task_table.py +292 -0
- kumoai/kumolib.cp313-win_amd64.pyd +0 -0
- kumoai/pquery/training_table.py +16 -2
- kumoai/trainer/distilled_trainer.py +175 -0
- kumoai/utils/display.py +87 -0
- kumoai/utils/progress_logger.py +13 -1
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/METADATA +1 -1
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/RECORD +36 -33
- kumoai/experimental/rfm/base/column_expression.py +0 -50
- kumoai/experimental/rfm/base/sql_table.py +0 -229
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/WHEEL +0 -0
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.14.0.dev202601081732.dist-info}/top_level.txt +0 -0
--- a/kumoai/experimental/rfm/backend/sqlite/sampler.py
+++ b/kumoai/experimental/rfm/backend/sqlite/sampler.py
@@ -6,9 +6,9 @@ import numpy as np
 import pandas as pd
 import pyarrow as pa
 from kumoapi.pquery import ValidatedPredictiveQuery
-from kumoapi.typing import Stype
 
-from kumoai.experimental.rfm.
+from kumoai.experimental.rfm.backend.sqlite import SQLiteTable
+from kumoai.experimental.rfm.base import SQLSampler, Table
 from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
 from kumoai.utils import ProgressLogger, quote_ident
 
@@ -25,22 +25,32 @@ class SQLiteSampler(SQLSampler):
     ) -> None:
         super().__init__(graph=graph, verbose=verbose)
 
+        for table in graph.tables.values():
+            assert isinstance(table, SQLiteTable)
+            self._connection = table._connection
+
         if optimize:
             with self._connection.cursor() as cursor:
                 cursor.execute("PRAGMA temp_store = MEMORY")
                 cursor.execute("PRAGMA cache_size = -2000000")  # 2 GB
 
-        # Collect database indices
+        # Collect database indices for speeding sampling:
         index_dict: dict[str, set[tuple[str, ...]]] = defaultdict(set)
         for table_name, primary_key in self.primary_key_dict.items():
            source_table = self.source_table_dict[table_name]
-            if not source_table
-
+            if primary_key not in source_table:
+                continue  # No physical column.
+            if source_table[primary_key].is_unique_key:
+                continue
+            index_dict[table_name].add((primary_key, ))
         for src_table_name, foreign_key, _ in graph.edges:
             source_table = self.source_table_dict[src_table_name]
+            if foreign_key not in source_table:
+                continue  # No physical column.
             if source_table[foreign_key].is_unique_key:
-
-
+                continue
+            time_column = self.time_column_dict.get(src_table_name)
+            if time_column is not None and time_column in source_table:
                 index_dict[src_table_name].add((foreign_key, time_column))
             else:
                 index_dict[src_table_name].add((foreign_key, ))
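The constructor above shares a single SQLite connection across all tables and applies two session-level PRAGMAs before collecting index candidates. A minimal sketch of the same tuning, shown on the standard-library sqlite3 driver for illustration (the sampler itself goes through an ADBC connection wrapper):

```python
import sqlite3

con = sqlite3.connect(":memory:")
# Keep temporary B-trees (e.g., for sorting) in RAM instead of on disk:
con.execute("PRAGMA temp_store = MEMORY")
# A negative cache_size is interpreted in KiB: -2000000 is roughly 2 GB
# of page cache for this connection:
con.execute("PRAGMA cache_size = -2000000")
```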
@@ -49,22 +59,22 @@ class SQLiteSampler(SQLSampler):
         with self._connection.cursor() as cursor:
             for table_name in list(index_dict.keys()):
                 indices = index_dict[table_name]
-
+                source_name = self.source_name_dict[table_name]
+                sql = f"PRAGMA index_list({source_name})"
                 cursor.execute(sql)
                 for _, index_name, *_ in cursor.fetchall():
                     sql = f"PRAGMA index_info({quote_ident(index_name)})"
                     cursor.execute(sql)
-                    index
+                    # Fetch index information and sort by `seqno`:
+                    index_info = tuple(info[2] for info in sorted(
                         cursor.fetchall(), key=lambda x: x[0]))
-                    indices
+                    # Remove all indices in case primary index already exists:
+                    for index in list(indices):
+                        if index_info[0] == index[0]:
+                            indices.discard(index)
                 if len(indices) == 0:
                     del index_dict[table_name]
 
-        num = sum(len(indices) for indices in index_dict.values())
-        index_repr = '1 index' if num == 1 else f'{num} indices'
-        num = len(index_dict)
-        table_repr = '1 table' if num == 1 else f'{num} tables'
-
         if optimize and len(index_dict) > 0:
             if not isinstance(verbose, ProgressLogger):
                 verbose = ProgressLogger.default(
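The pruning step above relies on the shape of SQLite's index metadata: `PRAGMA index_list(t)` yields one row per index, and `PRAGMA index_info(name)` yields one `(seqno, cid, name)` row per indexed column, so sorting by the first field and keeping the third recovers the column order. A small sqlite3 sketch of the same introspection:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE t (a INTEGER, b TEXT);
    CREATE INDEX t_a_b ON t(a, b);
""")
for _, index_name, *_ in con.execute("PRAGMA index_list(t)").fetchall():
    rows = con.execute(f"PRAGMA index_info({index_name})").fetchall()
    # Each row is (seqno, cid, name); sorting by seqno recovers column order:
    print(index_name, tuple(name for _, _, name in sorted(rows)))
    # -> t_a_b ('a', 'b')
```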
@@ -79,16 +89,27 @@ class SQLiteSampler(SQLSampler):
                     name = quote_ident(name)
                     columns = ', '.join(quote_ident(v) for v in index)
                     columns += ' DESC' if len(index) > 1 else ''
+                    source_name = self.source_name_dict[table_name]
                     sql = (f"CREATE INDEX IF NOT EXISTS {name}\n"
-                           f"ON {
+                           f"ON {source_name}({columns})")
                     cursor.execute(sql)
-
-
+                    self._connection.commit()
+                    if len(index) > 1:
+                        logger.log(f"Created index on {index} in table "
+                                   f"'{table_name}'")
+                    else:
+                        logger.log(f"Created index on '{index[0]}' in "
+                                   f"table '{table_name}'")
 
         elif len(index_dict) > 0:
+            num = sum(len(indices) for indices in index_dict.values())
+            index_repr = '1 index' if num == 1 else f'{num} indices'
+            num = len(index_dict)
+            table_repr = '1 table' if num == 1 else f'{num} tables'
             warnings.warn(f"Missing {index_repr} in {table_repr} for optimal "
                           f"database querying. For improving runtime, we "
-                          f"strongly suggest to create
+                          f"strongly suggest to create indices for primary "
+                          f"and foreign keys, e.g., automatically by "
                           f"instantiating KumoRFM via "
                           f"`KumoRFM(graph, optimize=True)`.")
 
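The generated `CREATE INDEX IF NOT EXISTS` statements mirror the sampler's access paths: equality on a key column, optionally followed by a descending scan over the time column for "most recent neighbors first" lookups. A sketch with hypothetical table and column names:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE orders (user_id INTEGER, created_at TEXT)")
# Composite index matching the sampler's hot path: filter by foreign key,
# then scan the time column in descending order:
con.execute("CREATE INDEX IF NOT EXISTS idx_orders_user_id_created_at "
            "ON orders(user_id, created_at DESC)")
con.commit()
```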
@@ -98,12 +119,13 @@ class SQLiteSampler(SQLSampler):
     ) -> dict[str, tuple[pd.Timestamp, pd.Timestamp]]:
         selects: list[str] = []
         for table_name in table_names:
-
+            column = self.time_column_dict[table_name]
+            column_ref = self.table_column_ref_dict[table_name][column]
             select = (f"SELECT\n"
                       f"  ? as table_name,\n"
-                      f"  MIN({
-                      f"  MAX({
-                      f"FROM {self.
+                      f"  MIN({column_ref}) as min_date,\n"
+                      f"  MAX({column_ref}) as max_date\n"
+                      f"FROM {self.source_name_dict[table_name]}")
             selects.append(select)
         sql = "\nUNION ALL\n".join(selects)
 
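The date-range query above fans out one `MIN`/`MAX` select per table and fuses them with `UNION ALL`, so all ranges arrive in a single round trip. A self-contained sketch with hypothetical tables:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE orders (created_at TEXT);
    INSERT INTO orders VALUES ('2024-01-01'), ('2024-06-30');
    CREATE TABLE events (ts TEXT);
    INSERT INTO events VALUES ('2023-03-15'), ('2024-02-01');
""")
# One MIN/MAX sub-select per table, fused into a single statement:
sql = "\nUNION ALL\n".join(
    f"SELECT '{name}' AS table_name, MIN({col}) AS min_date, "
    f"MAX({col}) AS max_date FROM {name}"
    for name, col in [("orders", "created_at"), ("events", "ts")])
for row in con.execute(sql):
    print(row)  # ('orders', '2024-01-01', '2024-06-30'), ...
```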
@@ -126,18 +148,28 @@ class SQLiteSampler(SQLSampler):
     ) -> pd.DataFrame:
         # NOTE SQLite does not natively support passing a `random_seed`.
 
+        source_table = self.source_table_dict[table_name]
         filters: list[str] = []
-
-
-
-
-
-
-
+
+        key = self.primary_key_dict[table_name]
+        if key not in source_table or source_table[key].is_nullable:
+            key_ref = self.table_column_ref_dict[table_name][key]
+            filters.append(f" {key_ref} IS NOT NULL")
+
+        column = self.time_column_dict.get(table_name)
+        if column is None:
+            pass
+        elif column not in source_table or source_table[column].is_nullable:
+            column_ref = self.table_column_ref_dict[table_name][column]
+            filters.append(f" {column_ref} IS NOT NULL")
 
         # TODO Make this query more efficient - it does full table scan.
-
-
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+        sql = (f"SELECT {', '.join(projections)}\n"
+               f"FROM {self.source_name_dict[table_name]}")
         if len(filters) > 0:
             sql += f"\nWHERE{' AND'.join(filters)}"
         sql += f"\nORDER BY RANDOM() LIMIT {num_rows}"
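`ORDER BY RANDOM() LIMIT n` draws a uniform sample but assigns a random sort key to every row, which is the full table scan the TODO refers to. A quick illustration:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT);
    INSERT INTO users (name) VALUES ('a'), ('b'), ('c'), ('d');
""")
# Uniform 2-row sample; cost is O(n) regardless of indices, since every
# row receives a random sort key before the LIMIT is applied:
print(con.execute("SELECT id, name FROM users "
                  "WHERE name IS NOT NULL "
                  "ORDER BY RANDOM() LIMIT 2").fetchall())
```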
@@ -147,7 +179,11 @@ class SQLiteSampler(SQLSampler):
             cursor.execute(sql)
             table = cursor.fetch_arrow_table()
 
-        return
+        return Table._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        )
 
     def _sample_target(
         self,
@@ -190,84 +226,163 @@ class SQLiteSampler(SQLSampler):
     def _by_pkey(
         self,
         table_name: str,
-
+        index: pd.Series,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
-
+        source_table = self.source_table_dict[table_name]
+        key = self.primary_key_dict[table_name]
+        key_ref = self.table_column_ref_dict[table_name][key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+
+        tmp = pa.table([pa.array(index)], names=['__kumo_id__'])
+        tmp_name = f'tmp_{table_name}_{key}_{id(tmp)}'
+
+        sql = (f"SELECT "
+               f"tmp.rowid - 1 as __kumo_batch__, "
+               f"{', '.join(projections)}\n"
+               f"FROM {quote_ident(tmp_name)} tmp\n"
+               f"JOIN {self.source_name_dict[table_name]} ent\n")
+        if key in source_table and source_table[key].is_unique_key:
+            sql += (f"  ON {key_ref} = tmp.__kumo_id__")
+        else:
+            sql += (f"  ON ent.rowid = (\n"
+                    f"    SELECT rowid\n"
+                    f"    FROM {self.source_name_dict[table_name]}\n"
+                    f"    WHERE {key_ref} == tmp.__kumo_id__\n"
+                    f"    LIMIT 1\n"
+                    f")")
 
-
-
+        with self._connection.cursor() as cursor:
+            cursor.adbc_ingest(tmp_name, tmp, mode='replace')
+            cursor.execute(sql)
+            table = cursor.fetch_arrow_table()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        batch = table['__kumo_batch__'].to_numpy()
+        batch_index = table.schema.get_field_index('__kumo_batch__')
+        table = table.remove_column(batch_index)
+
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
+
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict.get(table_name)
+
+        # NOTE SQLite does not have a native datetime format. Currently, we
+        # assume timestamps are given as `TEXT` in `ISO-8601 UTC`:
+        tmp = pa.table([pa.array(index)], names=['__kumo_id__'])
+        if time_column is not None and anchor_time is not None:
+            anchor_time = anchor_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+            tmp = tmp.append_column('__kumo_time__', pa.array(anchor_time))
+        tmp_name = f'tmp_{table_name}_{foreign_key}_{id(tmp)}'
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+        sql = (f"SELECT "
+               f"tmp.rowid - 1 as __kumo_batch__, "
+               f"{', '.join(projections)}\n"
+               f"FROM {quote_ident(tmp_name)} tmp\n"
+               f"JOIN {self.source_name_dict[table_name]} fact\n"
+               f"ON fact.rowid IN (\n"
+               f"  SELECT rowid\n"
+               f"  FROM {self.source_name_dict[table_name]}\n"
+               f"  WHERE {key_ref} = tmp.__kumo_id__\n")
+        if time_column is not None and anchor_time is not None:
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += f"  AND {time_ref} <= tmp.__kumo_time__\n"
+        if time_column is not None:
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += f"  ORDER BY {time_ref} DESC\n"
+        sql += (f"  LIMIT {num_neighbors}\n"
+                f")")
 
         with self._connection.cursor() as cursor:
             cursor.adbc_ingest(tmp_name, tmp, mode='replace')
             cursor.execute(sql)
             table = cursor.fetch_arrow_table()
 
-        batch = table['
-
+        batch = table['__kumo_batch__'].to_numpy()
+        batch_index = table.schema.get_field_index('__kumo_batch__')
+        table = table.remove_column(batch_index)
 
-        return
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
 
     # Helper Methods ##########################################################
 
     def _by_time(
         self,
         table_name: str,
-
-
+        foreign_key: str,
+        index: pd.Series,
         anchor_time: pd.Series,
         min_offset: pd.DateOffset | None,
         max_offset: pd.DateOffset,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict[table_name]
+
         # NOTE SQLite does not have a native datetime format. Currently, we
         # assume timestamps are given as `TEXT` in `ISO-8601 UTC`:
-        tmp = pa.table([pa.array(
+        tmp = pa.table([pa.array(index)], names=['__kumo_id__'])
         end_time = anchor_time + max_offset
         end_time = end_time.dt.strftime("%Y-%m-%d %H:%M:%S")
-        tmp = tmp.append_column('
+        tmp = tmp.append_column('__kumo_end__', pa.array(end_time))
         if min_offset is not None:
             start_time = anchor_time + min_offset
             start_time = start_time.dt.strftime("%Y-%m-%d %H:%M:%S")
-            tmp = tmp.append_column('
-        tmp_name = f'tmp_{table_name}_{
-
-
-
-
+            tmp = tmp.append_column('__kumo_start__', pa.array(start_time))
+        tmp_name = f'tmp_{table_name}_{foreign_key}_{id(tmp)}'
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        time_ref = self.table_column_ref_dict[table_name][time_column]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+        sql = (f"SELECT "
+               f"tmp.rowid - 1 as __kumo_batch__, "
+               f"{', '.join(projections)}\n"
               f"FROM {quote_ident(tmp_name)} tmp\n"
-               f"JOIN {self.
-               f"  ON 
-               f"  AND 
+               f"JOIN {self.source_name_dict[table_name]}\n"
+               f"  ON {key_ref} = tmp.__kumo_id__\n"
+               f"  AND {time_ref} <= tmp.__kumo_end__")
         if min_offset is not None:
-            sql += f"\n  AND 
+            sql += f"\n  AND {time_ref} > tmp.__kumo_start__"
 
         with self._connection.cursor() as cursor:
             cursor.adbc_ingest(tmp_name, tmp, mode='replace')
             cursor.execute(sql)
             table = cursor.fetch_arrow_table()
 
-        batch = table['
-
+        batch = table['__kumo_batch__'].to_numpy()
+        batch_index = table.schema.get_field_index('__kumo_batch__')
+        table = table.remove_column(batch_index)
 
-        return
+        return Table._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
 
     def _sample_target_set(
         self,
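`_by_pkey`, `_by_fkey`, and `_by_time` all share one round-trip pattern: ingest the lookup keys (plus optional time bounds) as an Arrow table, join the source table against it, and select `tmp.rowid - 1` as `__kumo_batch__` so each result row can be mapped back to its position in the input batch. A minimal sketch of that round trip using the ADBC SQLite driver (`adbc_driver_sqlite`; the table and data here are hypothetical, and the package's own connection wrapper may differ):

```python
import pyarrow as pa
from adbc_driver_sqlite import dbapi

conn = dbapi.connect(":memory:")
with conn.cursor() as cur:
    cur.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT)")
    cur.execute("INSERT INTO users VALUES (1, 'a'), (2, 'b'), (3, 'c')")

    # Ingest the lookup keys as a table ...
    tmp = pa.table([pa.array([3, 1])], names=['__kumo_id__'])
    cur.adbc_ingest('tmp_lookup', tmp, mode='replace')

    # ... and join against it; `tmp.rowid - 1` recovers each key's
    # position in the original batch:
    cur.execute("SELECT tmp.rowid - 1 AS __kumo_batch__, ent.name "
                "FROM tmp_lookup tmp "
                "JOIN users ent ON ent.id = tmp.__kumo_id__")
    print(cur.fetch_arrow_table().to_pydict())
    # e.g. {'__kumo_batch__': [0, 1], 'name': ['c', 'a']}
```

Carrying the batch position through the query keeps the result order-independent: rows can come back in any order and still be scattered into the right output slots.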
@@ -300,11 +415,11 @@ class SQLiteSampler(SQLSampler):
             query.entity_table: np.arange(len(df)),
         }
         for edge_type, (_min, _max) in time_offset_dict.items():
-            table_name,
+            table_name, foreign_key, _ = edge_type
             feat_dict[table_name], batch_dict[table_name] = self._by_time(
                 table_name=table_name,
-
-
+                foreign_key=foreign_key,
+                index=df[self.primary_key_dict[query.entity_table]],
                 anchor_time=time,
                 min_offset=_min,
                 max_offset=_max,
@@ -337,13 +452,3 @@ class SQLiteSampler(SQLSampler):
         y = pd.concat(ys, axis=0, ignore_index=True)
 
         return y, mask
-
-    def _sanitize(self, table_name: str, table: pa.table) -> pd.DataFrame:
-        df = table.to_pandas(types_mapper=pd.ArrowDtype)
-
-        stype_dict = self.table_stype_dict[table_name]
-        for column_name in df.columns:
-            if stype_dict.get(column_name) == Stype.timestamp:
-                df[column_name] = pd.to_datetime(df[column_name])
-
-        return df
--- a/kumoai/experimental/rfm/backend/sqlite/table.py
+++ b/kumoai/experimental/rfm/backend/sqlite/table.py
@@ -1,5 +1,5 @@
 import re
-import 
+from collections import Counter
 from collections.abc import Sequence
 from typing import cast
 
@@ -9,27 +9,25 @@ from kumoapi.typing import Dtype
 
 from kumoai.experimental.rfm.backend.sqlite import Connection
 from kumoai.experimental.rfm.base import (
-
-
+    ColumnSpec,
+    ColumnSpecType,
     DataBackend,
     SourceColumn,
     SourceForeignKey,
-
+    Table,
 )
-from kumoai.experimental.rfm.infer import infer_dtype
 from kumoai.utils import quote_ident
 
 
-class SQLiteTable(SQLTable):
+class SQLiteTable(Table):
     r"""A table backed by a :class:`sqlite` database.
 
     Args:
         connection: The connection to a :class:`sqlite` database.
-        name: The
-        source_name: The
-            ``
-        columns: The selected
-        column_expressions: The logical columns of this table.
+        name: The name of this table.
+        source_name: The source name of this table. If set to ``None``,
+            ``name`` is being used.
+        columns: The selected columns of this table.
         primary_key: The name of the primary key of this table, if it exists.
         time_column: The name of the time column of this table, if it exists.
         end_time_column: The name of the end time column of this table, if it
@@ -40,8 +38,7 @@ class SQLiteTable(SQLTable):
         connection: Connection,
         name: str,
         source_name: str | None = None,
-        columns: Sequence[
-        column_expressions: Sequence[ColumnExpressionType] | None = None,
+        columns: Sequence[ColumnSpecType] | None = None,
         primary_key: MissingType | str | None = MissingType.VALUE,
         time_column: str | None = None,
         end_time_column: str | None = None,
@@ -53,7 +50,6 @@ class SQLiteTable(SQLTable):
             name=name,
             source_name=source_name,
             columns=columns,
-            column_expressions=column_expressions,
             primary_key=primary_key,
             time_column=time_column,
             end_time_column=end_time_column,
@@ -66,16 +62,16 @@ class SQLiteTable(SQLTable):
     def _get_source_columns(self) -> list[SourceColumn]:
         source_columns: list[SourceColumn] = []
         with self._connection.cursor() as cursor:
-            sql = f"PRAGMA table_info({self.
+            sql = f"PRAGMA table_info({self._quoted_source_name})"
             cursor.execute(sql)
             columns = cursor.fetchall()
 
             if len(columns) == 0:
-                raise ValueError(f"Table '{self.
+                raise ValueError(f"Table '{self.source_name}' does not exist "
                                  f"in the SQLite database")
 
             unique_keys: set[str] = set()
-            sql = f"PRAGMA index_list({self.
+            sql = f"PRAGMA index_list({self._quoted_source_name})"
             cursor.execute(sql)
             for _, index_name, is_unique, *_ in cursor.fetchall():
                 if bool(is_unique):
@@ -85,32 +81,19 @@ class SQLiteTable(SQLTable):
                 if len(index) == 1:
                     unique_keys.add(index[0][2])
 
-
-
-
-
-
-
-
-
-                    dtype = Dtype.float
-                else:  # NUMERIC affinity.
-                    ser = self._source_sample_df[column]
-                    try:
-                        dtype = infer_dtype(ser)
-                    except Exception:
-                        warnings.warn(f"Encountered unsupported data type "
-                                      f"'{ser.dtype}' with source data type "
-                                      f"'{type}' for column '{column}' in "
-                                      f"table '{self.name}'. If possible, "
-                                      f"change the data type of the column in "
-                                      f"your SQLite database to use it within "
-                                      f"this table.")
-                        continue
+            # Special SQLite case that creates a rowid alias for
+            # `INTEGER PRIMARY KEY` annotated columns:
+            rowid_candidates = [
+                column for _, column, dtype, _, _, is_pkey in columns
+                if bool(is_pkey) and dtype.strip().upper() == 'INTEGER'
+            ]
+            if len(rowid_candidates) == 1:
+                unique_keys.add(rowid_candidates[0])
 
+            for _, column, dtype, notnull, _, is_pkey in columns:
                 source_column = SourceColumn(
                     name=column,
-                    dtype=dtype,
+                    dtype=self._to_dtype(dtype),
                     is_primary_key=bool(is_pkey),
                     is_unique_key=column in unique_keys,
                     is_nullable=not bool(is_pkey) and not bool(notnull),
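The `rowid_candidates` special case reflects documented SQLite behavior: a column declared exactly `INTEGER PRIMARY KEY` becomes an alias for the implicit rowid, so it is unique even though `PRAGMA index_list` reports no index for it. A quick demonstration:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (id INTEGER PRIMARY KEY, v TEXT)")
con.execute("INSERT INTO t (v) VALUES ('a'), ('b')")
# No separate index backs the primary key ...
print(con.execute("PRAGMA index_list(t)").fetchall())     # []
# ... because `id` is just another name for the rowid:
print(con.execute("SELECT id, rowid FROM t").fetchall())  # [(1, 1), (2, 2)]
```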
@@ -120,35 +103,82 @@ class SQLiteTable(SQLTable):
         return source_columns
 
     def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
-
+        source_foreign_keys: list[SourceForeignKey] = []
         with self._connection.cursor() as cursor:
-            sql = f"PRAGMA foreign_key_list({self.
+            sql = f"PRAGMA foreign_key_list({self._quoted_source_name})"
             cursor.execute(sql)
-
-
-
+            rows = cursor.fetchall()
+            counts = Counter(row[0] for row in rows)
+            for idx, _, dst_table, foreign_key, primary_key, *_ in rows:
+                if counts[idx] == 1:
+                    source_foreign_key = SourceForeignKey(
+                        name=foreign_key,
+                        dst_table=dst_table,
+                        primary_key=primary_key,
+                    )
+                    source_foreign_keys.append(source_foreign_key)
+        return source_foreign_keys
 
     def _get_source_sample_df(self) -> pd.DataFrame:
         with self._connection.cursor() as cursor:
-
-
+            columns = [quote_ident(col) for col in self._source_column_dict]
+            sql = (f"SELECT {', '.join(columns)} "
+                   f"FROM {self._quoted_source_name} "
+                   f"ORDER BY rowid "
+                   f"LIMIT {self._NUM_SAMPLE_ROWS}")
             cursor.execute(sql)
             table = cursor.fetch_arrow_table()
-
+
+        if len(table) == 0:
+            raise RuntimeError(f"Table '{self.source_name}' is empty")
+
+        return self._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict={
+                column.name: column.dtype
+                for column in self._source_column_dict.values()
+            },
+            stype_dict=None,
+        )
 
     def _get_num_rows(self) -> int | None:
         return None
 
-    def 
+    def _get_expr_sample_df(
         self,
-
+        columns: Sequence[ColumnSpec],
     ) -> pd.DataFrame:
         with self._connection.cursor() as cursor:
-
-                f"{
+            projections = [
+                f"{column.expr} AS {quote_ident(column.name)}"
+                for column in columns
             ]
-            sql = (f"SELECT {', '.join(
-                   f"
+            sql = (f"SELECT {', '.join(projections)} "
+                   f"FROM {self._quoted_source_name} "
+                   f"ORDER BY rowid "
+                   f"LIMIT {self._NUM_SAMPLE_ROWS}")
             cursor.execute(sql)
             table = cursor.fetch_arrow_table()
-
+
+        if len(table) == 0:
+            raise RuntimeError(f"Table '{self.source_name}' is empty")
+
+        return self._sanitize(
+            df=table.to_pandas(types_mapper=pd.ArrowDtype),
+            dtype_dict={column.name: column.dtype
+                        for column in columns},
+            stype_dict=None,
+        )
+
+    @staticmethod
+    def _to_dtype(dtype: str | None) -> Dtype | None:
+        if dtype is None:
+            return None
+        dtype = dtype.strip().upper()
+        if re.search('INT', dtype):
+            return Dtype.int
+        if re.search('TEXT|CHAR|CLOB', dtype):
+            return Dtype.string
+        if re.search('REAL|FLOA|DOUB', dtype):
+            return Dtype.float
+        return None  # NUMERIC affinity.
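`_to_dtype` follows SQLite's declared-type affinity rules: substring matches on `INT`, then `TEXT`/`CHAR`/`CLOB`, then `REAL`/`FLOA`/`DOUB`, with everything else falling through to NUMERIC affinity (returned as `None` and resolved later from sampled data). A standalone sketch of the same mapping, with plain strings standing in for `kumoapi.typing.Dtype`:

```python
import re

def to_affinity(decl: str) -> str:
    # Mirror of `_to_dtype` above, minus the Dtype wrapper:
    decl = decl.strip().upper()
    if re.search('INT', decl):
        return 'INTEGER'
    if re.search('TEXT|CHAR|CLOB', decl):
        return 'TEXT'
    if re.search('REAL|FLOA|DOUB', decl):
        return 'REAL'
    return 'NUMERIC'

assert to_affinity('BIGINT') == 'INTEGER'
assert to_affinity('VARCHAR(255)') == 'TEXT'
assert to_affinity('DOUBLE PRECISION') == 'REAL'
assert to_affinity('DECIMAL(10, 2)') == 'NUMERIC'
```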
--- a/kumoai/experimental/rfm/base/__init__.py
+++ b/kumoai/experimental/rfm/base/__init__.py
@@ -8,12 +8,9 @@ class DataBackend(StrEnum):
 
 
 from .source import SourceColumn, SourceForeignKey  # noqa: E402
-from .
-from .
-from .column_expression import ColumnExpressionType  # noqa: E402
-from .column_expression import ColumnExpression  # noqa: E402
+from .expression import Expression, LocalExpression  # noqa: E402
+from .column import ColumnSpec, ColumnSpecType, Column  # noqa: E402
 from .table import Table  # noqa: E402
-from .sql_table import SQLTable  # noqa: E402
 from .sampler import SamplerOutput, Sampler  # noqa: E402
 from .sql_sampler import SQLSampler  # noqa: E402
 
@@ -21,12 +18,12 @@ __all__ = [
     'DataBackend',
     'SourceColumn',
     'SourceForeignKey',
+    'Expression',
+    'LocalExpression',
+    'ColumnSpec',
+    'ColumnSpecType',
     'Column',
-    'ColumnExpressionSpec',
-    'ColumnExpressionType',
-    'ColumnExpression',
     'Table',
-    'SQLTable',
     'SamplerOutput',
     'Sampler',
     'SQLSampler',