kumoai 2.14.0.dev202512151351__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/_version.py +1 -1
- kumoai/experimental/rfm/__init__.py +33 -8
- kumoai/experimental/rfm/authenticate.py +3 -4
- kumoai/experimental/rfm/backend/local/graph_store.py +25 -25
- kumoai/experimental/rfm/backend/local/table.py +16 -21
- kumoai/experimental/rfm/backend/snow/sampler.py +22 -34
- kumoai/experimental/rfm/backend/snow/table.py +67 -33
- kumoai/experimental/rfm/backend/sqlite/__init__.py +2 -2
- kumoai/experimental/rfm/backend/sqlite/sampler.py +21 -26
- kumoai/experimental/rfm/backend/sqlite/table.py +54 -26
- kumoai/experimental/rfm/base/__init__.py +8 -0
- kumoai/experimental/rfm/base/column.py +14 -12
- kumoai/experimental/rfm/base/column_expression.py +50 -0
- kumoai/experimental/rfm/base/sql_sampler.py +31 -3
- kumoai/experimental/rfm/base/sql_table.py +229 -0
- kumoai/experimental/rfm/base/table.py +162 -143
- kumoai/experimental/rfm/graph.py +242 -95
- kumoai/experimental/rfm/infer/__init__.py +6 -4
- kumoai/experimental/rfm/infer/dtype.py +3 -3
- kumoai/experimental/rfm/infer/pkey.py +4 -2
- kumoai/experimental/rfm/infer/stype.py +35 -0
- kumoai/experimental/rfm/infer/time_col.py +1 -2
- kumoai/experimental/rfm/pquery/executor.py +27 -27
- kumoai/experimental/rfm/pquery/pandas_executor.py +29 -31
- kumoai/experimental/rfm/rfm.py +86 -80
- kumoai/experimental/rfm/sagemaker.py +4 -4
- kumoai/utils/__init__.py +1 -2
- kumoai/utils/progress_logger.py +178 -12
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/METADATA +2 -1
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/RECORD +33 -30
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/WHEEL +0 -0
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.14.0.dev202512151351.dist-info → kumoai-2.14.0.dev202512211732.dist-info}/top_level.txt +0 -0
kumoai/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = '2.14.0.dev202512151351'
+__version__ = '2.14.0.dev202512211732'
kumoai/experimental/rfm/__init__.py
CHANGED
@@ -6,11 +6,11 @@ import socket
 import threading
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, Optional, Tuple
 from urllib.parse import urlparse
 
 import kumoai
 from kumoai.client.client import KumoClient
+from kumoai.spcs import _get_active_session
 
 from .authenticate import authenticate
 from .sagemaker import (
@@ -49,7 +49,8 @@ class InferenceBackend(str, Enum):
 
 
 def _detect_backend(
-    url: str
+    url: str, #
+) -> tuple[InferenceBackend, str | None, str | None]:
     parsed = urlparse(url)
 
     # Remote SageMaker
@@ -73,12 +74,27 @@ def _detect_backend(
     return InferenceBackend.REST, None, None
 
 
+def _get_snowflake_url(snowflake_application: str) -> str:
+    snowpark_session = _get_active_session()
+    if not snowpark_session:
+        raise ValueError(
+            "Client creation failed: snowflake_application is specified "
+            "without an active snowpark session. If running outside "
+            "a snowflake notebook, specify a URL and credentials.")
+    with snowpark_session.connection.cursor() as cur:
+        cur.execute(
+            f"DESCRIBE SERVICE {snowflake_application}.user_schema.rfm_service"
+            f" ->> SELECT \"dns_name\" from $1")
+        dns_name: str = cur.fetchone()[0]
+    return f"http://{dns_name}:8000/api"
+
+
 @dataclass
 class RfmGlobalState:
     _url: str = '__url_not_provided__'
     _backend: InferenceBackend = InferenceBackend.UNKNOWN
-    _region:
-    _endpoint_name:
+    _region: str | None = None
+    _endpoint_name: str | None = None
     _thread_local = threading.local()
 
     # Thread-safe init-once.
@@ -121,10 +137,10 @@ global_state = RfmGlobalState()
 
 
 def init(
-    url:
-    api_key:
-    snowflake_credentials:
-    snowflake_application:
+    url: str | None = None,
+    api_key: str | None = None,
+    snowflake_credentials: dict[str, str] | None = None,
+    snowflake_application: str | None = None,
     log_level: str = "INFO",
 ) -> None:
     with global_state._lock:
@@ -136,6 +152,15 @@ def init(
                 "supported.")
            return
 
+        if snowflake_application:
+            if url is not None:
+                raise ValueError(
+                    "Client creation failed: both snowflake_application and "
+                    "url are specified. If running from a snowflake notebook, "
+                    "specify only snowflake_application.")
+            url = _get_snowflake_url(snowflake_application)
+            api_key = "test:DISABLED"
+
         if url is None:
             url = os.getenv("RFM_API_URL", "https://kumorfm.ai/api")
 
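With this change, `init()` accepts `snowflake_application` and resolves the inference endpoint itself: `_get_snowflake_url()` asks the active Snowpark session for the Native App service's DNS name, and the API-key check is disabled for that path. The usage sketch below is illustrative only and not part of this diff; the application name `KUMO_APP` is a placeholder.

# Hedged usage sketch based on the init() signature shown above.
import kumoai.experimental.rfm as rfm

# Inside a Snowflake notebook with an active Snowpark session, pass only the
# application name; the service URL and API key are resolved automatically.
rfm.init(snowflake_application="KUMO_APP")

# Outside Snowflake, the existing path is unchanged: pass a URL and API key.
# rfm.init(url="https://kumorfm.ai/api", api_key="...")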
kumoai/experimental/rfm/authenticate.py
CHANGED
@@ -2,12 +2,11 @@ import logging
 import os
 import platform
 from datetime import datetime
-from typing import Optional
 
 from kumoai import in_notebook
 
 
-def authenticate(api_url:
+def authenticate(api_url: str | None = None) -> None:
     """Authenticates the user and sets the Kumo API key for the SDK.
 
     This function detects the current environment and launches the appropriate
@@ -65,11 +64,11 @@ def _authenticate_local(api_url: str, redirect_port: int = 8765) -> None:
     import webbrowser
     from getpass import getpass
    from socketserver import TCPServer
-    from typing import Any
+    from typing import Any
 
     logger = logging.getLogger('kumoai')
 
-    token_status:
+    token_status: dict[str, Any] = {
        'token': None,
        'token_name': None,
        'failed': False
kumoai/experimental/rfm/backend/local/graph_store.py
CHANGED
@@ -1,5 +1,5 @@
 import warnings
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -7,7 +7,7 @@ from kumoapi.rfm.context import Subgraph
 from kumoapi.typing import Stype
 
 from kumoai.experimental.rfm.backend.local import LocalTable
-from kumoai.utils import
+from kumoai.utils import ProgressLogger
 
 try:
     import torch
@@ -23,12 +23,12 @@ class LocalGraphStore:
     def __init__(
         self,
         graph: 'Graph',
-        verbose:
+        verbose: bool | ProgressLogger = True,
     ) -> None:
 
         if not isinstance(verbose, ProgressLogger):
-            verbose =
-                "Materializing graph",
+            verbose = ProgressLogger.default(
+                msg="Materializing graph",
                 verbose=verbose,
             )
 
@@ -94,7 +94,7 @@ class LocalGraphStore:
     def sanitize(
         self,
         graph: 'Graph',
-    ) ->
+    ) -> tuple[dict[str, pd.DataFrame], dict[str, np.ndarray]]:
         r"""Sanitizes raw data according to table schema definition:
 
         In particular, it:
@@ -103,13 +103,13 @@ class LocalGraphStore:
         * drops duplicate primary keys
         * removes rows with missing primary keys or time values
         """
-        df_dict:
+        df_dict: dict[str, pd.DataFrame] = {}
         for table_name, table in graph.tables.items():
             assert isinstance(table, LocalTable)
             df = table._data
             df_dict[table_name] = df.copy(deep=False).reset_index(drop=True)
 
-        mask_dict:
+        mask_dict: dict[str, np.ndarray] = {}
         for table in graph.tables.values():
             for col in table.columns:
                 if col.stype == Stype.timestamp:
@@ -126,7 +126,7 @@ class LocalGraphStore:
                     ser = ser.dt.tz_localize(None)
                 df_dict[table.name][col.name] = ser
 
-            mask:
+            mask: np.ndarray | None = None
             if table._time_column is not None:
                 ser = df_dict[table.name][table._time_column]
                 mask = ser.notna().to_numpy()
@@ -144,8 +144,8 @@ class LocalGraphStore:
     def get_pkey_map_dict(
         self,
         graph: 'Graph',
-    ) ->
-        pkey_map_dict:
+    ) -> dict[str, pd.DataFrame]:
+        pkey_map_dict: dict[str, pd.DataFrame] = {}
 
         for table in graph.tables.values():
             if table._primary_key is None:
@@ -177,12 +177,12 @@ class LocalGraphStore:
     def get_time_data(
         self,
         graph: 'Graph',
-    ) ->
-
-
+    ) -> tuple[
+        dict[str, np.ndarray],
+        dict[str, tuple[pd.Timestamp, pd.Timestamp]],
     ]:
-        time_dict:
-        min_max_time_dict:
+        time_dict: dict[str, np.ndarray] = {}
+        min_max_time_dict: dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {}
         for table in graph.tables.values():
             if table._time_column is None:
                 continue
@@ -207,15 +207,15 @@ class LocalGraphStore:
     def get_csc(
         self,
         graph: 'Graph',
-    ) ->
-
-
+    ) -> tuple[
+        dict[tuple[str, str, str], np.ndarray],
+        dict[tuple[str, str, str], np.ndarray],
     ]:
         # A mapping from raw primary keys to node indices (0 to N-1):
-        map_dict:
+        map_dict: dict[str, pd.CategoricalDtype] = {}
         # A dictionary to manage offsets of node indices for invalid rows:
-        offset_dict:
-        for table_name in
+        offset_dict: dict[str, np.ndarray] = {}
+        for table_name in {edge.dst_table for edge in graph.edges}:
             ser = self.df_dict[table_name][graph[table_name]._primary_key]
             if table_name in self.mask_dict.keys():
                 mask = self.mask_dict[table_name]
@@ -224,8 +224,8 @@ class LocalGraphStore:
             map_dict[table_name] = pd.CategoricalDtype(ser, ordered=True)
 
         # Build CSC graph representation:
-        row_dict:
-        colptr_dict:
+        row_dict: dict[tuple[str, str, str], np.ndarray] = {}
+        colptr_dict: dict[tuple[str, str, str], np.ndarray] = {}
         for src_table, fkey, dst_table in graph.edges:
             src_df = self.df_dict[src_table]
             dst_df = self.df_dict[dst_table]
@@ -287,7 +287,7 @@ def _argsort(input: np.ndarray) -> np.ndarray:
     return torch.from_numpy(input).argsort().numpy()
 
 
-def _lexsort(inputs:
+def _lexsort(inputs: list[np.ndarray]) -> np.ndarray:
     assert len(inputs) >= 1
 
     if not WITH_TORCH:
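The `get_csc()` hunks above keep the same core technique for building the CSC representation: each destination table's primary keys become the categories of an ordered `pd.CategoricalDtype`, so casting a foreign-key column to that dtype yields contiguous node indices and `-1` for unmatched keys. A self-contained sketch of that mapping (illustrative only, not package code):

import pandas as pd

# Destination-table primary keys and a fact table's foreign-key column.
users = pd.Series([10, 20, 30, 40])
orders_fkey = pd.Series([20, 20, 40, 10, 99])

# The categories define the node index space (0..N-1), in primary-key order.
dtype = pd.CategoricalDtype(users, ordered=True)
codes = orders_fkey.astype(dtype).cat.codes.to_numpy()
print(codes)  # [1 1 3 0 -1] -- the unknown key 99 maps to -1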
kumoai/experimental/rfm/backend/local/table.py
CHANGED
@@ -1,14 +1,10 @@
 import warnings
-from typing import
+from typing import cast
 
 import pandas as pd
+from kumoapi.model_plan import MissingType
 
-from kumoai.experimental.rfm.base import
-    DataBackend,
-    SourceColumn,
-    SourceForeignKey,
-    Table,
-)
+from kumoai.experimental.rfm.base import DataBackend, SourceColumn, Table
 from kumoai.experimental.rfm.infer import infer_dtype
 
 
@@ -57,9 +53,9 @@ class LocalTable(Table):
         self,
         df: pd.DataFrame,
         name: str,
-        primary_key:
-        time_column:
-        end_time_column:
+        primary_key: MissingType | str | None = MissingType.VALUE,
+        time_column: str | None = None,
+        end_time_column: str | None = None,
     ) -> None:
 
         if df.empty:
@@ -85,17 +81,19 @@ class LocalTable(Table):
     def backend(self) -> DataBackend:
         return cast(DataBackend, DataBackend.LOCAL)
 
-    def _get_source_columns(self) ->
-        source_columns:
+    def _get_source_columns(self) -> list[SourceColumn]:
+        source_columns: list[SourceColumn] = []
         for column in self._data.columns:
             ser = self._data[column]
             try:
                 dtype = infer_dtype(ser)
             except Exception:
-                warnings.warn(f"
-                              f"
-                              f"the data type of
-                              f"
+                warnings.warn(f"Encountered unsupported data type "
+                              f"'{ser.dtype}' for column '{column}' in table "
+                              f"'{self.name}'. Please change the data type of "
+                              f"the column in the `pandas.DataFrame` to use "
+                              f"it within this table, or remove it to "
+                              f"suppress this warning.")
                 continue
 
             source_column = SourceColumn(
@@ -109,11 +107,8 @@ class LocalTable(Table):
 
         return source_columns
 
-    def
-        return []
-
-    def _get_sample_df(self) -> pd.DataFrame:
+    def _get_source_sample_df(self) -> pd.DataFrame:
         return self._data
 
-    def _get_num_rows(self) ->
+    def _get_num_rows(self) -> int | None:
         return len(self._data)
kumoai/experimental/rfm/backend/snow/sampler.py
CHANGED
@@ -1,39 +1,27 @@
 import json
-from
+from collections.abc import Iterator
+from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 from kumoapi.pquery import ValidatedPredictiveQuery
 
-from kumoai.experimental.rfm.backend.snow import
+from kumoai.experimental.rfm.backend.snow import Connection
 from kumoai.experimental.rfm.base import SQLSampler
 from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
-from kumoai.utils import
+from kumoai.utils import quote_ident
 
-if TYPE_CHECKING:
-    from kumoai.experimental.rfm import Graph
 
+@contextmanager
+def paramstyle(connection: Connection, style: str = 'qmark') -> Iterator[None]:
+    _style = connection._paramstyle
+    connection._paramstyle = style
+    yield
+    connection._paramstyle = _style
 
-class SnowSampler(SQLSampler):
-    def __init__(
-        self,
-        graph: 'Graph',
-        verbose: bool | ProgressLogger = True,
-    ) -> None:
-        super().__init__(graph=graph, verbose=verbose)
-
-        self._fqn_dict: dict[str, str] = {}
-        for table in graph.tables.values():
-            assert isinstance(table, SnowTable)
-            self._connection = table._connection
-            self._fqn_dict[table.name] = table.fqn
-
-    @property
-    def fqn_dict(self) -> dict[str, str]:
-        r"""The fully-qualified quoted names for all tables in the graph."""
-        return self._fqn_dict
 
+class SnowSampler(SQLSampler):
     def _get_min_max_time_dict(
         self,
         table_names: list[str],
@@ -42,7 +30,7 @@ class SnowSampler(SQLSampler):
         for table_name in table_names:
             time_column = self.time_column_dict[table_name]
             select = (f"SELECT\n"
-                      f"
+                      f" ? as table_name,\n"
                       f" MIN({quote_ident(time_column)}) as min_date,\n"
                      f" MAX({quote_ident(time_column)}) as max_date\n"
                      f"FROM {self.fqn_dict[table_name]}")
@@ -50,14 +38,14 @@ class SnowSampler(SQLSampler):
         sql = "\nUNION ALL\n".join(selects)
 
         out_dict: dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {}
-        with self._connection.cursor() as cursor:
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, table_names)
             rows = cursor.fetchall()
-
-
-
-
-
+            for table_name, _min, _max in rows:
+                out_dict[table_name] = (
+                    pd.Timestamp.max if _min is None else pd.Timestamp(_min),
+                    pd.Timestamp.min if _max is None else pd.Timestamp(_max),
+                )
 
         return out_dict
 
@@ -179,7 +167,7 @@ class SnowSampler(SQLSampler):
             sql += " f.value::FLOAT as ID\n"
         else:
             sql += " f.value::VARCHAR as ID\n"
-        sql += (f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(
+        sql += (f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
                 f")\n"
                 f"SELECT TMP.BATCH as __BATCH__, "
                 f"{', '.join('ENT.' + quote_ident(col) for col in columns)}\n"
@@ -187,7 +175,7 @@ class SnowSampler(SQLSampler):
                 f"JOIN {self.fqn_dict[table_name]} ENT\n"
                 f" ON ENT.{quote_ident(pkey_name)} = TMP.ID")
 
-        with self._connection.cursor() as cursor:
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, (payload, ))
             table = cursor.fetch_arrow_all()
 
@@ -240,7 +228,7 @@ class SnowSampler(SQLSampler):
         if min_offset is not None:
             sql += ",\n f.value[2]::TIMESTAMP_NTZ as START_TIME"
         sql += (f"\n"
-                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(
+                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
                 f")\n"
                 f"SELECT TMP.BATCH as __BATCH__, "
                 f"{', '.join('FACT.' + quote_ident(col) for col in columns)}\n"
@@ -251,7 +239,7 @@ class SnowSampler(SQLSampler):
         if min_offset is not None:
             sql += f"\n AND FACT.{quote_ident(time_column)} > TMP.START_TIME"
 
-        with self._connection.cursor() as cursor:
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
            cursor.execute(sql, (payload, ))
            table = cursor.fetch_arrow_all()
 
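The new `paramstyle()` helper temporarily switches the connection to `qmark` binding so the `?` placeholders in the generated Snowflake SQL are bound positionally, then restores the previous style. Below is a generic sketch of that attribute-override pattern; `Conn` is a stand-in class rather than the real connection type, and unlike the helper above the sketch restores the attribute in a `finally` block so an exception inside the `with` body cannot leak the override.

from collections.abc import Iterator
from contextlib import contextmanager


class Conn:
    """Stand-in for a DB-API connection with a mutable paramstyle."""
    _paramstyle = 'pyformat'


@contextmanager
def override_paramstyle(conn: Conn, style: str = 'qmark') -> Iterator[None]:
    previous = conn._paramstyle
    conn._paramstyle = style  # '?' placeholders bind positionally in this block
    try:
        yield
    finally:
        conn._paramstyle = previous  # always restore, even on error


conn = Conn()
with override_paramstyle(conn):
    assert conn._paramstyle == 'qmark'
assert conn._paramstyle == 'pyformat'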
kumoai/experimental/rfm/backend/snow/table.py
CHANGED
@@ -1,28 +1,35 @@
 import re
-from
+from collections.abc import Sequence
+from typing import cast
 
 import pandas as pd
+from kumoapi.model_plan import MissingType
 from kumoapi.typing import Dtype
 
 from kumoai.experimental.rfm.backend.snow import Connection
 from kumoai.experimental.rfm.base import (
+    ColumnExpressionSpec,
+    ColumnExpressionType,
     DataBackend,
     SourceColumn,
     SourceForeignKey,
-
+    SQLTable,
 )
 from kumoai.utils import quote_ident
 
 
-class SnowTable(
+class SnowTable(SQLTable):
     r"""A table backed by a :class:`sqlite` database.
 
     Args:
         connection: The connection to a :class:`snowflake` database.
-        name: The name of this table.
+        name: The logical name of this table.
+        source_name: The physical name of this table in the database. If set to
+            ``None``, ``name`` is being used.
         database: The database.
         schema: The schema.
-        columns: The selected columns of this table.
+        columns: The selected physical columns of this table.
+        column_expressions: The logical columns of this table.
         primary_key: The name of the primary key of this table, if it exists.
         time_column: The name of the time column of this table, if it exists.
         end_time_column: The name of the end time column of this table, if it
@@ -32,17 +39,20 @@ class SnowTable(Table):
         self,
         connection: Connection,
         name: str,
+        source_name: str | None = None,
         database: str | None = None,
         schema: str | None = None,
-        columns:
-
-
-
+        columns: Sequence[str] | None = None,
+        column_expressions: Sequence[ColumnExpressionType] | None = None,
+        primary_key: MissingType | str | None = MissingType.VALUE,
+        time_column: str | None = None,
+        end_time_column: str | None = None,
     ) -> None:
 
         if database is not None and schema is None:
-            raise ValueError(f"
-                             f"
+            raise ValueError(f"Unspecified 'schema' for table "
+                             f"'{source_name or name}' in database "
+                             f"'{database}'")
 
         self._connection = connection
         self._database = database
@@ -50,12 +60,32 @@ class SnowTable(Table):
 
         super().__init__(
             name=name,
+            source_name=source_name,
             columns=columns,
+            column_expressions=column_expressions,
             primary_key=primary_key,
             time_column=time_column,
             end_time_column=end_time_column,
         )
 
+    @staticmethod
+    def to_dtype(snowflake_dtype: str | None) -> Dtype | None:
+        if snowflake_dtype is None:
+            return None
+        snowflake_dtype = snowflake_dtype.strip().upper()
+        # TODO 'NUMBER(...)' is not always an integer!
+        if snowflake_dtype.startswith('NUMBER'):
+            return Dtype.int
+        elif snowflake_dtype.startswith('VARCHAR'):
+            return Dtype.string
+        elif snowflake_dtype == 'FLOAT':
+            return Dtype.float
+        elif snowflake_dtype == 'BOOLEAN':
+            return Dtype.bool
+        elif re.search('DATE|TIMESTAMP', snowflake_dtype):
+            return Dtype.date
+        return None
+
     @property
     def backend(self) -> DataBackend:
         return cast(DataBackend, DataBackend.SNOWFLAKE)
@@ -63,15 +93,15 @@ class SnowTable(Table):
     @property
     def fqn(self) -> str:
         r"""The fully-qualified quoted table name."""
-        names:
+        names: list[str] = []
         if self._database is not None:
             names.append(quote_ident(self._database))
         if self._schema is not None:
             names.append(quote_ident(self._schema))
-        return '.'.join(names + [quote_ident(self.
+        return '.'.join(names + [quote_ident(self._source_name)])
 
-    def _get_source_columns(self) ->
-        source_columns:
+    def _get_source_columns(self) -> list[SourceColumn]:
+        source_columns: list[SourceColumn] = []
         with self._connection.cursor() as cursor:
             try:
                 sql = f"DESCRIBE TABLE {self.fqn}"
@@ -82,24 +112,15 @@ class SnowTable(Table):
                    names.append(self._database)
                if self._schema is not None:
                    names.append(self._schema)
-
-                raise ValueError(f"Table '{
+                source_name = '.'.join(names + [self._source_name])
+                raise ValueError(f"Table '{source_name}' does not exist in "
+                                 f"the remote data backend") from e
 
            for row in cursor.fetchall():
                column, type, _, null, _, is_pkey, is_unique, *_ = row
 
-
-                if
-                    dtype = Dtype.int
-                elif type.startswith('VARCHAR'):
-                    dtype = Dtype.string
-                elif type == 'FLOAT':
-                    dtype = Dtype.float
-                elif type == 'BOOLEAN':
-                    dtype = Dtype.bool
-                elif re.search('DATE|TIMESTAMP', type):
-                    dtype = Dtype.date
-                else:
+                dtype = self.to_dtype(type)
+                if dtype is None:
                    continue
 
                source_column = SourceColumn(
@@ -113,8 +134,8 @@ class SnowTable(Table):
 
        return source_columns
 
-    def _get_source_foreign_keys(self) ->
-        source_fkeys:
+    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
+        source_fkeys: list[SourceForeignKey] = []
        with self._connection.cursor() as cursor:
            sql = f"SHOW IMPORTED KEYS IN TABLE {self.fqn}"
            cursor.execute(sql)
@@ -123,7 +144,7 @@ class SnowTable(Table):
            source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
        return source_fkeys
 
-    def
+    def _get_source_sample_df(self) -> pd.DataFrame:
        with self._connection.cursor() as cursor:
            columns = [quote_ident(col) for col in self._source_column_dict]
            sql = f"SELECT {', '.join(columns)} FROM {self.fqn} LIMIT 1000"
@@ -131,5 +152,18 @@ class SnowTable(Table):
            table = cursor.fetch_arrow_all()
        return table.to_pandas(types_mapper=pd.ArrowDtype)
 
-    def _get_num_rows(self) ->
+    def _get_num_rows(self) -> int | None:
        return None
+
+    def _get_expression_sample_df(
+        self,
+        specs: Sequence[ColumnExpressionSpec],
+    ) -> pd.DataFrame:
+        with self._connection.cursor() as cursor:
+            columns = [
+                f"{spec.expr} AS {quote_ident(spec.name)}" for spec in specs
+            ]
+            sql = f"SELECT {', '.join(columns)} FROM {self.fqn} LIMIT 1000"
+            cursor.execute(sql)
+            table = cursor.fetch_arrow_all()
+            return table.to_pandas(types_mapper=pd.ArrowDtype)
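The Snowflake-type-to-`Dtype` mapping that was previously inlined in `_get_source_columns()` is now the reusable `SnowTable.to_dtype()` static method. A hedged usage sketch follows: the type strings are illustrative examples, the expected results mirror the mapping shown in the diff, and the import path is assumed from the file location (note the TODO above: `NUMBER(p, s)` with a nonzero scale still maps to `Dtype.int`).

# Hedged sketch; assumes SnowTable is importable from the module path listed
# in this diff. The type strings are examples, not an exhaustive list.
from kumoapi.typing import Dtype

from kumoai.experimental.rfm.backend.snow.table import SnowTable

assert SnowTable.to_dtype('NUMBER(38,0)') == Dtype.int
assert SnowTable.to_dtype('varchar(64)') == Dtype.string   # case-insensitive
assert SnowTable.to_dtype('TIMESTAMP_NTZ(9)') == Dtype.date
assert SnowTable.to_dtype('VARIANT') is None               # skipped by callers
assert SnowTable.to_dtype(None) is None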
kumoai/experimental/rfm/backend/sqlite/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Any, TypeAlias
+from typing import Any, TypeAlias
 
 try:
     import adbc_driver_sqlite.dbapi as adbc
@@ -11,7 +11,7 @@ except ImportError:
 Connection: TypeAlias = adbc.AdbcSqliteConnection
 
 
-def connect(uri:
+def connect(uri: str | Path | None = None, **kwargs: Any) -> Connection:
     r"""Opens a connection to a :class:`sqlite` database.
 
     uri: The path to the database file to be opened.