PyPI - kumoai - Versions diffs - 2.13.0.dev202512011731__cp312-cp312-macosx_11_0_arm64.whl → 2.13.0.dev202512031731__cp312-cp312-macosx_11_0_arm64.whl - Mend

kumoai 2.13.0.dev202512011731__cp312-cp312-macosx_11_0_arm64.whl → 2.13.0.dev202512031731__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

kumoai/_version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = '2.13.0.dev202512011731'
1	+ __version__ = '2.13.0.dev202512031731'

kumoai/experimental/rfm/backend/local/table.py CHANGED Viewed

@@ -1,11 +1,9 @@
 from typing import List, Optional
 import pandas as pd
-from kumoapi.typing import Dtype, Stype
-from typing_extensions import Self
-from kumoai.experimental.rfm import utils
-from kumoai.experimental.rfm.base import Column, Table
+from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
+from kumoai.experimental.rfm.infer import infer_dtype
 class LocalTable(Table):
@@ -59,7 +57,7 @@ class LocalTable(Table):
     ) -> None:
         if df.empty:
-            raise ValueError("Data frame must have at least one row")
+            raise ValueError("Data frame is empty")
         if isinstance(df.columns, pd.MultiIndex):
             raise ValueError("Data frame must not have a multi-index")
         if not df.columns.is_unique:
@@ -77,75 +75,21 @@ class LocalTable(Table):
             end_time_column=end_time_column,
         )
-    def infer_metadata(self, verbose: bool = True) -> Self:
-        r"""Infers metadata, *i.e.*, primary keys and time columns, in the
-        table.
-        Args:
-            verbose: Whether to print verbose output.
-        """
-        logs = []
-        # Try to detect primary key if not set:
-        if not self.has_primary_key():
-            def is_candidate(column: Column) -> bool:
-                if column.stype == Stype.ID:
-                    return True
-                if all(column.stype != Stype.ID for column in self.columns):
-                    if self.name == column.name:
-                        return True
-                    if (self.name.endswith('s')
-                            and self.name[:-1] == column.name):
-                        return True
-                return False
-            candidates = [
-                column.name for column in self.columns if is_candidate(column)
-            ]
-            if primary_key := utils.detect_primary_key(
-                    table_name=self.name,
-                    df=self._data,
-                    candidates=candidates,
-            ):
-                self.primary_key = primary_key
-                logs.append(f"primary key '{primary_key}'")
-        # Try to detect time column if not set:
-        if not self.has_time_column():
-            candidates = [
-                column.name for column in self.columns
-                if column.stype == Stype.timestamp
-                and column.name != self._end_time_column
-            ]
-            if time_column := utils.detect_time_column(self._data, candidates):
-                self.time_column = time_column
-                logs.append(f"time column '{time_column}'")
-        if verbose and len(logs) > 0:
-            print(f"Detected {' and '.join(logs)} in table '{self.name}'")
-        return self
-    def _has_source_column(self, name: str) -> bool:
-        return name in self._data.columns
-    def _get_source_dtype(self, name: str) -> Dtype:
-        return utils.to_dtype(self._data[name])
-    def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
-        return utils.infer_stype(self._data[name], name, dtype)
-    def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
-        return utils.detect_primary_key(
-            table_name=self.name,
-            df=self._data,
-            candidates=candidates,
-        )
+    def _get_source_columns(self) -> List[SourceColumn]:
+        return [
+            SourceColumn(
+                name=column,
+                dtype=infer_dtype(self._data[column]),
+                is_primary_key=False,
+                is_unique_key=False,
+            ) for column in self._data.columns
+        ]
+    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
+        return []
-    def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
-        return utils.detect_time_column(df=self._data, candidates=candidates)
+    def _get_sample_df(self) -> pd.DataFrame:
+        return self._data
-    def _num_rows(self) -> Optional[int]:
+    def _get_num_rows(self) -> Optional[int]:
         return len(self._data)

kumoai/experimental/rfm/backend/snow/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+from typing import Any, TypeAlias
+try:
+    import snowflake.connector
+except ImportError:
+    raise ImportError("No module named 'snowflake'. Please install Kumo SDK "
+                      "with the 'snowflake' extension via "
+                      "`pip install kumoai[snowflake]`.")
+Connection: TypeAlias = snowflake.connector.SnowflakeConnection
+def connect(**kwargs: Any) -> Connection:
+    r"""Opens a connection to a :class:`snowflake` database.
+    If available, will return a connection to the active session.
+    kwargs: Connection arguments, following the :class:`snowflake` protocol.
+    """
+    try:
+        from snowflake.snowpark.context import get_active_session
+        return get_active_session().connection
+    except Exception:
+        pass
+    return snowflake.connector.connect(**kwargs)
+from .table import SnowTable  # noqa: E402
+__all__ = [
+    'connect',
+    'Connection',
+    'SnowTable',
+]

kumoai/experimental/rfm/backend/snow/table.py ADDED Viewed

@@ -0,0 +1,95 @@
+import re
+from typing import List, Optional, Sequence
+import pandas as pd
+from kumoapi.typing import Dtype
+from kumoai.experimental.rfm.backend.sqlite import Connection
+from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
+class SnowTable(Table):
+    r"""A table backed by a :class:`sqlite` database.
+    Args:
+        connection: The connection to a :class:`snowflake` database.
+        name: The name of this table.
+        columns: The selected columns of this table.
+        primary_key: The name of the primary key of this table, if it exists.
+        time_column: The name of the time column of this table, if it exists.
+        end_time_column: The name of the end time column of this table, if it
+            exists.
+    """
+    def __init__(
+        self,
+        connection: Connection,
+        name: str,
+        columns: Optional[Sequence[str]] = None,
+        primary_key: Optional[str] = None,
+        time_column: Optional[str] = None,
+        end_time_column: Optional[str] = None,
+    ) -> None:
+        self._connection = connection
+        super().__init__(
+            name=name,
+            columns=columns,
+            primary_key=primary_key,
+            time_column=time_column,
+            end_time_column=end_time_column,
+        )
+    def _get_source_columns(self) -> List[SourceColumn]:
+        source_columns: List[SourceColumn] = []
+        with self._connection.cursor() as cursor:
+            try:
+                cursor.execute(f"DESCRIBE TABLE {self.name}")
+            except Exception as e:
+                raise ValueError(f"Table '{self.name}' does not exist") from e
+            for row in cursor.fetchall():
+                column, type, _, _, _, is_pkey, is_unique = row[:7]
+                type = type.strip().upper()
+                if type.startswith('NUMBER'):
+                    dtype = Dtype.int
+                elif type.startswith('VARCHAR'):
+                    dtype = Dtype.string
+                elif type == 'FLOAT':
+                    dtype = Dtype.float
+                elif type == 'BOOLEAN':
+                    dtype = Dtype.bool
+                elif re.search('DATE|TIMESTAMP', type):
+                    dtype = Dtype.date
+                else:
+                    continue
+                source_column = SourceColumn(
+                    name=column,
+                    dtype=dtype,
+                    is_primary_key=is_pkey.strip().upper() == 'Y',
+                    is_unique_key=is_unique.strip().upper() == 'Y',
+                )
+                source_columns.append(source_column)
+        return source_columns
+    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
+        source_fkeys: List[SourceForeignKey] = []
+        with self._connection.cursor() as cursor:
+            cursor.execute(f"SHOW IMPORTED KEYS IN TABLE {self.name}")
+            for row in cursor.fetchall():
+                _, _, _, dst_table, pkey, _, _, _, fkey = row[:9]
+                source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
+        return source_fkeys
+    def _get_sample_df(self) -> pd.DataFrame:
+        with self._connection.cursor() as cursor:
+            columns = ', '.join(self._source_column_dict.keys())
+            cursor.execute(f"SELECT {columns} FROM {self.name} LIMIT 1000")
+            table = cursor.fetch_arrow_all()
+            return table.to_pandas()
+    def _get_num_rows(self) -> Optional[int]:
+        return None

kumoai/experimental/rfm/backend/sqlite/__init__.py CHANGED Viewed

@@ -12,12 +12,19 @@ Connection: TypeAlias = adbc.AdbcSqliteConnection
 def connect(uri: Union[str, Path, None] = None, **kwargs: Any) -> Connection:
+    r"""Opens a connection to a :class:`sqlite` database.
+    uri: The path to the database file to be opened.
+    kwargs: Additional connection arguments, following the
+        :class:`adbc_driver_sqlite` protocol.
+    """
     return adbc.connect(uri, **kwargs)
 from .table import SQLiteTable  # noqa: E402
 __all__ = [
+    'connect',
     'Connection',
     'SQLiteTable',
 ]

kumoai/experimental/rfm/backend/sqlite/table.py CHANGED Viewed

@@ -1,13 +1,12 @@
 import re
-from typing import Dict, List, Optional, Sequence
+from typing import List, Optional, Sequence
-import pyarrow as pa
-from kumoapi.typing import Dtype, Stype
-from typing_extensions import Self
+import pandas as pd
+from kumoapi.typing import Dtype
-from kumoai.experimental.rfm import utils
 from kumoai.experimental.rfm.backend.sqlite import Connection
-from kumoai.experimental.rfm.base import Table
+from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
+from kumoai.experimental.rfm.infer import infer_dtype
 class SQLiteTable(Table):
@@ -33,85 +32,63 @@ class SQLiteTable(Table):
     ) -> None:
         self._connection = connection
-        self._dtype_dict: Dict[str, Dtype] = {}
-        with connection.cursor() as cursor:
-            cursor.execute(f"PRAGMA table_info({name})")
-            for _, column, dtype, _, _, is_pkey in cursor.fetchall():
-                if bool(is_pkey):
-                    if primary_key is not None and primary_key != column:
-                        raise ValueError(f"Found duplicate primary key "
-                                         f"definition '{primary_key}' and "
-                                         f"'{column}' in table '{name}'")
-                    primary_key = column
-                # Determine colun affinity:
-                dtype = dtype.strip().upper()
-                if re.search('INT', dtype):
-                    self._dtype_dict[column] = Dtype.int
-                elif re.search('TEXT|CHAR|CLOB', dtype):
-                    self._dtype_dict[column] = Dtype.string
-                elif re.search('REAL|FLOA|DOUB', dtype):
-                    self._dtype_dict[column] = Dtype.float
-                else:  # NUMERIC affinity.
-                    self._dtype_dict[column] = Dtype.unsupported
-            if len(self._dtype_dict) > 0:
-                column_names = ', '.join(self._dtype_dict.keys())
-                cursor.execute(f"SELECT {column_names} FROM {name} "
-                               f"ORDER BY rowid LIMIT 1000")
-                self._sample = cursor.fetch_arrow_table()
-            for column_name in list(self._dtype_dict.keys()):
-                if self._dtype_dict[column_name] == Dtype.unsupported:
-                    dtype = self._sample[column_name].type
-                    if pa.types.is_integer(dtype):
-                        self._dtype_dict[column_name] = Dtype.int
-                    elif pa.types.is_floating(dtype):
-                        self._dtype_dict[column_name] = Dtype.float
-                    elif pa.types.is_decimal(dtype):
-                        self._dtype_dict[column_name] = Dtype.float
-                    elif pa.types.is_string(dtype):
-                        self._dtype_dict[column_name] = Dtype.string
-                    else:
-                        del self._dtype_dict[column_name]
-        if len(self._dtype_dict) == 0:
-            raise RuntimeError(f"Table '{name}' does not exist or does not "
-                               f"hold any column with a supported data type")
         super().__init__(
             name=name,
-            columns=columns or list(self._dtype_dict.keys()),
+            columns=columns,
             primary_key=primary_key,
             time_column=time_column,
             end_time_column=end_time_column,
         )
-    def infer_metadata(self, verbose: bool = True) -> Self:
-        r"""Infers metadata, *i.e.*, primary keys and time columns, in the
-        table.
-        Args:
-            verbose: Whether to print verbose output.
-        """
-        return self
-    def _has_source_column(self, name: str) -> bool:
-        return name in self._dtype_dict
-    def _get_source_dtype(self, name: str) -> Dtype:
-        return self._dtype_dict[name]
-    def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
-        ser = self._sample[name].to_pandas()
-        return utils.infer_stype(ser, name, dtype)
-    def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
-        return None  # TODO
-    def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
-        return None  # TODO
-    def _num_rows(self) -> Optional[int]:
+    def _get_source_columns(self) -> List[SourceColumn]:
+        source_columns: List[SourceColumn] = []
+        with self._connection.cursor() as cursor:
+            cursor.execute(f"PRAGMA table_info({self.name})")
+            rows = cursor.fetchall()
+            if len(rows) == 0:
+                raise ValueError(f"Table '{self.name}' does not exist")
+            for _, column, type, _, _, is_pkey in rows:
+                # Determine column affinity:
+                type = type.strip().upper()
+                if re.search('INT', type):
+                    dtype = Dtype.int
+                elif re.search('TEXT|CHAR|CLOB', type):
+                    dtype = Dtype.string
+                elif re.search('REAL|FLOA|DOUB', type):
+                    dtype = Dtype.float
+                else:  # NUMERIC affinity.
+                    try:
+                        dtype = infer_dtype(self._sample_df[column])
+                    except Exception as e:
+                        raise e
+                source_column = SourceColumn(
+                    name=column,
+                    dtype=dtype,
+                    is_primary_key=bool(is_pkey),
+                    is_unique_key=False,
+                )
+                source_columns.append(source_column)
+        return source_columns
+    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
+        source_fkeys: List[SourceForeignKey] = []
+        with self._connection.cursor() as cursor:
+            cursor.execute(f"PRAGMA foreign_key_list({self.name})")
+            for _, _, dst_table, fkey, pkey, _, _, _ in cursor.fetchall():
+                source_fkeys.append(SourceForeignKey(fkey, dst_table, pkey))
+        return source_fkeys
+    def _get_sample_df(self) -> pd.DataFrame:
+        with self._connection.cursor() as cursor:
+            cursor.execute(f"SELECT * FROM {self.name} "
+                           f"ORDER BY rowid LIMIT 1000")
+            table = cursor.fetch_arrow_table()
+            return table.to_pandas()
+    def _get_num_rows(self) -> Optional[int]:
         return None

kumoai/experimental/rfm/base/__init__.py CHANGED Viewed

@@ -1,7 +1,10 @@
+from .source import SourceColumn, SourceForeignKey
 from .column import Column
 from .table import Table
 __all__ = [
+    'SourceColumn',
+    'SourceForeignKey',
     'Column',
     'Table',
 ]

kumoai/experimental/rfm/base/source.py ADDED Viewed

@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+from kumoapi.typing import Dtype
+@dataclass
+class SourceColumn:
+    name: str
+    dtype: Dtype
+    is_primary_key: bool
+    is_unique_key: bool
+@dataclass
+class SourceForeignKey:
+    name: str
+    dst_table: str
+    primary_key: str

kumoai/experimental/rfm/base/table.py CHANGED Viewed

@@ -1,15 +1,25 @@
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Sequence
+from collections import defaultdict
+from functools import cached_property
+from typing import Dict, List, Optional, Sequence, Set
 import pandas as pd
 from kumoapi.source_table import UnavailableSourceTable
 from kumoapi.table import Column as ColumnDefinition
 from kumoapi.table import TableDefinition
-from kumoapi.typing import Dtype, Stype
+from kumoapi.typing import Stype
 from typing_extensions import Self
 from kumoai import in_notebook
-from kumoai.experimental.rfm.base import Column
+from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
+from kumoai.experimental.rfm.infer import (
+    contains_categorical,
+    contains_id,
+    contains_multicategorical,
+    contains_timestamp,
+    infer_primary_key,
+    infer_time_column,
+)
 class Table(ABC):
@@ -39,8 +49,30 @@ class Table(ABC):
         self._time_column: Optional[str] = None
         self._end_time_column: Optional[str] = None
+        if len(self._source_column_dict) == 0:
+            raise ValueError(f"Table '{name}' does not hold any column with "
+                             f"a supported data type")
+        primary_keys = [
+            column.name for column in self._source_column_dict.values()
+            if column.is_primary_key
+        ]
+        if len(primary_keys) == 1:  # NOTE No composite keys yet.
+            if primary_key is not None and primary_key != primary_keys[0]:
+                raise ValueError(f"Found duplicate primary key "
+                                 f"definition '{primary_key}' and "
+                                 f"'{primary_keys[0]}' in table '{name}'")
+            primary_key = primary_keys[0]
+        unique_keys = [
+            column.name for column in self._source_column_dict.values()
+            if column.is_unique_key
+        ]
+        if primary_key is None and len(unique_keys) == 1:
+            primary_key = unique_keys[0]
         self._columns: Dict[str, Column] = {}
-        for column_name in columns or []:
+        for column_name in columns or list(self._source_column_dict.keys()):
             self.add_column(column_name)
         if primary_key is not None:
@@ -104,12 +136,12 @@ class Table(ABC):
             raise KeyError(f"Column '{name}' already exists in table "
                            f"'{self.name}'")
-        if not self._has_source_column(name):
+        if name not in self._source_column_dict:
             raise KeyError(f"Column '{name}' does not exist in the underlying "
                            f"source table")
         try:
-            dtype = self._get_source_dtype(name)
+            dtype = self._source_column_dict[name].dtype
         except Exception as e:
             raise RuntimeError(f"Could not obtain data type for column "
                                f"'{name}' in table '{self.name}'. Change "
@@ -117,7 +149,17 @@ class Table(ABC):
                                f"table or remove it from the table.") from e
         try:
-            stype = self._get_source_stype(name, dtype)
+            ser = self._sample_df[name]
+            if contains_id(ser, name, dtype):
+                stype = Stype.ID
+            elif contains_timestamp(ser, name, dtype):
+                stype = Stype.timestamp
+            elif contains_multicategorical(ser, name, dtype):
+                stype = Stype.multicategorical
+            elif contains_categorical(ser, name, dtype):
+                stype = Stype.categorical
+            else:
+                stype = dtype.default_stype
         except Exception as e:
             raise RuntimeError(f"Could not obtain semantic type for column "
                                f"'{name}' in table '{self.name}'. Change "
@@ -338,8 +380,9 @@ class Table(ABC):
     def print_metadata(self) -> None:
         r"""Prints the :meth:`~metadata` of this table."""
-        num_rows = self._num_rows()
-        num_rows_repr = ' ({num_rows:,} rows)' if num_rows is not None else ''
+        num_rows_repr = ''
+        if self._num_rows is not None:
+            num_rows_repr = ' ({self._num_rows:,} rows)'
         if in_notebook():
             from IPython.display import Markdown, display
@@ -384,7 +427,11 @@ class Table(ABC):
                 column.name for column in self.columns if is_candidate(column)
             ]
-            if primary_key := self._infer_primary_key(candidates):
+            if primary_key := infer_primary_key(
+                    table_name=self.name,
+                    df=self._sample_df,
+                    candidates=candidates,
+            ):
                 self.primary_key = primary_key
                 logs.append(f"primary key '{primary_key}'")
@@ -395,7 +442,10 @@ class Table(ABC):
                 if column.stype == Stype.timestamp
                 and column.name != self._end_time_column
             ]
-            if time_column := self._infer_time_column(candidates):
+            if time_column := infer_time_column(
+                    df=self._sample_df,
+                    candidates=candidates,
+            ):
                 self.time_column = time_column
                 logs.append(f"time column '{time_column}'")
@@ -448,26 +498,43 @@ class Table(ABC):
     # Abstract method #########################################################
-    @abstractmethod
-    def _has_source_column(self, name: str) -> bool:
-        pass
+    @cached_property
+    def _source_column_dict(self) -> Dict[str, SourceColumn]:
+        return {col.name: col for col in self._get_source_columns()}
     @abstractmethod
-    def _get_source_dtype(self, name: str) -> Dtype:
+    def _get_source_columns(self) -> List[SourceColumn]:
         pass
-    @abstractmethod
-    def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
-        pass
+    @cached_property
+    def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
+        fkeys = self._get_source_foreign_keys()
+        # NOTE Drop all keys that link to different primary keys in the same
+        # table since we don't support composite keys yet:
+        table_pkeys: Dict[str, Set[str]] = defaultdict(set)
+        for fkey in fkeys:
+            table_pkeys[fkey.dst_table].add(fkey.primary_key)
+        return {
+            fkey.name: fkey
+            for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
+        }
     @abstractmethod
-    def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
+    def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
         pass
+    @cached_property
+    def _sample_df(self) -> pd.DataFrame:
+        return self._get_sample_df()
     @abstractmethod
-    def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
+    def _get_sample_df(self) -> pd.DataFrame:
         pass
-    @abstractmethod
+    @cached_property
     def _num_rows(self) -> Optional[int]:
+        return self._get_num_rows()
+    @abstractmethod
+    def _get_num_rows(self) -> Optional[int]:
         pass