kumoai 2.13.0.dev202512021731__cp310-cp310-win_amd64.whl → 2.13.0.dev202512040252__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,25 @@
  from abc import ABC, abstractmethod
- from typing import Dict, List, Optional, Sequence, Tuple
+ from collections import defaultdict
+ from functools import cached_property
+ from typing import Dict, List, Optional, Sequence, Set

  import pandas as pd
  from kumoapi.source_table import UnavailableSourceTable
  from kumoapi.table import Column as ColumnDefinition
  from kumoapi.table import TableDefinition
- from kumoapi.typing import Dtype, Stype
+ from kumoapi.typing import Stype
  from typing_extensions import Self

  from kumoai import in_notebook
- from kumoai.experimental.rfm.base import Column
+ from kumoai.experimental.rfm.base import Column, SourceColumn, SourceForeignKey
+ from kumoai.experimental.rfm.infer import (
+     contains_categorical,
+     contains_id,
+     contains_multicategorical,
+     contains_timestamp,
+     infer_primary_key,
+     infer_time_column,
+ )


  class Table(ABC):
@@ -39,8 +49,30 @@ class Table(ABC):
          self._time_column: Optional[str] = None
          self._end_time_column: Optional[str] = None

+         if len(self._source_column_dict) == 0:
+             raise ValueError(f"Table '{name}' does not hold any column with "
+                              f"a supported data type")
+
+         primary_keys = [
+             column.name for column in self._source_column_dict.values()
+             if column.is_primary_key
+         ]
+         if len(primary_keys) == 1:  # NOTE No composite keys yet.
+             if primary_key is not None and primary_key != primary_keys[0]:
+                 raise ValueError(f"Found duplicate primary key "
+                                  f"definition '{primary_key}' and "
+                                  f"'{primary_keys[0]}' in table '{name}'")
+             primary_key = primary_keys[0]
+
+         unique_keys = [
+             column.name for column in self._source_column_dict.values()
+             if column.is_unique_key
+         ]
+         if primary_key is None and len(unique_keys) == 1:
+             primary_key = unique_keys[0]
+
          self._columns: Dict[str, Column] = {}
-         for column_name in columns or []:
+         for column_name in columns or list(self._source_column_dict.keys()):
              self.add_column(column_name)

          if primary_key is not None:
@@ -104,12 +136,12 @@ class Table(ABC):
              raise KeyError(f"Column '{name}' already exists in table "
                             f"'{self.name}'")

-         if not self._has_source_column(name):
+         if name not in self._source_column_dict:
              raise KeyError(f"Column '{name}' does not exist in the underlying "
                             f"source table")

          try:
-             dtype = self._get_source_dtype(name)
+             dtype = self._source_column_dict[name].dtype
          except Exception as e:
              raise RuntimeError(f"Could not obtain data type for column "
                                 f"'{name}' in table '{self.name}'. Change "
@@ -117,7 +149,17 @@ class Table(ABC):
                                 f"table or remove it from the table.") from e

          try:
-             stype = self._get_source_stype(name, dtype)
+             ser = self._sample_df[name]
+             if contains_id(ser, name, dtype):
+                 stype = Stype.ID
+             elif contains_timestamp(ser, name, dtype):
+                 stype = Stype.timestamp
+             elif contains_multicategorical(ser, name, dtype):
+                 stype = Stype.multicategorical
+             elif contains_categorical(ser, name, dtype):
+                 stype = Stype.categorical
+             else:
+                 stype = dtype.default_stype
          except Exception as e:
              raise RuntimeError(f"Could not obtain semantic type for column "
                                 f"'{name}' in table '{self.name}'. Change "
@@ -338,8 +380,9 @@ class Table(ABC):

      def print_metadata(self) -> None:
          r"""Prints the :meth:`~metadata` of this table."""
-         num_rows = self._num_rows()
-         num_rows_repr = ' ({num_rows:,} rows)' if num_rows is not None else ''
+         num_rows_repr = ''
+         if self._num_rows is not None:
+             num_rows_repr = ' ({self._num_rows:,} rows)'

          if in_notebook():
              from IPython.display import Markdown, display
@@ -384,7 +427,11 @@ class Table(ABC):
              column.name for column in self.columns if is_candidate(column)
          ]

-         if primary_key := self._infer_primary_key(candidates):
+         if primary_key := infer_primary_key(
+                 table_name=self.name,
+                 df=self._sample_df,
+                 candidates=candidates,
+         ):
              self.primary_key = primary_key
              logs.append(f"primary key '{primary_key}'")

@@ -395,7 +442,10 @@ class Table(ABC):
              if column.stype == Stype.timestamp
              and column.name != self._end_time_column
          ]
-         if time_column := self._infer_time_column(candidates):
+         if time_column := infer_time_column(
+                 df=self._sample_df,
+                 candidates=candidates,
+         ):
              self.time_column = time_column
              logs.append(f"time column '{time_column}'")

@@ -448,30 +498,43 @@ class Table(ABC):

      # Abstract method #########################################################

-     @abstractmethod
-     def _has_source_column(self, name: str) -> bool:
-         pass
+     @cached_property
+     def _source_column_dict(self) -> Dict[str, SourceColumn]:
+         return {col.name: col for col in self._get_source_columns()}

      @abstractmethod
-     def _get_source_dtype(self, name: str) -> Dtype:
+     def _get_source_columns(self) -> List[SourceColumn]:
          pass

-     @abstractmethod
-     def _get_source_stype(self, name: str, dtype: Dtype) -> Stype:
-         pass
+     @cached_property
+     def _source_foreign_key_dict(self) -> Dict[str, SourceForeignKey]:
+         fkeys = self._get_source_foreign_keys()
+         # NOTE Drop all keys that link to different primary keys in the same
+         # table since we don't support composite keys yet:
+         table_pkeys: Dict[str, Set[str]] = defaultdict(set)
+         for fkey in fkeys:
+             table_pkeys[fkey.dst_table].add(fkey.primary_key)
+         return {
+             fkey.name: fkey
+             for fkey in fkeys if len(table_pkeys[fkey.dst_table]) == 1
+         }

      @abstractmethod
-     def _get_source_foreign_keys(self) -> List[Tuple[str, str, str]]:
+     def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
          pass

-     @abstractmethod
-     def _infer_primary_key(self, candidates: List[str]) -> Optional[str]:
-         pass
+     @cached_property
+     def _sample_df(self) -> pd.DataFrame:
+         return self._get_sample_df()

      @abstractmethod
-     def _infer_time_column(self, candidates: List[str]) -> Optional[str]:
+     def _get_sample_df(self) -> pd.DataFrame:
          pass

-     @abstractmethod
+     @cached_property
      def _num_rows(self) -> Optional[int]:
+         return self._get_num_rows()
+
+     @abstractmethod
+     def _get_num_rows(self) -> Optional[int]:
          pass
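
The refactor above collapses the per-column hooks (`_has_source_column`, `_get_source_dtype`, `_get_source_stype`) and the backend-specific `_infer_*` methods into cached bulk accessors over a smaller abstract surface: `_get_source_columns`, `_get_source_foreign_keys`, `_get_sample_df`, and `_get_num_rows`. A minimal sketch of a conforming backend, assuming an in-memory pandas source; the `DataFrameTable` name and the `SourceColumn` keyword arguments are hypothetical, inferred only from the attribute accesses visible in the diff (`name`, `dtype`, `is_primary_key`, `is_unique_key`):

    from typing import List, Optional

    import pandas as pd

    from kumoai.experimental.rfm import Table
    from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey
    from kumoai.experimental.rfm.infer import infer_dtype


    class DataFrameTable(Table):  # Hypothetical backend, for illustration.
        def __init__(self, df: pd.DataFrame, name: str) -> None:
            self._df = df
            super().__init__(name=name)

        def _get_source_columns(self) -> List[SourceColumn]:
            # One entry per column; `Table` caches the result as
            # `_source_column_dict` and derives primary/unique keys from it.
            return [
                SourceColumn(name=col, dtype=infer_dtype(self._df[col]),
                             is_primary_key=False, is_unique_key=False)
                for col in self._df.columns
            ]

        def _get_source_foreign_keys(self) -> List[SourceForeignKey]:
            return []  # A bare data frame carries no foreign key metadata.

        def _get_sample_df(self) -> pd.DataFrame:
            return self._df.head(10_000)  # Backs `_sample_df` for inference.

        def _get_num_rows(self) -> Optional[int]:
            return len(self._df)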
@@ -2,7 +2,9 @@ import contextlib
  import io
  import warnings
  from collections import defaultdict
+ from dataclasses import dataclass, field
  from importlib.util import find_spec
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union

  import pandas as pd
@@ -14,9 +16,18 @@ from typing_extensions import Self
  from kumoai import in_notebook
  from kumoai.experimental.rfm import Table
  from kumoai.graph import Edge
+ from kumoai.mixin import CastMixin

  if TYPE_CHECKING:
      import graphviz
+     from adbc_driver_sqlite.dbapi import AdbcSqliteConnection
+     from snowflake.connector import SnowflakeConnection
+
+
+ @dataclass
+ class SqliteConnectionConfig(CastMixin):
+     uri: Union[str, Path]
+     kwargs: Dict[str, Any] = field(default_factory=dict)


  class Graph:
@@ -86,14 +97,17 @@ class Graph:
              self.add_table(table)

          for table in tables:
-             for fkey, dst_table, pkey in table._get_source_foreign_keys():
-                 if self[dst_table].primary_key is None:
-                     self[dst_table].primary_key = pkey
-                 elif self[dst_table]._primary_key != pkey:
+             for fkey in table._source_foreign_key_dict.values():
+                 if fkey.name not in table or fkey.dst_table not in self:
+                     continue
+                 if self[fkey.dst_table].primary_key is None:
+                     self[fkey.dst_table].primary_key = fkey.primary_key
+                 elif self[fkey.dst_table]._primary_key != fkey.primary_key:
                      raise ValueError(f"Found duplicate primary key definition "
-                                      f"'{self[dst_table]._primary_key}' and "
-                                      f"'{pkey}' in table '{dst_table}'.")
-                 self.link(table.name, fkey, dst_table)
+                                      f"'{self[fkey.dst_table]._primary_key}' "
+                                      f"and '{fkey.primary_key}' in table "
+                                      f"'{fkey.dst_table}'.")
+                 self.link(table.name, fkey.name, fkey.dst_table)

          for edge in (edges or []):
              _edge = Edge._cast(edge)
@@ -132,13 +146,6 @@ class Graph:
          ...     "table3": df3,
          ... })

-         >>> # Inspect table metadata:
-         >>> for table in graph.tables.values():
-         ...     table.print_metadata()
-
-         >>> # Visualize graph (if graphviz is installed):
-         >>> graph.visualize()
-
          Args:
              df_dict: A dictionary of data frames, where the keys are the names
                  of the tables and the values hold table data.
@@ -169,12 +176,17 @@ class Graph:
      @classmethod
      def from_sqlite(
          cls,
-         uri: Any,
+         connection: Union[
+             'AdbcSqliteConnection',
+             SqliteConnectionConfig,
+             str,
+             Path,
+             Dict[str, Any],
+         ],
          table_names: Optional[Sequence[str]] = None,
          edges: Optional[Sequence[Edge]] = None,
          infer_metadata: bool = True,
          verbose: bool = True,
-         conn_kwargs: Optional[Dict[str, Any]] = None,
      ) -> Self:
          r"""Creates a :class:`Graph` from a :class:`sqlite` database.

 
@@ -188,16 +200,10 @@ class Graph:
188
200
  >>> # Create a graph from a SQLite database:
189
201
  >>> graph = rfm.Graph.from_sqlite('data.db')
190
202
 
191
- >>> # Inspect table metadata:
192
- >>> for table in graph.tables.values():
193
- ... table.print_metadata()
194
-
195
- >>> # Visualize graph (if graphviz is installed):
196
- >>> graph.visualize()
197
-
198
203
  Args:
199
- uri: The path to the database file or an open connection obtained
200
- from :meth:`~kumoai.experimental.rfm.backend.sqlite.connect`.
204
+ connection: An open connection from
205
+ :meth:`~kumoai.experimental.rfm.backend.sqlite.connect` or the
206
+ path to the database file.
201
207
  table_names: Set of table names to include. If ``None``, will add
202
208
  all tables present in the database.
203
209
  edges: An optional list of :class:`~kumoai.graph.Edge` objects to
@@ -206,8 +212,6 @@ class Graph:
              infer_metadata: Whether to infer metadata for all tables in the
                  graph.
              verbose: Whether to print verbose output.
-             conn_kwargs: Additional connection arguments, following the
-                 :class:`adbc_driver_sqlite` protocol.
          """
          from kumoai.experimental.rfm.backend.sqlite import (
              Connection,
@@ -215,10 +219,11 @@ class Graph:
              connect,
          )

-         if not isinstance(uri, Connection):
-             connection = connect(uri, **(conn_kwargs or {}))
-         else:
-             connection = uri
+         if not isinstance(connection, Connection):
+             connection = SqliteConnectionConfig._cast(connection)
+             assert isinstance(connection, SqliteConnectionConfig)
+             connection = connect(connection.uri, **connection.kwargs)
+         assert isinstance(connection, Connection)

          if table_names is None:
              with connection.cursor() as cursor:
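
With `conn_kwargs` folded into the `connection` parameter, `from_sqlite` now accepts an open ADBC connection, a path, or a dict that is cast to `SqliteConnectionConfig`. A short usage sketch under that reading; `data.db` is a placeholder path:

    import kumoai.experimental.rfm as rfm
    from kumoai.experimental.rfm.backend.sqlite import connect

    # Path form (str or pathlib.Path):
    graph = rfm.Graph.from_sqlite('data.db')

    # Dict form, cast to SqliteConnectionConfig; 'kwargs' is forwarded to
    # the backend's connect() call:
    graph = rfm.Graph.from_sqlite({'uri': 'data.db', 'kwargs': {}})

    # Pre-opened connection, used as-is:
    graph = rfm.Graph.from_sqlite(connect('data.db'))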
@@ -242,6 +247,140 @@ class Graph:

          return graph

+     @classmethod
+     def from_snowflake(
+         cls,
+         connection: Union['SnowflakeConnection', Dict[str, Any], None] = None,
+         table_names: Optional[Sequence[str]] = None,
+         edges: Optional[Sequence[Edge]] = None,
+         infer_metadata: bool = True,
+         verbose: bool = True,
+     ) -> Self:
+         r"""Creates a :class:`Graph` from a :class:`snowflake` database and
+         schema.
+
+         Automatically infers table metadata and links by default.
+
+         .. code-block:: python
+
+             >>> # doctest: +SKIP
+             >>> import kumoai.experimental.rfm as rfm
+
+             >>> # Create a graph directly in a Snowflake notebook:
+             >>> graph = rfm.Graph.from_snowflake()
+
+         Args:
+             connection: An open connection from
+                 :meth:`~kumoai.experimental.rfm.backend.snow.connect` or the
+                 :class:`snowflake` connector keyword arguments to open a new
+                 connection. If ``None``, will re-use an active session in case
+                 it exists, or create a new connection from credentials stored
+                 in environment variables.
+             table_names: Set of table names to include. If ``None``, will add
+                 all tables present in the database.
+             edges: An optional list of :class:`~kumoai.graph.Edge` objects to
+                 add to the graph. If not provided, edges will be automatically
+                 inferred from the data in case ``infer_metadata=True``.
+             infer_metadata: Whether to infer metadata for all tables in the
+                 graph.
+             verbose: Whether to print verbose output.
+         """
+         from kumoai.experimental.rfm.backend.snow import (
+             Connection,
+             SnowTable,
+             connect,
+         )
+
+         if not isinstance(connection, Connection):
+             connection = connect(**(connection or {}))
+         assert isinstance(connection, Connection)
+
+         if table_names is None:
+             with connection.cursor() as cursor:
+                 cursor.execute("SELECT CURRENT_DATABASE(), CURRENT_SCHEMA()")
+                 database, schema = cursor.fetchone()
+                 query = f"""
+                     SELECT TABLE_NAME
+                     FROM {database}.INFORMATION_SCHEMA.TABLES
+                     WHERE TABLE_SCHEMA = '{schema}'
+                 """
+                 cursor.execute(query)
+                 table_names = [row[0] for row in cursor.fetchall()]
+
+         tables = [SnowTable(connection, name) for name in table_names]
+
+         graph = cls(tables, edges=edges or [])
+
+         if infer_metadata:
+             graph.infer_metadata(False)
+
+         if edges is None:
+             graph.infer_links(False)
+
+         if verbose:
+             graph.print_metadata()
+             graph.print_links()
+
+         return graph
+
+     @classmethod
+     def from_snowflake_semantic_view(
+         cls,
+         semantic_view_name: str,
+         connection: Union['SnowflakeConnection', Dict[str, Any], None] = None,
+         verbose: bool = True,
+     ) -> Self:
+         import yaml
+
+         from kumoai.experimental.rfm.backend.snow import (
+             Connection,
+             SnowTable,
+             connect,
+         )
+
+         if not isinstance(connection, Connection):
+             connection = connect(**(connection or {}))
+         assert isinstance(connection, Connection)
+
+         with connection.cursor() as cursor:
+             cursor.execute(f"SELECT SYSTEM$READ_YAML_FROM_SEMANTIC_VIEW("
+                            f"'{semantic_view_name}')")
+             view = yaml.safe_load(cursor.fetchone()[0])
+
+         graph = cls(tables=[])
+
+         for table_desc in view['tables']:
+             primary_key: Optional[str] = None
+             if ('primary_key' in table_desc  # NOTE No composite keys yet.
+                     and len(table_desc['primary_key']['columns']) == 1):
+                 primary_key = table_desc['primary_key']['columns'][0]
+
+             table = SnowTable(
+                 connection,
+                 name=table_desc['base_table']['table'],
+                 database=table_desc['base_table']['database'],
+                 schema=table_desc['base_table']['schema'],
+                 primary_key=primary_key,
+             )
+             graph.add_table(table)
+
+         # TODO Find a solution to register time columns!
+
+         for relations in view['relationships']:
+             if len(relations['relationship_columns']) != 1:
+                 continue  # NOTE No composite keys yet.
+             graph.link(
+                 src_table=relations['left_table'],
+                 fkey=relations['relationship_columns'][0]['left_column'],
+                 dst_table=relations['right_table'],
+             )
+
+         if verbose:
+             graph.print_metadata()
+             graph.print_links()
+
+         return graph
+
      # Tables ##############################################################

      def has_table(self, name: str) -> bool:
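
Both new constructors mirror the `from_sqlite` flow: normalize `connection`, enumerate tables, then run the shared metadata and link inference, while `from_snowflake_semantic_view` takes primary keys and relationships from the view definition instead of inferring them. A hedged usage sketch; the credentials and the view name are placeholders:

    import kumoai.experimental.rfm as rfm

    # Inside a Snowflake notebook, the active session is re-used:
    graph = rfm.Graph.from_snowflake()

    # Elsewhere, pass snowflake.connector keyword arguments:
    graph = rfm.Graph.from_snowflake(connection={
        'account': '<account>',
        'user': '<user>',
        'password': '<password>',
    })

    # Or derive tables and links from a semantic view:
    graph = rfm.Graph.from_snowflake_semantic_view('MY_SEMANTIC_VIEW')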
@@ -612,10 +751,9 @@ class Graph:
              score += 1.0

          # Cardinality ratio:
-         src_num_rows = src_table._num_rows()
-         dst_num_rows = dst_table._num_rows()
-         if (src_num_rows is not None and dst_num_rows is not None
-                 and src_num_rows > dst_num_rows):
+         if (src_table._num_rows is not None
+                 and dst_table._num_rows is not None
+                 and src_table._num_rows > dst_table._num_rows):
              score += 1.0

          if score < 5.0:
@@ -1,13 +1,17 @@
+ from .dtype import infer_dtype
+ from .pkey import infer_primary_key
+ from .time_col import infer_time_column
  from .id import contains_id
  from .timestamp import contains_timestamp
  from .categorical import contains_categorical
  from .multicategorical import contains_multicategorical
- from .stype import infer_stype

  __all__ = [
+     'infer_dtype',
+     'infer_primary_key',
+     'infer_time_column',
      'contains_id',
      'contains_timestamp',
      'contains_categorical',
      'contains_multicategorical',
-     'infer_stype',
  ]
@@ -0,0 +1,79 @@
+ from typing import Dict
+
+ import numpy as np
+ import pandas as pd
+ import pyarrow as pa
+ from kumoapi.typing import Dtype
+
+ PANDAS_TO_DTYPE: Dict[str, Dtype] = {
+     'bool': Dtype.bool,
+     'boolean': Dtype.bool,
+     'int8': Dtype.int,
+     'int16': Dtype.int,
+     'int32': Dtype.int,
+     'int64': Dtype.int,
+     'float16': Dtype.float,
+     'float32': Dtype.float,
+     'float64': Dtype.float,
+     'object': Dtype.string,
+     'string': Dtype.string,
+     'string[python]': Dtype.string,
+     'string[pyarrow]': Dtype.string,
+     'binary': Dtype.binary,
+ }
+
+
+ def infer_dtype(ser: pd.Series) -> Dtype:
+     """Extracts the :class:`Dtype` from a :class:`pandas.Series`.
+
+     Args:
+         ser: A :class:`pandas.Series` to analyze.
+
+     Returns:
+         The data type.
+     """
+     if pd.api.types.is_datetime64_any_dtype(ser.dtype):
+         return Dtype.date
+     if pd.api.types.is_timedelta64_dtype(ser.dtype):
+         return Dtype.timedelta
+     if isinstance(ser.dtype, pd.CategoricalDtype):
+         return Dtype.string
+
+     if (pd.api.types.is_object_dtype(ser.dtype)
+             and not isinstance(ser.dtype, pd.ArrowDtype)):
+         index = ser.iloc[:1000].first_valid_index()
+         if index is not None and pd.api.types.is_list_like(ser[index]):
+             pos = ser.index.get_loc(index)
+             assert isinstance(pos, int)
+             ser = ser.iloc[pos:pos + 1000].dropna()
+             arr = pa.array(ser.tolist())
+             ser = pd.Series(arr, dtype=pd.ArrowDtype(arr.type))
+
+     if isinstance(ser.dtype, pd.ArrowDtype):
+         if pa.types.is_list(ser.dtype.pyarrow_dtype):
+             elem_dtype = ser.dtype.pyarrow_dtype.value_type
+             if pa.types.is_integer(elem_dtype):
+                 return Dtype.intlist
+             if pa.types.is_floating(elem_dtype):
+                 return Dtype.floatlist
+             if pa.types.is_decimal(elem_dtype):
+                 return Dtype.floatlist
+             if pa.types.is_string(elem_dtype):
+                 return Dtype.stringlist
+             if pa.types.is_null(elem_dtype):
+                 return Dtype.floatlist
+
+     if isinstance(ser.dtype, np.dtype):
+         dtype_str = str(ser.dtype).lower()
+     elif isinstance(ser.dtype, pd.api.extensions.ExtensionDtype):
+         dtype_str = ser.dtype.name.lower()
+         dtype_str = dtype_str.split('[')[0]  # Remove backend metadata
+     elif isinstance(ser.dtype, pa.DataType):
+         dtype_str = str(ser.dtype).lower()
+     else:
+         dtype_str = 'object'
+
+     if dtype_str not in PANDAS_TO_DTYPE:
+         raise ValueError(f"Unsupported data type '{ser.dtype}'")
+
+     return PANDAS_TO_DTYPE[dtype_str]
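
`infer_dtype` normalizes pandas, NumPy, and Arrow dtypes onto the package's `Dtype` enum, first materializing object columns that hold Python lists into Arrow lists. The expected results below follow directly from the mapping table and branches above:

    import pandas as pd

    from kumoai.experimental.rfm.infer import infer_dtype

    infer_dtype(pd.Series([1, 2, 3]))                       # Dtype.int
    infer_dtype(pd.Series(['a', 'b']))                      # Dtype.string
    infer_dtype(pd.Series(pd.to_datetime(['2024-01-01'])))  # Dtype.date
    infer_dtype(pd.Series([[1, 2], [3]]))                   # Dtype.intlist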
@@ -5,7 +5,7 @@ from typing import Optional
  import pandas as pd


- def detect_primary_key(
+ def infer_primary_key(
      table_name: str,
      df: pd.DataFrame,
      candidates: list[str],
@@ -14,7 +14,7 @@ def detect_primary_key(

      Args:
          table_name: The table name.
-         df: The pandas DataFrame to analyze
+         df: The pandas DataFrame to analyze.
          candidates: A list of potential candidates.

      Returns:
@@ -124,102 +124,3 @@ def detect_primary_key(
                    f"key for this table manually.")

      return None
-
-
- def detect_time_column(
-     df: pd.DataFrame,
-     candidates: list[str],
- ) -> Optional[str]:
-     r"""Auto-detect potential time column.
-
-     Args:
-         df: The pandas DataFrame to analyze
-         candidates: A list of potential candidates.
-
-     Returns:
-         The name of the detected time column, or ``None`` if not found.
-     """
-     candidates = [  # Exclude all candidates with `*last*` in column names:
-         col_name for col_name in candidates
-         if not re.search(r'(^|_)last(_|$)', col_name, re.IGNORECASE)
-     ]
-
-     if len(candidates) == 0:
-         return None
-
-     if len(candidates) == 1:
-         return candidates[0]
-
-     # If there exists a dedicated `create*` column, use it as time column:
-     create_candidates = [
-         candidate for candidate in candidates
-         if candidate.lower().startswith('create')
-     ]
-     if len(create_candidates) == 1:
-         return create_candidates[0]
-     if len(create_candidates) > 1:
-         candidates = create_candidates
-
-     # Find the most optimal time column. Usually, it is the one pointing to
-     # the oldest timestamps:
-     with warnings.catch_warnings():
-         warnings.filterwarnings('ignore', message='Could not infer format')
-         min_timestamp_dict = {
-             key: pd.to_datetime(df[key].iloc[:10_000], 'coerce')
-             for key in candidates
-         }
-         min_timestamp_dict = {
-             key: value.min().tz_localize(None)
-             for key, value in min_timestamp_dict.items()
-         }
-         min_timestamp_dict = {
-             key: value
-             for key, value in min_timestamp_dict.items() if not pd.isna(value)
-         }
-
-     if len(min_timestamp_dict) == 0:
-         return None
-
-     return min(min_timestamp_dict, key=min_timestamp_dict.get)  # type: ignore
-
-
- PUNCTUATION = re.compile(r"[\'\"\.,\(\)\!\?\;\:]")
- MULTISPACE = re.compile(r"\s+")
-
-
- def normalize_text(
-     ser: pd.Series,
-     max_words: Optional[int] = 50,
- ) -> pd.Series:
-     r"""Normalizes text into a list of lower-case words.
-
-     Args:
-         ser: The :class:`pandas.Series` to normalize.
-         max_words: The maximum number of words to return.
-             This will auto-shrink any large text column to avoid blowing up
-             context size.
-     """
-     if len(ser) == 0 or pd.api.types.is_list_like(ser.iloc[0]):
-         return ser
-
-     def normalize_fn(line: str) -> list[str]:
-         line = PUNCTUATION.sub(" ", line)
-         line = re.sub(r"<br\s*/?>", " ", line)  # Handle <br /> or <br>
-         line = MULTISPACE.sub(" ", line)
-         words = line.split()
-         if max_words is not None:
-             words = words[:max_words]
-         return words
-
-     ser = ser.fillna('').astype(str)
-
-     if max_words is not None:
-         # We estimate the number of words as 5 characters + 1 space in an
-         # English text on average. We need this pre-filter here, as word
-         # splitting on a giant text can be very expensive:
-         ser = ser.str[:6 * max_words]
-
-     ser = ser.str.lower()
-     ser = ser.map(normalize_fn)
-
-     return ser
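
The removed helpers are relocated rather than dropped: per the `__init__.py` hunk above, `detect_time_column` reappears as `infer_time_column` in `time_col.py`, alongside the renamed `infer_primary_key` and the new `infer_dtype` (where `normalize_text` lands is not shown in this diff). The resulting public import surface of the `infer` package:

    from kumoai.experimental.rfm.infer import (
        contains_categorical,
        contains_id,
        contains_multicategorical,
        contains_timestamp,
        infer_dtype,
        infer_primary_key,
        infer_time_column,
    )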