kumoai 2.14.0.dev202512271732__cp310-cp310-macosx_11_0_arm64.whl → 2.14.0rc2__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -59,6 +59,17 @@ class Sampler(ABC):
             self._edge_types.append(edge_type)
             self._edge_types.append(Subgraph.rev_edge_type(edge_type))
 
+        # Source Table -> [(Foreign Key, Destination Table)]
+        self._foreign_key_dict: dict[str, list[tuple[str, str]]] = {}
+        # Destination Table -> [(Source Table, Foreign Key)]
+        self._rev_foreign_key_dict: dict[str, list[tuple[str, str]]] = {}
+        for table in graph.tables.values():
+            self._foreign_key_dict[table.name] = []
+            self._rev_foreign_key_dict[table.name] = []
+        for src_table, fkey, dst_table in graph.edges:
+            self._foreign_key_dict[src_table].append((fkey, dst_table))
+            self._rev_foreign_key_dict[dst_table].append((src_table, fkey))
+
         self._primary_key_dict: dict[str, str] = {
             table.name: table._primary_key
             for table in graph.tables.values()
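To make the new bookkeeping concrete: for a hypothetical two-table schema (the `users`/`orders` names and the edge triple below are illustrative only, not part of the package), the two dictionaries resolve as in this minimal runnable sketch:

# Hypothetical schema: orders.user_id references users.id.
edges = [('orders', 'user_id', 'users')]

foreign_key_dict = {'users': [], 'orders': []}
rev_foreign_key_dict = {'users': [], 'orders': []}
for src_table, fkey, dst_table in edges:
    # Forward: foreign keys held by `src_table` and the tables they point to:
    foreign_key_dict[src_table].append((fkey, dst_table))
    # Reverse: tables referencing `dst_table`, and via which foreign key:
    rev_foreign_key_dict[dst_table].append((src_table, fkey))

assert foreign_key_dict == {'users': [], 'orders': [('user_id', 'users')]}
assert rev_foreign_key_dict == {'users': [('orders', 'user_id')], 'orders': []}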
@@ -98,6 +109,16 @@ class Sampler(ABC):
         r"""All available edge types in the graph."""
         return self._edge_types
 
+    @property
+    def foreign_key_dict(self) -> dict[str, list[tuple[str, str]]]:
+        r"""The foreign keys for all tables in the graph."""
+        return self._foreign_key_dict
+
+    @property
+    def rev_foreign_key_dict(self) -> dict[str, list[tuple[str, str]]]:
+        r"""The foreign key back references for all tables in the graph."""
+        return self._rev_foreign_key_dict
+
     @property
     def primary_key_dict(self) -> dict[str, str]:
         r"""All available primary keys in the graph."""
@@ -1,8 +1,10 @@
 from abc import abstractmethod
+from collections import defaultdict
 from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
+from kumoapi.rfm.context import Subgraph
 from kumoapi.typing import Dtype
 
 from kumoai.experimental.rfm.base import (
@@ -11,11 +13,14 @@ from kumoai.experimental.rfm.base import (
     SamplerOutput,
     SourceColumn,
 )
+from kumoai.experimental.rfm.base.mapper import Mapper
 from kumoai.utils import ProgressLogger, quote_ident
 
 if TYPE_CHECKING:
     from kumoai.experimental.rfm import Graph
 
+EdgeType = tuple[str, str, str]
+
 
 class SQLSampler(Sampler):
     def __init__(
@@ -100,10 +105,28 @@ class SQLSampler(Sampler):
         num_neighbors: list[int],
     ) -> SamplerOutput:
 
+        # Make sure to always include primary key, foreign key and time columns
+        # during data fetching since these are needed for graph traversal:
+        sample_columns_dict: dict[str, set[str]] = {}
+        for table, columns in columns_dict.items():
+            sample_columns = columns | {
+                foreign_key
+                for foreign_key, _ in self.foreign_key_dict[table]
+            }
+            if primary_key := self.primary_key_dict.get(table):
+                sample_columns |= {primary_key}
+            sample_columns_dict[table] = sample_columns
+        if not isinstance(anchor_time, pd.Series):
+            sample_columns_dict[entity_table_name] |= {
+                self.time_column_dict[entity_table_name]
+            }
+
+        # Sample Entity Table #################################################
+
         df, batch = self._by_pkey(
             table_name=entity_table_name,
-            pkey=entity_pkey,
-            columns=columns_dict[entity_table_name],
+            index=entity_pkey,
+            columns=sample_columns_dict[entity_table_name],
         )
         if len(batch) != len(entity_pkey):
             mask = np.ones(len(entity_pkey), dtype=bool)
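The column-set augmentation above is plain set arithmetic over the graph metadata. A self-contained sketch under assumed metadata (all table, key, and column names below are hypothetical):

columns_dict = {'orders': {'amount'}, 'users': {'age'}}
foreign_key_dict = {'orders': [('user_id', 'users')], 'users': []}
primary_key_dict = {'orders': 'order_id', 'users': 'id'}

sample_columns_dict = {}
for table, columns in columns_dict.items():
    # Requested feature columns plus every foreign key of the table:
    sample_columns = columns | {fk for fk, _ in foreign_key_dict[table]}
    # Plus the primary key, if the table has one:
    if primary_key := primary_key_dict.get(table):
        sample_columns |= {primary_key}
    sample_columns_dict[table] = sample_columns

assert sample_columns_dict['orders'] == {'amount', 'user_id', 'order_id'}
assert sample_columns_dict['users'] == {'age', 'id'}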
@@ -112,23 +135,211 @@ class SQLSampler(Sampler):
                 f"{entity_pkey.iloc[mask].tolist()} do not exist "
                 f"in the '{entity_table_name}' table")
 
+        # Make sure that entities are returned in expected order:
         perm = batch.argsort()
         batch = batch[perm]
         df = df.iloc[perm].reset_index(drop=True)
 
+        # Fill 'entity' anchor times with actual values:
         if not isinstance(anchor_time, pd.Series):
             time_column = self.time_column_dict[entity_table_name]
             anchor_time = df[time_column]
+        assert isinstance(anchor_time, pd.Series)
+
+        # Recursive Neighbor Sampling #########################################
+
+        mapper_dict: dict[str, Mapper] = defaultdict(
+            lambda: Mapper(num_examples=len(entity_pkey)))
+        mapper_dict[entity_table_name].add(
+            pkey=df[self.primary_key_dict[entity_table_name]],
+            batch=batch,
+        )
+
+        dfs_dict: dict[str, list[pd.DataFrame]] = defaultdict(list)
+        dfs_dict[entity_table_name].append(df)
+        batches_dict: dict[str, list[np.ndarray]] = defaultdict(list)
+        batches_dict[entity_table_name].append(batch)
+        num_sampled_nodes_dict: dict[str, list[int]] = defaultdict(
+            lambda: [0] * (len(num_neighbors) + 1))
+        num_sampled_nodes_dict[entity_table_name][0] = len(entity_pkey)
+
+        rows_dict: dict[EdgeType, list[np.ndarray]] = defaultdict(list)
+        cols_dict: dict[EdgeType, list[np.ndarray]] = defaultdict(list)
+        num_sampled_edges_dict: dict[EdgeType, list[int]] = defaultdict(
+            lambda: [0] * len(num_neighbors))
+
+        # The start index of data frame slices of the previous hop:
+        offset_dict: dict[str, int] = defaultdict(int)
+
+        for hop, neighbors in enumerate(num_neighbors):
+            if neighbors == 0:
+                break  # Abort early.
+
+            for table in list(num_sampled_nodes_dict.keys()):
+                # Only sample from tables that have been visited in the
+                # previous hop:
+                if num_sampled_nodes_dict[table][hop] == 0:
+                    continue
+
+                # Collect the slices of data sampled in the previous hop
+                # (but maintain only required key information):
+                cols = [fkey for fkey, _ in self.foreign_key_dict[table]]
+                if table in self.primary_key_dict:
+                    cols.append(self.primary_key_dict[table])
+                dfs = [df[cols] for df in dfs_dict[table][offset_dict[table]:]]
+                df = pd.concat(
+                    dfs,
+                    axis=0,
+                    ignore_index=True,
+                ) if len(dfs) > 1 else dfs[0]
+                batches = batches_dict[table][offset_dict[table]:]
+                batch = (np.concatenate(batches)
+                         if len(batches) > 1 else batches[0])
+                offset_dict[table] = len(batches_dict[table])  # Increase.
+
+                pkey: pd.Series | None = None
+                index: np.ndarray | None = None
+                if table in self.primary_key_dict:
+                    pkey = df[self.primary_key_dict[table]]
+                    index = mapper_dict[table].get(pkey, batch)
+
+                # Iterate over foreign keys in the current table:
+                for fkey, dst_table in self.foreign_key_dict[table]:
+                    row = mapper_dict[dst_table].get(df[fkey], batch)
+                    mask = row == -1
+                    if mask.any():
+                        key_df = pd.DataFrame({
+                            'fkey': df[fkey],
+                            'batch': batch,
+                        }).iloc[mask]
+                        # Only maintain unique keys per example:
+                        unique_key_df = key_df.drop_duplicates()
+                        # Fully de-duplicate keys across examples:
+                        code, fkey_index = pd.factorize(unique_key_df['fkey'])
+
+                        _df, _batch = self._by_pkey(
+                            table_name=dst_table,
+                            index=fkey_index,
+                            columns=sample_columns_dict[dst_table],
+                        )  # Ensure result is sorted according to input order:
+                        _df = _df.iloc[_batch.argsort()]
+
+                        # Compute valid entries (without dangling foreign keys)
+                        # in `unique_key_df`:
+                        _mask = np.full(len(fkey_index), fill_value=False)
+                        _mask[_batch] = True
+                        _mask = _mask[code]
+
+                        # Reconstruct unique (key, batch) pairs:
+                        code, _ = pd.factorize(unique_key_df['fkey'][_mask])
+                        _df = _df.iloc[code].reset_index(drop=True)
+                        _batch = unique_key_df['batch'].to_numpy()[_mask]
+
+                        # Register node IDs:
+                        mapper_dict[dst_table].add(_df[fkey], _batch)
+                        row[mask] = mapper_dict[dst_table].get(
+                            pkey=key_df['fkey'],
+                            batch=key_df['batch'].to_numpy(),
+                        )  # NOTE `row` may still hold `-1` for dangling fkeys.
+
+                        dfs_dict[dst_table].append(_df)
+                        batches_dict[dst_table].append(_batch)
+                        num_sampled_nodes_dict[dst_table][hop + 1] += (  #
+                            len(_batch))
+
+                    mask = row != -1
+
+                    col = index
+                    if col is None:
+                        num_nodes = num_sampled_nodes_dict[table][hop]
+                        col = np.arange(num_nodes, num_nodes + len(row))
+
+                    row = row[mask]
+                    col = col[mask]
+
+                    edge_type = (table, fkey, dst_table)
+                    edge_type = Subgraph.rev_edge_type(edge_type)
+                    rows_dict[edge_type].append(row)
+                    cols_dict[edge_type].append(col)
+                    num_sampled_edges_dict[edge_type][hop] = len(col)
+
+                # Iterate over foreign keys that reference the current table:
+                for src_table, fkey in self.rev_foreign_key_dict[table]:
+                    assert pkey is not None and index is not None
+                    _df, _batch = self._by_fkey(
+                        table_name=src_table,
+                        foreign_key=fkey,
+                        index=pkey,
+                        num_neighbors=neighbors,
+                        anchor_time=anchor_time.iloc[batch],
+                        columns=sample_columns_dict[src_table],
+                    )
+
+                    edge_type = (src_table, fkey, table)
+                    cols_dict[edge_type].append(index[_batch])
+                    num_sampled_edges_dict[edge_type][hop] = len(_batch)
+
+                    _batch = batch[_batch]
+                    num_nodes = sum(num_sampled_nodes_dict[src_table])
+                    if src_table in self.primary_key_dict:
+                        _pkey = _df[self.primary_key_dict[src_table]]
+                        mapper_dict[src_table].add(_pkey, _batch)
+                        row = mapper_dict[src_table].get(_pkey, _batch)
+
+                        # Only preserve unknown rows:
+                        mask = row >= num_nodes  # type: ignore
+                        mask[pd.Index(row).duplicated()] = False
+                        _df = _df.iloc[mask]
+                        _batch = _batch[mask]
+                    else:
+                        row = np.arange(num_nodes, num_nodes + len(_batch))
+
+                    rows_dict[edge_type].append(row)
+                    num_sampled_nodes_dict[src_table][hop + 1] += len(_batch)
+
+                    dfs_dict[src_table].append(_df)
+                    batches_dict[src_table].append(_batch)
+
+        # Post-Processing #####################################################
+
+        df_dict = {
+            table:
+            pd.concat(dfs, axis=0, ignore_index=True)
+            if len(dfs) > 1 else dfs[0]
+            for table, dfs in dfs_dict.items()
+        }
+        df_dict = {  # Post-filter column set:
+            table: df[list(columns_dict[table])]
+            for table, df in df_dict.items()
+        }
+        batch_dict = {
+            table: np.concatenate(batches) if len(batches) > 1 else batches[0]
+            for table, batches in batches_dict.items()
+        }
+        row_dict = {
+            edge_type: np.concatenate(rows)
+            for edge_type, rows in rows_dict.items()
+        }
+        col_dict = {
+            edge_type: np.concatenate(cols)
+            for edge_type, cols in cols_dict.items()
+        }
+
+        if len(num_sampled_edges_dict) == 0:  # Single table:
+            num_sampled_nodes_dict = {
+                key: value[:1]
+                for key, value in num_sampled_nodes_dict.items()
+            }
 
         return SamplerOutput(
             anchor_time=anchor_time.astype(int).to_numpy(),
-            df_dict={entity_table_name: df},
-            inverse_dict={},
-            batch_dict={entity_table_name: batch},
-            num_sampled_nodes_dict={entity_table_name: [len(batch)]},
-            row_dict={},
-            col_dict={},
-            num_sampled_edges_dict={},
+            df_dict=df_dict,
+            inverse_dict={},  # TODO
+            batch_dict=batch_dict,
+            num_sampled_nodes_dict=num_sampled_nodes_dict,
+            row_dict=row_dict,
+            col_dict=col_dict,
+            num_sampled_edges_dict=num_sampled_edges_dict,
         )
 
     # Abstract Methods ########################################################
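The loop above relies on `Mapper` (imported from `kumoai.experimental.rfm.base.mapper`) to translate `(primary key, example)` pairs into contiguous local node IDs, returning `-1` for unseen pairs; that contract is what the `row == -1` check and the later `row[mask] = ...` assignment exploit. The real class is not shown in this diff; the following is only a minimal dictionary-based stand-in for the `add`/`get` interface used here:

import numpy as np
import pandas as pd

class ToyMapper:
    """Dictionary-backed sketch of the assumed Mapper contract."""
    def __init__(self, num_examples: int) -> None:
        self.num_examples = num_examples
        self._id_dict: dict[tuple, int] = {}

    def add(self, pkey: pd.Series, batch: np.ndarray) -> None:
        # Assign the next free node ID to every unseen (key, example) pair:
        for key, example in zip(pkey.tolist(), batch.tolist()):
            self._id_dict.setdefault((key, example), len(self._id_dict))

    def get(self, pkey: pd.Series, batch: np.ndarray) -> np.ndarray:
        # Look up node IDs; unknown pairs map to -1 (dangling keys):
        return np.array([
            self._id_dict.get((key, example), -1)
            for key, example in zip(pkey.tolist(), batch.tolist())
        ])

mapper = ToyMapper(num_examples=2)
mapper.add(pkey=pd.Series([10, 20]), batch=np.array([0, 1]))
assert mapper.get(pd.Series([20, 30]), np.array([1, 1])).tolist() == [1, -1]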
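The de-duplication inside the hop loop leans on `pd.factorize`: `fkey_index` holds each key exactly once (so every key is fetched from the backend at most once), while `code` maps each unique `(key, example)` row back to its position in `fkey_index`. A standalone illustration with made-up values:

import pandas as pd

key_df = pd.DataFrame({
    'fkey': [7, 7, 9, 7],
    'batch': [0, 0, 1, 1],
})
# Only maintain unique keys per example:
unique_key_df = key_df.drop_duplicates()  # rows (7, 0), (9, 1), (7, 1)
# Fully de-duplicate keys across examples:
code, fkey_index = pd.factorize(unique_key_df['fkey'])
assert fkey_index.tolist() == [7, 9]  # each key hits the backend once
assert code.tolist() == [0, 1, 0]     # unique rows -> fetched key positions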
@@ -137,7 +348,19 @@ class SQLSampler(Sampler):
     def _by_pkey(
         self,
         table_name: str,
-        pkey: pd.Series,
+        index: pd.Series,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        pass
+
+    @abstractmethod
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
         pass
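Concrete subclasses implement both hooks with backend-specific SQL. For orientation, a minimal in-memory sketch of the `_by_pkey` contract as the sampling loop uses it: return the matching rows plus, for each returned row, the position of its key in the input `index`. The `self.tables` store is hypothetical, and `numpy`/`pandas` are assumed imported as in the module above:

    def _by_pkey(
        self,
        table_name: str,
        index: pd.Series,
        columns: set[str],
    ) -> tuple[pd.DataFrame, np.ndarray]:
        # In-memory stand-in for a `SELECT ... WHERE pkey IN (...)` query:
        df = self.tables[table_name]  # hypothetical raw data frame store
        pkey = self.primary_key_dict[table_name]
        pos = pd.Index(df[pkey]).get_indexer(index)  # -1 => missing key
        batch = np.nonzero(pos >= 0)[0]  # input positions that matched
        out = df.iloc[pos[pos >= 0]][list(columns)].reset_index(drop=True)
        return out, batch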
@@ -5,6 +5,7 @@ from functools import cached_property
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from kumoapi.model_plan import MissingType
 from kumoapi.source_table import UnavailableSourceTable
 from kumoapi.table import Column as ColumnDefinition
@@ -12,7 +13,6 @@ from kumoapi.table import TableDefinition
 from kumoapi.typing import Dtype, Stype
 from typing_extensions import Self
 
-from kumoai import in_notebook, in_snowflake_notebook
 from kumoai.experimental.rfm.base import (
     Column,
     ColumnSpec,
@@ -27,7 +27,7 @@ from kumoai.experimental.rfm.infer import (
     infer_stype,
     infer_time_column,
 )
-from kumoai.utils import quote_ident
+from kumoai.utils import display, quote_ident
 
 
 class Table(ABC):
@@ -196,7 +196,7 @@ class Table(ABC):
                 raise RuntimeError(
                     f"Encountered unsupported data type '{ser.dtype}' for "
                     f"column '{column_spec.name}' in table '{self.name}'. "
-                    f"Please either manually specify the columns's data "
+                    f"Please either manually override the column's data "
                     f"type or remove the column from this table.") from e
 
         if stype is None:
@@ -272,8 +272,8 @@ class Table(ABC):
         no such primary key is present.
 
         The setter sets a column as a primary key on this table, and raises a
-        :class:`ValueError` if the primary key has a non-ID semantic type or
-        if the column name does not match a column in the data frame.
+        :class:`ValueError` if the primary key has a non-ID compatible data
+        type or if the column name does not match a column in the data frame.
         """
         if self._primary_key is None:
             return None
@@ -317,8 +317,9 @@ class Table(ABC):
         such time column is present.
 
         The setter sets a column as a time column on this table, and raises a
-        :class:`ValueError` if the time column has a non-timestamp semantic
-        type or if the column name does not match a column in the data frame.
+        :class:`ValueError` if the time column has a non-timestamp compatible
+        data type or if the column name does not match a column in the data
+        frame.
         """
         if self._time_column is None:
             return None
@@ -363,8 +364,8 @@ class Table(ABC):
 
         The setter sets a column as an end time column on this table, and
         raises a :class:`ValueError` if the end time column has a non-timestamp
-        semantic type or if the column name does not match a column in the data
-        frame.
+        compatible data type or if the column name does not match a column in
+        the data frame.
         """
         if self._end_time_column is None:
             return None
@@ -399,39 +400,39 @@ class Table(ABC):
         r"""Returns a :class:`pandas.DataFrame` object containing metadata
         information about the columns in this table.
 
-        The returned dataframe has columns ``name``, ``dtype``, ``stype``,
-        ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
-        which provide an aggregate view of the properties of the columns of
-        this table.
+        The returned dataframe has columns ``"Name"``, ``"Data Type"``,
+        ``"Semantic Type"``, ``"Primary Key"``, ``"Time Column"`` and
+        ``"End Time Column"``, which provide an aggregated view of the
+        properties of the columns of this table.
 
         Example:
             >>> # doctest: +SKIP
             >>> import kumoai.experimental.rfm as rfm
            >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
            >>> table.metadata
-                     name    dtype stype  is_primary_key  is_time_column  is_end_time_column
-            0  CustomerID  float64    ID            True           False               False
+                     Name  Data Type Semantic Type  Primary Key  Time Column  End Time Column
+            0  CustomerID    float64            ID         True        False            False
         """  # noqa: E501
         cols = self.columns
 
         return pd.DataFrame({
-            'name':
+            'Name':
             pd.Series(dtype=str, data=[c.name for c in cols]),
-            'dtype':
+            'Data Type':
             pd.Series(dtype=str, data=[c.dtype for c in cols]),
-            'stype':
+            'Semantic Type':
             pd.Series(dtype=str, data=[c.stype for c in cols]),
-            'is_primary_key':
+            'Primary Key':
             pd.Series(
                 dtype=bool,
                 data=[self._primary_key == c.name for c in cols],
             ),
-            'is_time_column':
+            'Time Column':
             pd.Series(
                 dtype=bool,
                 data=[self._time_column == c.name for c in cols],
             ),
-            'is_end_time_column':
+            'End Time Column':
             pd.Series(
                 dtype=bool,
                 data=[self._end_time_column == c.name for c in cols],
@@ -440,30 +441,12 @@ class Table(ABC):
 
     def print_metadata(self) -> None:
         r"""Prints the :meth:`~metadata` of this table."""
-        num_rows_repr = ''
+        msg = f"🏷️ Metadata of Table `{self.name}`"
         if num := self._num_rows:
-            num_rows_repr = f' ({num} row)' if num == 1 else f' ({num:,} rows)'
-
-        if in_snowflake_notebook():
-            import streamlit as st
-            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
-            st.markdown(md_repr)
-            st.dataframe(self.metadata, hide_index=True)
-        elif in_notebook():
-            from IPython.display import Markdown, display
-            md_repr = f"### 🏷️ Metadata of Table `{self.name}`{num_rows_repr}"
-            display(Markdown(md_repr))
-            df = self.metadata
-            try:
-                if hasattr(df.style, 'hide'):
-                    display(df.style.hide(axis='index'))  # pandas=2
-                else:
-                    display(df.style.hide_index())  # pandas<1.3
-            except ImportError:
-                print(df.to_string(index=False))  # missing jinja2
-        else:
-            print(f"🏷️ Metadata of Table '{self.name}'{num_rows_repr}")
-            print(self.metadata.to_string(index=False))
+            msg += " (1 row)" if num == 1 else f" ({num:,} rows)"
+
+        display.title(msg)
+        display.dataframe(self.metadata)
 
     def infer_primary_key(self, verbose: bool = True) -> Self:
         r"""Infers the primary key in this table.
@@ -477,8 +460,8 @@ class Table(ABC):
         def _set_primary_key(primary_key: str) -> None:
             self.primary_key = primary_key
             if verbose:
-                print(f"Inferred primary key '{primary_key}' for table "
-                      f"'{self.name}'")
+                display.message(f"Inferred primary key `{primary_key}` for "
+                                f"table `{self.name}`")
 
         # Inference from source column metadata:
         if any(column.is_source for column in self.columns):
@@ -543,8 +526,8 @@ class Table(ABC):
             self.time_column = time_column
 
             if verbose:
-                print(f"Inferred time column '{time_column}' for table "
-                      f"'{self.name}'")
+                display.message(f"Inferred time column `{time_column}` for "
+                                f"table `{self.name}`")
 
         return self
 
@@ -560,15 +543,16 @@ class Table(ABC):
         if not self.has_primary_key():
             self.infer_primary_key(verbose=False)
         if self.has_primary_key():
-            logs.append(f"primary key '{self._primary_key}'")
+            logs.append(f"primary key `{self._primary_key}`")
 
         if not self.has_time_column():
             self.infer_time_column(verbose=False)
         if self.has_time_column():
-            logs.append(f"time column '{self._time_column}'")
+            logs.append(f"time column `{self._time_column}`")
 
         if verbose and len(logs) > 0:
-            print(f"Inferred {' and '.join(logs)} for table '{self.name}'")
+            display.message(f"Inferred {' and '.join(logs)} for table "
+                            f"`{self.name}`")
 
         return self
 
@@ -641,14 +625,18 @@ class Table(ABC):
         types match table data and semantic type specification.
         """
         def _to_datetime(ser: pd.Series) -> pd.Series:
-            if not pd.api.types.is_datetime64_any_dtype(ser):
+            if (not pd.api.types.is_datetime64_any_dtype(ser)
+                    and not (isinstance(ser.dtype, pd.ArrowDtype) and
+                             pa.types.is_timestamp(ser.dtype.pyarrow_dtype))):
                 with warnings.catch_warnings():
                     warnings.filterwarnings(
                         'ignore',
                         message='Could not infer format',
                     )
                     ser = pd.to_datetime(ser, errors='coerce')
-            if isinstance(ser.dtype, pd.DatetimeTZDtype):
+            if (isinstance(ser.dtype, pd.DatetimeTZDtype)
+                    or (isinstance(ser.dtype, pd.ArrowDtype)
+                        and ser.dtype.pyarrow_dtype.tz is not None)):
                 ser = ser.dt.tz_localize(None)
             if ser.dtype != 'datetime64[ns]':
                 ser = ser.astype('datetime64[ns]')
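The two widened conditions make `_to_datetime` recognize pyarrow-backed timestamp columns: such columns must skip the `pd.to_datetime` round-trip but still need their timezone stripped and their dtype normalized. A standalone check of both branches, assuming pandas >= 2.1 with pyarrow installed (example data made up):

import pandas as pd
import pyarrow as pa

ser = pd.Series(pd.to_datetime(['2024-01-01'], utc=True)).astype(
    pd.ArrowDtype(pa.timestamp('us', tz='UTC')))
# Arrow timestamps already count as datetimes, so parsing is skipped:
assert pa.types.is_timestamp(ser.dtype.pyarrow_dtype)
# ...but the timezone still needs to be dropped:
assert ser.dtype.pyarrow_dtype.tz is not None
ser = ser.dt.tz_localize(None).astype('datetime64[ns]')
assert ser.dtype == 'datetime64[ns]'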