kumoai 2.14.0.dev202601051732__cp311-cp311-macosx_11_0_arm64.whl → 2.15.0.dev202601141731__cp311-cp311-macosx_11_0_arm64.whl

@@ -1,8 +1,10 @@
 from abc import abstractmethod
+from collections import defaultdict
 from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
+from kumoapi.rfm.context import Subgraph
 from kumoapi.typing import Dtype
 
 from kumoai.experimental.rfm.base import (
@@ -11,11 +13,14 @@ from kumoai.experimental.rfm.base import (
     SamplerOutput,
     SourceColumn,
 )
+from kumoai.experimental.rfm.base.mapper import Mapper
 from kumoai.utils import ProgressLogger, quote_ident
 
 if TYPE_CHECKING:
     from kumoai.experimental.rfm import Graph
 
+EdgeType = tuple[str, str, str]
+
 
 class SQLSampler(Sampler):
     def __init__(
@@ -100,11 +105,28 @@ class SQLSampler(Sampler):
         num_neighbors: list[int],
     ) -> SamplerOutput:
 
-        # TODO Add entity time column to `columns_dict`.
+        # Make sure to always include primary key, foreign key and time columns
+        # during data fetching since these are needed for graph traversal:
+        sample_columns_dict: dict[str, set[str]] = {}
+        for table, columns in columns_dict.items():
+            sample_columns = columns | {
+                foreign_key
+                for foreign_key, _ in self.foreign_key_dict[table]
+            }
+            if primary_key := self.primary_key_dict.get(table):
+                sample_columns |= {primary_key}
+            sample_columns_dict[table] = sample_columns
+        if not isinstance(anchor_time, pd.Series):
+            sample_columns_dict[entity_table_name] |= {
+                self.time_column_dict[entity_table_name]
+            }
+
+        # Sample Entity Table #################################################
+
         df, batch = self._by_pkey(
             table_name=entity_table_name,
-            pkey=entity_pkey,
-            columns=columns_dict[entity_table_name],
+            index=entity_pkey,
+            columns=sample_columns_dict[entity_table_name],
         )
         if len(batch) != len(entity_pkey):
             mask = np.ones(len(entity_pkey), dtype=bool)
@@ -113,23 +135,230 @@ class SQLSampler(Sampler):
                 f"{entity_pkey.iloc[mask].tolist()} do not exist "
                 f"in the '{entity_table_name}' table")
 
+        # Make sure that entities are returned in expected order:
         perm = batch.argsort()
         batch = batch[perm]
         df = df.iloc[perm].reset_index(drop=True)
 
+        # Fill 'entity' anchor times with actual values:
         if not isinstance(anchor_time, pd.Series):
             time_column = self.time_column_dict[entity_table_name]
             anchor_time = df[time_column]
+        assert isinstance(anchor_time, pd.Series)
+
+        # Recursive Neighbor Sampling #########################################
+
+        mapper_dict: dict[str, Mapper] = defaultdict(
+            lambda: Mapper(num_examples=len(entity_pkey)))
+        mapper_dict[entity_table_name].add(
+            pkey=df[self.primary_key_dict[entity_table_name]],
+            batch=batch,
+        )
+
+        dfs_dict: dict[str, list[pd.DataFrame]] = defaultdict(list)
+        dfs_dict[entity_table_name].append(df)
+        batches_dict: dict[str, list[np.ndarray]] = defaultdict(list)
+        batches_dict[entity_table_name].append(batch)
+        num_sampled_nodes_dict: dict[str, list[int]] = defaultdict(
+            lambda: [0] * (len(num_neighbors) + 1))
+        num_sampled_nodes_dict[entity_table_name][0] = len(entity_pkey)
+
+        rows_dict: dict[EdgeType, list[np.ndarray]] = defaultdict(list)
+        cols_dict: dict[EdgeType, list[np.ndarray]] = defaultdict(list)
+        num_sampled_edges_dict: dict[EdgeType, list[int]] = defaultdict(
+            lambda: [0] * len(num_neighbors))
+
+        # The start index of data frame slices of the previous hop:
+        offset_dict: dict[str, int] = defaultdict(int)
+
+        for hop, neighbors in enumerate(num_neighbors):
+            if neighbors == 0:
+                break  # Abort early.
+
+            for table in list(num_sampled_nodes_dict.keys()):
+                # Only sample from tables that have been visited in the
+                # previous hop:
+                if num_sampled_nodes_dict[table][hop] == 0:
+                    continue
+
+                # Collect the slices of data sampled in the previous hop
+                # (but maintain only required key information):
+                cols = [fkey for fkey, _ in self.foreign_key_dict[table]]
+                if table in self.primary_key_dict:
+                    cols.append(self.primary_key_dict[table])
+                dfs = [df[cols] for df in dfs_dict[table][offset_dict[table]:]]
+                df = pd.concat(
+                    dfs,
+                    axis=0,
+                    ignore_index=True,
+                ) if len(dfs) > 1 else dfs[0]
+                batches = batches_dict[table][offset_dict[table]:]
+                batch = (np.concatenate(batches)
+                         if len(batches) > 1 else batches[0])
+                offset_dict[table] = len(batches_dict[table])  # Increase.
+
+                pkey: pd.Series | None = None
+                index: np.ndarray | None = None
+                if table in self.primary_key_dict:
+                    pkey = df[self.primary_key_dict[table]]
+                    index = mapper_dict[table].get(pkey, batch)
+
+                # Iterate over foreign keys in the current table:
+                for fkey, dst_table in self.foreign_key_dict[table]:
+                    row = mapper_dict[dst_table].get(df[fkey], batch)
+                    mask = row == -1
+                    if mask.any():
+                        key_df = pd.DataFrame({
+                            'fkey': df[fkey],
+                            'batch': batch,
+                        }).iloc[mask]
+                        # Only maintain unique keys per example:
+                        unique_key_df = key_df.drop_duplicates()
+                        # Fully de-duplicate keys across examples:
+                        code, fkey_index = pd.factorize(unique_key_df['fkey'])
+
+                        _df, _batch = self._by_pkey(
+                            table_name=dst_table,
+                            index=fkey_index,
+                            columns=sample_columns_dict[dst_table],
+                        )  # Ensure result is sorted according to input order:
+                        _df = _df.iloc[_batch.argsort()]
+
+                        # Compute valid entries (without dangling foreign keys)
+                        # in `unique_key_df`:
+                        _mask = np.full(len(fkey_index), fill_value=False)
+                        _mask[_batch] = True
+                        _mask = _mask[code]
+
+                        # Reconstruct unique (key, batch) pairs:
+                        code, _ = pd.factorize(unique_key_df['fkey'][_mask])
+                        _df = _df.iloc[code].reset_index(drop=True)
+                        _batch = unique_key_df['batch'].to_numpy()[_mask]
+
+                        # Register node IDs:
+                        mapper_dict[dst_table].add(
+                            pkey=_df[self.primary_key_dict[dst_table]],
+                            batch=_batch,
+                        )
+                        row[mask] = mapper_dict[dst_table].get(
+                            pkey=key_df['fkey'],
+                            batch=key_df['batch'].to_numpy(),
+                        )  # NOTE `row` may still hold `-1` for dangling fkeys.
+
+                        dfs_dict[dst_table].append(_df)
+                        batches_dict[dst_table].append(_batch)
+                        num_sampled_nodes_dict[dst_table][hop + 1] += (  #
+                            len(_batch))
+
+                    mask = row != -1
+
+                    col = index
+                    if col is None:
+                        start = sum(num_sampled_nodes_dict[table][:hop])
+                        end = sum(num_sampled_nodes_dict[table][:hop + 1])
+                        col = np.arange(start, end)
+
+                    row = row[mask]
+                    col = col[mask]
+
+                    edge_type = (table, fkey, dst_table)
+                    edge_type = Subgraph.rev_edge_type(edge_type)
+                    rows_dict[edge_type].append(row)
+                    cols_dict[edge_type].append(col)
+                    num_sampled_edges_dict[edge_type][hop] = len(col)
+
+                # Iterate over foreign keys that reference the current table:
+                for src_table, fkey in self.rev_foreign_key_dict[table]:
+                    assert pkey is not None and index is not None
+                    _df, _batch = self._by_fkey(
+                        table_name=src_table,
+                        foreign_key=fkey,
+                        index=pkey,
+                        num_neighbors=neighbors,
+                        anchor_time=anchor_time.iloc[batch],
+                        columns=sample_columns_dict[src_table],
+                    )
+
+                    edge_type = (src_table, fkey, table)
+                    cols_dict[edge_type].append(index[_batch])
+                    num_sampled_edges_dict[edge_type][hop] = len(_batch)
+
+                    _batch = batch[_batch]
+                    num_nodes = sum(num_sampled_nodes_dict[src_table])
+                    if src_table in self.primary_key_dict:
+                        _pkey = _df[self.primary_key_dict[src_table]]
+                        mapper_dict[src_table].add(_pkey, _batch)
+                        row = mapper_dict[src_table].get(_pkey, _batch)
+
+                        # Only preserve unknown rows:
+                        mask = row >= num_nodes  # type: ignore
+                        mask[pd.Index(row).duplicated()] = False
+                        _df = _df.iloc[mask]
+                        _batch = _batch[mask]
+                    else:
+                        row = np.arange(num_nodes, num_nodes + len(_batch))
+
+                    rows_dict[edge_type].append(row)
+                    num_sampled_nodes_dict[src_table][hop + 1] += len(_batch)
+
+                    dfs_dict[src_table].append(_df)
+                    batches_dict[src_table].append(_batch)
+
+        # Post-Processing #####################################################
+
+        df_dict = {
+            table:
+            pd.concat(dfs, axis=0, ignore_index=True)
+            if len(dfs) > 1 else dfs[0]
+            for table, dfs in dfs_dict.items()
+        }
+
+        # Only store unique rows in `df` above a certain threshold:
+        inverse_dict: dict[str, np.ndarray] = {}
+        for table, df in df_dict.items():
+            if table not in self.primary_key_dict:
+                continue
+            unique, index, inverse = np.unique(
+                df_dict[table][self.primary_key_dict[table]],
+                return_index=True,
+                return_inverse=True,
+            )
+            if len(df) > 1.05 * len(unique):
+                df_dict[table] = df.iloc[index].reset_index(drop=True)
+                inverse_dict[table] = inverse
+
+        df_dict = {  # Post-filter column set:
+            table: df[list(columns_dict[table])]
+            for table, df in df_dict.items()
+        }
+        batch_dict = {
+            table: np.concatenate(batches) if len(batches) > 1 else batches[0]
+            for table, batches in batches_dict.items()
+        }
+        row_dict = {
+            edge_type: np.concatenate(rows)
+            for edge_type, rows in rows_dict.items()
+        }
+        col_dict = {
+            edge_type: np.concatenate(cols)
+            for edge_type, cols in cols_dict.items()
+        }
+
+        if len(num_sampled_edges_dict) == 0:  # Single table:
+            num_sampled_nodes_dict = {
+                key: value[:1]
+                for key, value in num_sampled_nodes_dict.items()
+            }
 
         return SamplerOutput(
             anchor_time=anchor_time.astype(int).to_numpy(),
-            df_dict={entity_table_name: df},
-            inverse_dict={},
-            batch_dict={entity_table_name: batch},
-            num_sampled_nodes_dict={entity_table_name: [len(batch)]},
-            row_dict={},
-            col_dict={},
-            num_sampled_edges_dict={},
+            df_dict=df_dict,
+            inverse_dict=inverse_dict,
+            batch_dict=batch_dict,
+            num_sampled_nodes_dict=num_sampled_nodes_dict,
+            row_dict=row_dict,
+            col_dict=col_dict,
+            num_sampled_edges_dict=num_sampled_edges_dict,
         )
 
     # Abstract Methods ########################################################
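Note on the `col` computation in the hunk above: for tables without a primary key, local node IDs are assigned contiguously in sampling order, so the slice of IDs belonging to the hop currently being expanded can be recovered from the per-hop counts alone. A small illustration with made-up counts:

    import numpy as np

    # Hypothetical per-hop node counts for one table (entry 0 = seed entities):
    num_sampled_nodes = [3, 4, 6]

    hop = 1  # Currently expanding the nodes discovered at hop 1.
    start = sum(num_sampled_nodes[:hop])    # 3
    end = sum(num_sampled_nodes[:hop + 1])  # 7
    col = np.arange(start, end)             # array([3, 4, 5, 6])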
@@ -138,7 +367,19 @@ class SQLSampler(Sampler):
     def _by_pkey(
         self,
         table_name: str,
-        pkey: pd.Series,
+        index: pd.Series,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        pass
+
+    @abstractmethod
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
         pass
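The recursive sampler keys all of its bookkeeping on one `Mapper` per table, imported above from `kumoai.experimental.rfm.base.mapper`. The `Mapper` implementation itself is not part of this diff; the sketch below is a hypothetical reconstruction of the contract the sampling loop relies on: `add()` registers (primary key, example) pairs and assigns contiguous local node IDs, and `get()` returns those IDs, or `-1` for pairs that were never registered (e.g., dangling foreign keys).

    import numpy as np
    import pandas as pd

    class Mapper:
        """Hypothetical sketch: maps (pkey, example) pairs to local node IDs."""
        def __init__(self, num_examples: int) -> None:
            self.num_examples = num_examples
            self._id_dict: dict[tuple, int] = {}

        def add(self, pkey: pd.Series, batch: np.ndarray) -> None:
            # Assign the next free local node ID to each unseen pair:
            for key, example in zip(pkey.tolist(), batch.tolist()):
                self._id_dict.setdefault((key, example), len(self._id_dict))

        def get(self, pkey: pd.Series, batch: np.ndarray) -> np.ndarray:
            # Unknown pairs map to -1, which the sampler treats as
            # "not yet fetched" (or as dangling after fetching):
            return np.array([
                self._id_dict.get((key, example), -1)
                for key, example in zip(pkey.tolist(), batch.tolist())
            ], dtype=np.int64)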
@@ -1,4 +1,3 @@
-import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from functools import cached_property
@@ -20,6 +19,7 @@ from kumoai.experimental.rfm.base import (
     SourceColumn,
     SourceForeignKey,
 )
+from kumoai.experimental.rfm.base.utils import to_datetime
 from kumoai.experimental.rfm.infer import (
     infer_dtype,
     infer_primary_key,
@@ -399,39 +399,39 @@ class Table(ABC):
         r"""Returns a :class:`pandas.DataFrame` object containing metadata
         information about the columns in this table.
 
-        The returned dataframe has columns ``name``, ``dtype``, ``stype``,
-        ``is_primary_key``, ``is_time_column`` and ``is_end_time_column``,
-        which provide an aggregate view of the properties of the columns of
-        this table.
+        The returned dataframe has columns ``"Name"``, ``"Data Type"``,
+        ``"Semantic Type"``, ``"Primary Key"``, ``"Time Column"`` and
+        ``"End Time Column"``, which provide an aggregated view of the
+        properties of the columns of this table.
 
         Example:
             >>> # doctest: +SKIP
            >>> import kumoai.experimental.rfm as rfm
            >>> table = rfm.LocalTable(df=..., name=...).infer_metadata()
            >>> table.metadata
-                     name    dtype stype  is_primary_key  is_time_column  is_end_time_column
-            0  CustomerID  float64    ID            True           False               False
+                     Name Data Type Semantic Type  Primary Key  Time Column  End Time Column
+            0  CustomerID   float64            ID         True        False            False
        """  # noqa: E501
        cols = self.columns
 
        return pd.DataFrame({
-            'name':
+            'Name':
            pd.Series(dtype=str, data=[c.name for c in cols]),
-            'dtype':
+            'Data Type':
            pd.Series(dtype=str, data=[c.dtype for c in cols]),
-            'stype':
+            'Semantic Type':
            pd.Series(dtype=str, data=[c.stype for c in cols]),
-            'is_primary_key':
+            'Primary Key':
            pd.Series(
                dtype=bool,
                data=[self._primary_key == c.name for c in cols],
            ),
-            'is_time_column':
+            'Time Column':
            pd.Series(
                dtype=bool,
                data=[self._time_column == c.name for c in cols],
            ),
-            'is_end_time_column':
+            'End Time Column':
            pd.Series(
                dtype=bool,
                data=[self._end_time_column == c.name for c in cols],
@@ -623,20 +623,6 @@ class Table(ABC):
         r"""Sanitizes a :class:`pandas.DataFrame` in-place such that its data
         types match table data and semantic type specification.
         """
-        def _to_datetime(ser: pd.Series) -> pd.Series:
-            if not pd.api.types.is_datetime64_any_dtype(ser):
-                with warnings.catch_warnings():
-                    warnings.filterwarnings(
-                        'ignore',
-                        message='Could not infer format',
-                    )
-                    ser = pd.to_datetime(ser, errors='coerce')
-            if isinstance(ser.dtype, pd.DatetimeTZDtype):
-                ser = ser.dt.tz_localize(None)
-            if ser.dtype != 'datetime64[ns]':
-                ser = ser.astype('datetime64[ns]')
-            return ser
-
         def _to_list(ser: pd.Series, dtype: Dtype | None) -> pd.Series:
             if (pd.api.types.is_string_dtype(ser)
                     and dtype in {Dtype.intlist, Dtype.floatlist}):
@@ -667,9 +653,9 @@ class Table(ABC):
         stype = (stype_dict or {}).get(column_name)
 
         if dtype == Dtype.time:
-            df[column_name] = _to_datetime(df[column_name])
+            df[column_name] = to_datetime(df[column_name])
         elif stype == Stype.timestamp:
-            df[column_name] = _to_datetime(df[column_name])
+            df[column_name] = to_datetime(df[column_name])
         elif dtype is not None and dtype.is_list():
             df[column_name] = _to_list(df[column_name], dtype)
         elif stype == Stype.sequence:
@@ -0,0 +1,36 @@
+import warnings
+
+import pandas as pd
+import pyarrow as pa
+
+
+def is_datetime(ser: pd.Series) -> bool:
+    r"""Check whether a :class:`pandas.Series` holds datetime values."""
+    if isinstance(ser.dtype, pd.ArrowDtype):
+        dtype = ser.dtype.pyarrow_dtype
+        return (pa.types.is_timestamp(dtype) or pa.types.is_date(dtype)
+                or pa.types.is_time(dtype))
+
+    return pd.api.types.is_datetime64_any_dtype(ser)
+
+
+def to_datetime(ser: pd.Series) -> pd.Series:
+    """Converts a :class:`pandas.Series` to ``datetime64[ns]`` format."""
+    if isinstance(ser.dtype, pd.ArrowDtype):
+        ser = pd.Series(ser.to_numpy(), index=ser.index, name=ser.name)
+
+    if not pd.api.types.is_datetime64_any_dtype(ser):
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                'ignore',
+                message='Could not infer format',
+            )
+            ser = pd.to_datetime(ser, errors='coerce')
+
+    if isinstance(ser.dtype, pd.DatetimeTZDtype):
+        ser = ser.dt.tz_localize(None)
+
+    if ser.dtype != 'datetime64[ns]':
+        ser = ser.astype('datetime64[ns]')
+
+    return ser
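A brief usage sketch of the two new helpers (the import path matches the one used elsewhere in this diff; the sample values are made up):

    import pandas as pd
    import pyarrow as pa

    from kumoai.experimental.rfm.base.utils import is_datetime, to_datetime

    # String columns are coerced; unparsable entries become NaT:
    ser = pd.Series(['2024-01-01', 'not a date'])
    print(to_datetime(ser).dtype)  # datetime64[ns]

    # Arrow-backed timestamp columns are detected and normalized as well:
    arrow = pd.Series(pd.to_datetime(['2024-01-01'])).astype(
        pd.ArrowDtype(pa.timestamp('ms')))
    print(is_datetime(arrow))        # True
    print(to_datetime(arrow).dtype)  # datetime64[ns]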
@@ -649,28 +649,28 @@ class Graph:
         r"""Returns a :class:`pandas.DataFrame` object containing metadata
         information about the tables in this graph.
 
-        The returned dataframe has columns ``name``, ``primary_key``,
-        ``time_column``, and ``end_time_column``, which provide an aggregate
-        view of the properties of the tables of this graph.
+        The returned dataframe has columns ``"Name"``, ``"Primary Key"``,
+        ``"Time Column"``, and ``"End Time Column"``, which provide an
+        aggregated view of the properties of the tables of this graph.
 
         Example:
            >>> # doctest: +SKIP
            >>> import kumoai.experimental.rfm as rfm
            >>> graph = rfm.Graph(tables=...).infer_metadata()
            >>> graph.metadata  # doctest: +SKIP
-                name primary_key time_column end_time_column
-            0  users     user_id           -               -
+                Name Primary Key Time Column End Time Column
+            0  users     user_id           -               -
        """
        tables = list(self.tables.values())
 
        return pd.DataFrame({
-            'name':
+            'Name':
            pd.Series(dtype=str, data=[t.name for t in tables]),
-            'primary_key':
+            'Primary Key':
            pd.Series(dtype=str, data=[t._primary_key or '-' for t in tables]),
-            'time_column':
+            'Time Column':
            pd.Series(dtype=str, data=[t._time_column or '-' for t in tables]),
-            'end_time_column':
+            'End Time Column':
            pd.Series(
                dtype=str,
                data=[t._end_time_column or '-' for t in tables],
@@ -3,6 +3,8 @@ import pandas as pd
 import pyarrow as pa
 from kumoapi.typing import Dtype
 
+from kumoai.experimental.rfm.base.utils import is_datetime
+
 PANDAS_TO_DTYPE: dict[str, Dtype] = {
     'bool': Dtype.bool,
     'boolean': Dtype.bool,
@@ -34,7 +36,7 @@ def infer_dtype(ser: pd.Series) -> Dtype:
     Returns:
         The data type.
     """
-    if pd.api.types.is_datetime64_any_dtype(ser.dtype):
+    if is_datetime(ser):
         return Dtype.date
     if pd.api.types.is_timedelta64_dtype(ser.dtype):
         return Dtype.timedelta
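The practical effect of the `is_datetime` switch: Arrow-backed date and timestamp columns are now classified as `Dtype.date`, whereas `pd.api.types.is_datetime64_any_dtype` only recognizes NumPy datetime dtypes. A sketch (import path as shown in the diff above; sample data made up):

    import datetime

    import pandas as pd
    import pyarrow as pa
    from kumoai.experimental.rfm.infer import infer_dtype

    ser = pd.Series([datetime.date(2024, 1, 1)],
                    dtype=pd.ArrowDtype(pa.date32()))

    pd.api.types.is_datetime64_any_dtype(ser.dtype)  # False: Arrow, not NumPy.
    infer_dtype(ser)                                 # Now returns Dtype.date.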
@@ -3,6 +3,8 @@ import warnings
 
 import pandas as pd
 
+from kumoai.experimental.rfm.base.utils import to_datetime
+
 
 def infer_time_column(
     df: pd.DataFrame,
@@ -43,11 +45,11 @@ def infer_time_column(
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', message='Could not infer format')
         min_timestamp_dict = {
-            key: pd.to_datetime(df[key].iloc[:10_000], 'coerce')
+            key: to_datetime(df[key].iloc[:10_000])
             for key in candidates
         }
         min_timestamp_dict = {
-            key: value.min().tz_localize(None)
+            key: value.min()
             for key, value in min_timestamp_dict.items()
         }
         min_timestamp_dict = {
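Since `to_datetime` already strips timezone information (see the new `utils` module above), the explicit `.tz_localize(None)` on the per-column minima became redundant. Under that assumption:

    import pandas as pd

    from kumoai.experimental.rfm.base.utils import to_datetime

    ser = pd.Series(['2024-01-01T00:00:00+02:00'])
    to_datetime(ser).min()  # Timestamp('2024-01-01 00:00:00'), already tz-naive.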