kumoai 2.14.0.dev202512191731__cp311-cp311-macosx_11_0_arm64.whl → 2.15.0.dev202601141731__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kumoai/__init__.py +23 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/connector/utils.py +21 -7
  6. kumoai/experimental/rfm/__init__.py +24 -22
  7. kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
  8. kumoai/experimental/rfm/backend/local/sampler.py +0 -3
  9. kumoai/experimental/rfm/backend/local/table.py +24 -25
  10. kumoai/experimental/rfm/backend/snow/sampler.py +235 -80
  11. kumoai/experimental/rfm/backend/snow/table.py +146 -70
  12. kumoai/experimental/rfm/backend/sqlite/sampler.py +196 -89
  13. kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
  14. kumoai/experimental/rfm/base/__init__.py +6 -9
  15. kumoai/experimental/rfm/base/column.py +95 -11
  16. kumoai/experimental/rfm/base/expression.py +44 -0
  17. kumoai/experimental/rfm/base/mapper.py +69 -0
  18. kumoai/experimental/rfm/base/sampler.py +28 -18
  19. kumoai/experimental/rfm/base/source.py +1 -1
  20. kumoai/experimental/rfm/base/sql_sampler.py +320 -19
  21. kumoai/experimental/rfm/base/table.py +256 -109
  22. kumoai/experimental/rfm/base/utils.py +36 -0
  23. kumoai/experimental/rfm/graph.py +115 -107
  24. kumoai/experimental/rfm/infer/dtype.py +7 -2
  25. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  26. kumoai/experimental/rfm/infer/time_col.py +4 -2
  27. kumoai/experimental/rfm/relbench.py +76 -0
  28. kumoai/experimental/rfm/rfm.py +540 -306
  29. kumoai/experimental/rfm/task_table.py +292 -0
  30. kumoai/pquery/training_table.py +16 -2
  31. kumoai/testing/snow.py +3 -3
  32. kumoai/trainer/distilled_trainer.py +175 -0
  33. kumoai/utils/display.py +87 -0
  34. kumoai/utils/progress_logger.py +15 -2
  35. kumoai/utils/sql.py +2 -2
  36. {kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/METADATA +2 -2
  37. {kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/RECORD +40 -35
  38. kumoai/experimental/rfm/base/column_expression.py +0 -50
  39. kumoai/experimental/rfm/base/sql_table.py +0 -229
  40. {kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/WHEEL +0 -0
  41. {kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/licenses/LICENSE +0 -0
  42. {kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/base/sql_sampler.py
@@ -1,15 +1,26 @@
 from abc import abstractmethod
+from collections import defaultdict
 from typing import TYPE_CHECKING, Literal

 import numpy as np
 import pandas as pd
+from kumoapi.rfm.context import Subgraph
+from kumoapi.typing import Dtype

-from kumoai.experimental.rfm.base import Sampler, SamplerOutput, SQLTable
-from kumoai.utils import ProgressLogger
+from kumoai.experimental.rfm.base import (
+    LocalExpression,
+    Sampler,
+    SamplerOutput,
+    SourceColumn,
+)
+from kumoai.experimental.rfm.base.mapper import Mapper
+from kumoai.utils import ProgressLogger, quote_ident

 if TYPE_CHECKING:
     from kumoai.experimental.rfm import Graph

+EdgeType = tuple[str, str, str]
+

 class SQLSampler(Sampler):
     def __init__(
@@ -19,18 +30,71 @@ class SQLSampler(Sampler):
     ) -> None:
         super().__init__(graph=graph, verbose=verbose)

-        self._fqn_dict: dict[str, str] = {}
+        self._source_name_dict: dict[str, str] = {
+            table.name: table._quoted_source_name
+            for table in graph.tables.values()
+        }
+
+        self._source_table_dict: dict[str, dict[str, SourceColumn]] = {}
+        for table in graph.tables.values():
+            self._source_table_dict[table.name] = {}
+            for column in table.columns:
+                if not column.is_source:
+                    continue
+                src_column = table._source_column_dict[column.name]
+                self._source_table_dict[table.name][column.name] = src_column
+
+        self._table_dtype_dict: dict[str, dict[str, Dtype]] = {}
         for table in graph.tables.values():
-            assert isinstance(table, SQLTable)
-            self._connection = table._connection
-            self._fqn_dict[table.name] = table.fqn
+            self._table_dtype_dict[table.name] = {}
+            for column in table.columns:
+                self._table_dtype_dict[table.name][column.name] = column.dtype
+
+        self._table_column_ref_dict: dict[str, dict[str, str]] = {}
+        self._table_column_proj_dict: dict[str, dict[str, str]] = {}
+        for table in graph.tables.values():
+            column_ref_dict: dict[str, str] = {}
+            column_proj_dict: dict[str, str] = {}
+            for column in table.columns:
+                if column.expr is not None:
+                    assert isinstance(column.expr, LocalExpression)
+                    column_ref_dict[column.name] = column.expr.value
+                    column_proj_dict[column.name] = (
+                        f'{column.expr} AS {quote_ident(column.name)}')
+                else:
+                    column_ref_dict[column.name] = quote_ident(column.name)
+                    column_proj_dict[column.name] = quote_ident(column.name)
+            self._table_column_ref_dict[table.name] = column_ref_dict
+            self._table_column_proj_dict[table.name] = column_proj_dict
+
+    @property
+    def source_name_dict(self) -> dict[str, str]:
+        r"""The source table names for all tables in the graph."""
+        return self._source_name_dict

     @property
-    def fqn_dict(self) -> dict[str, str]:
-        r"""The fully-qualified quoted source name for all table names in the
+    def source_table_dict(self) -> dict[str, dict[str, SourceColumn]]:
+        r"""The source column information for all tables in the graph."""
+        return self._source_table_dict
+
+    @property
+    def table_dtype_dict(self) -> dict[str, dict[str, Dtype]]:
+        r"""The data types for all columns in all tables in the graph."""
+        return self._table_dtype_dict
+
+    @property
+    def table_column_ref_dict(self) -> dict[str, dict[str, str]]:
+        r"""The SQL reference expression for all columns in all tables in the
         graph.
         """
-        return self._fqn_dict
+        return self._table_column_ref_dict
+
+    @property
+    def table_column_proj_dict(self) -> dict[str, dict[str, str]]:
+        r"""The SQL projection expressions for all columns in all tables in the
+        graph.
+        """
+        return self._table_column_proj_dict

     def _sample_subgraph(
         self,
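
For a plain source column, both lookups hold the quoted identifier; for an expression-backed column, the reference form keeps the raw SQL (usable inside WHERE and JOIN clauses) while the projection form aliases the expression back to the column name (usable in SELECT lists). A hypothetical illustration (the orders table, its columns, and the price * quantity expression are invented for this sketch, not taken from the package):

    # Assuming `sampler` is a SQLSampler built over such a graph:
    sampler.table_column_ref_dict['orders']['total']    # 'price * quantity'
    sampler.table_column_proj_dict['orders']['total']   # 'price * quantity AS "total"'
    sampler.table_column_ref_dict['orders']['user_id']  # '"user_id"'
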
@@ -41,10 +105,28 @@ class SQLSampler(Sampler):
         num_neighbors: list[int],
     ) -> SamplerOutput:

+        # Make sure to always include primary key, foreign key and time columns
+        # during data fetching since these are needed for graph traversal:
+        sample_columns_dict: dict[str, set[str]] = {}
+        for table, columns in columns_dict.items():
+            sample_columns = columns | {
+                foreign_key
+                for foreign_key, _ in self.foreign_key_dict[table]
+            }
+            if primary_key := self.primary_key_dict.get(table):
+                sample_columns |= {primary_key}
+            sample_columns_dict[table] = sample_columns
+        if not isinstance(anchor_time, pd.Series):
+            sample_columns_dict[entity_table_name] |= {
+                self.time_column_dict[entity_table_name]
+            }
+
+        # Sample Entity Table #################################################
+
         df, batch = self._by_pkey(
             table_name=entity_table_name,
-            pkey=entity_pkey,
-            columns=columns_dict[entity_table_name],
+            index=entity_pkey,
+            columns=sample_columns_dict[entity_table_name],
         )
         if len(batch) != len(entity_pkey):
             mask = np.ones(len(entity_pkey), dtype=bool)
@@ -53,23 +135,230 @@ class SQLSampler(Sampler):
                 f"{entity_pkey.iloc[mask].tolist()} do not exist "
                 f"in the '{entity_table_name}' table")

+        # Make sure that entities are returned in expected order:
         perm = batch.argsort()
         batch = batch[perm]
         df = df.iloc[perm].reset_index(drop=True)

+        # Fill 'entity' anchor times with actual values:
         if not isinstance(anchor_time, pd.Series):
             time_column = self.time_column_dict[entity_table_name]
             anchor_time = df[time_column]
+        assert isinstance(anchor_time, pd.Series)
+
+        # Recursive Neighbor Sampling #########################################
+
+        mapper_dict: dict[str, Mapper] = defaultdict(
+            lambda: Mapper(num_examples=len(entity_pkey)))
+        mapper_dict[entity_table_name].add(
+            pkey=df[self.primary_key_dict[entity_table_name]],
+            batch=batch,
+        )
+
+        dfs_dict: dict[str, list[pd.DataFrame]] = defaultdict(list)
+        dfs_dict[entity_table_name].append(df)
+        batches_dict: dict[str, list[np.ndarray]] = defaultdict(list)
+        batches_dict[entity_table_name].append(batch)
+        num_sampled_nodes_dict: dict[str, list[int]] = defaultdict(
+            lambda: [0] * (len(num_neighbors) + 1))
+        num_sampled_nodes_dict[entity_table_name][0] = len(entity_pkey)
+
+        rows_dict: dict[EdgeType, list[np.ndarray]] = defaultdict(list)
+        cols_dict: dict[EdgeType, list[np.ndarray]] = defaultdict(list)
+        num_sampled_edges_dict: dict[EdgeType, list[int]] = defaultdict(
+            lambda: [0] * len(num_neighbors))
+
+        # The start index of data frame slices of the previous hop:
+        offset_dict: dict[str, int] = defaultdict(int)
+
+        for hop, neighbors in enumerate(num_neighbors):
+            if neighbors == 0:
+                break  # Abort early.
+
+            for table in list(num_sampled_nodes_dict.keys()):
+                # Only sample from tables that have been visited in the
+                # previous hop:
+                if num_sampled_nodes_dict[table][hop] == 0:
+                    continue
+
+                # Collect the slices of data sampled in the previous hop
+                # (but maintain only required key information):
+                cols = [fkey for fkey, _ in self.foreign_key_dict[table]]
+                if table in self.primary_key_dict:
+                    cols.append(self.primary_key_dict[table])
+                dfs = [df[cols] for df in dfs_dict[table][offset_dict[table]:]]
+                df = pd.concat(
+                    dfs,
+                    axis=0,
+                    ignore_index=True,
+                ) if len(dfs) > 1 else dfs[0]
+                batches = batches_dict[table][offset_dict[table]:]
+                batch = (np.concatenate(batches)
+                         if len(batches) > 1 else batches[0])
+                offset_dict[table] = len(batches_dict[table])  # Increase.
+
+                pkey: pd.Series | None = None
+                index: np.ndarray | None = None
+                if table in self.primary_key_dict:
+                    pkey = df[self.primary_key_dict[table]]
+                    index = mapper_dict[table].get(pkey, batch)
+
+                # Iterate over foreign keys in the current table:
+                for fkey, dst_table in self.foreign_key_dict[table]:
+                    row = mapper_dict[dst_table].get(df[fkey], batch)
+                    mask = row == -1
+                    if mask.any():
+                        key_df = pd.DataFrame({
+                            'fkey': df[fkey],
+                            'batch': batch,
+                        }).iloc[mask]
+                        # Only maintain unique keys per example:
+                        unique_key_df = key_df.drop_duplicates()
+                        # Fully de-duplicate keys across examples:
+                        code, fkey_index = pd.factorize(unique_key_df['fkey'])
+
+                        _df, _batch = self._by_pkey(
+                            table_name=dst_table,
+                            index=fkey_index,
+                            columns=sample_columns_dict[dst_table],
+                        )  # Ensure result is sorted according to input order:
+                        _df = _df.iloc[_batch.argsort()]
+
+                        # Compute valid entries (without dangling foreign keys)
+                        # in `unique_key_df`:
+                        _mask = np.full(len(fkey_index), fill_value=False)
+                        _mask[_batch] = True
+                        _mask = _mask[code]
+
+                        # Reconstruct unique (key, batch) pairs:
+                        code, _ = pd.factorize(unique_key_df['fkey'][_mask])
+                        _df = _df.iloc[code].reset_index(drop=True)
+                        _batch = unique_key_df['batch'].to_numpy()[_mask]
+
+                        # Register node IDs:
+                        mapper_dict[dst_table].add(
+                            pkey=_df[self.primary_key_dict[dst_table]],
+                            batch=_batch,
+                        )
+                        row[mask] = mapper_dict[dst_table].get(
+                            pkey=key_df['fkey'],
+                            batch=key_df['batch'].to_numpy(),
+                        )  # NOTE `row` may still hold `-1` for dangling fkeys.
+
+                        dfs_dict[dst_table].append(_df)
+                        batches_dict[dst_table].append(_batch)
+                        num_sampled_nodes_dict[dst_table][hop + 1] += (  #
+                            len(_batch))
+
+                    mask = row != -1
+
+                    col = index
+                    if col is None:
+                        start = sum(num_sampled_nodes_dict[table][:hop])
+                        end = sum(num_sampled_nodes_dict[table][:hop + 1])
+                        col = np.arange(start, end)
+
+                    row = row[mask]
+                    col = col[mask]
+
+                    edge_type = (table, fkey, dst_table)
+                    edge_type = Subgraph.rev_edge_type(edge_type)
+                    rows_dict[edge_type].append(row)
+                    cols_dict[edge_type].append(col)
+                    num_sampled_edges_dict[edge_type][hop] = len(col)
+
+                # Iterate over foreign keys that reference the current table:
+                for src_table, fkey in self.rev_foreign_key_dict[table]:
+                    assert pkey is not None and index is not None
+                    _df, _batch = self._by_fkey(
+                        table_name=src_table,
+                        foreign_key=fkey,
+                        index=pkey,
+                        num_neighbors=neighbors,
+                        anchor_time=anchor_time.iloc[batch],
+                        columns=sample_columns_dict[src_table],
+                    )

+                    edge_type = (src_table, fkey, table)
+                    cols_dict[edge_type].append(index[_batch])
+                    num_sampled_edges_dict[edge_type][hop] = len(_batch)
+
+                    _batch = batch[_batch]
+                    num_nodes = sum(num_sampled_nodes_dict[src_table])
+                    if src_table in self.primary_key_dict:
+                        _pkey = _df[self.primary_key_dict[src_table]]
+                        mapper_dict[src_table].add(_pkey, _batch)
+                        row = mapper_dict[src_table].get(_pkey, _batch)
+
+                        # Only preserve unknown rows:
+                        mask = row >= num_nodes  # type: ignore
+                        mask[pd.Index(row).duplicated()] = False
+                        _df = _df.iloc[mask]
+                        _batch = _batch[mask]
+                    else:
+                        row = np.arange(num_nodes, num_nodes + len(_batch))
+
+                    rows_dict[edge_type].append(row)
+                    num_sampled_nodes_dict[src_table][hop + 1] += len(_batch)
+
+                    dfs_dict[src_table].append(_df)
+                    batches_dict[src_table].append(_batch)
+
+        # Post-Processing #####################################################
+
+        df_dict = {
+            table:
+            pd.concat(dfs, axis=0, ignore_index=True)
+            if len(dfs) > 1 else dfs[0]
+            for table, dfs in dfs_dict.items()
+        }
+
+        # Only store unique rows in `df` above a certain threshold:
+        inverse_dict: dict[str, np.ndarray] = {}
+        for table, df in df_dict.items():
+            if table not in self.primary_key_dict:
+                continue
+            unique, index, inverse = np.unique(
+                df_dict[table][self.primary_key_dict[table]],
+                return_index=True,
+                return_inverse=True,
+            )
+            if len(df) > 1.05 * len(unique):
+                df_dict[table] = df.iloc[index].reset_index(drop=True)
+                inverse_dict[table] = inverse
+
+        df_dict = {  # Post-filter column set:
+            table: df[list(columns_dict[table])]
+            for table, df in df_dict.items()
+        }
+        batch_dict = {
+            table: np.concatenate(batches) if len(batches) > 1 else batches[0]
+            for table, batches in batches_dict.items()
+        }
+        row_dict = {
+            edge_type: np.concatenate(rows)
+            for edge_type, rows in rows_dict.items()
+        }
+        col_dict = {
+            edge_type: np.concatenate(cols)
+            for edge_type, cols in cols_dict.items()
+        }
+
+        if len(num_sampled_edges_dict) == 0:  # Single table:
+            num_sampled_nodes_dict = {
+                key: value[:1]
+                for key, value in num_sampled_nodes_dict.items()
+            }

         return SamplerOutput(
             anchor_time=anchor_time.astype(int).to_numpy(),
-            df_dict={entity_table_name: df},
-            inverse_dict={},
-            batch_dict={entity_table_name: batch},
-            num_sampled_nodes_dict={entity_table_name: [len(batch)]},
-            row_dict={},
-            col_dict={},
-            num_sampled_edges_dict={},
+            df_dict=df_dict,
+            inverse_dict=inverse_dict,
+            batch_dict=batch_dict,
+            num_sampled_nodes_dict=num_sampled_nodes_dict,
+            row_dict=row_dict,
+            col_dict=col_dict,
+            num_sampled_edges_dict=num_sampled_edges_dict,
         )

     # Abstract Methods ########################################################
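
The sampling loop above relies on the new Mapper helper (kumoai/experimental/rfm/base/mapper.py, a file added in this release), whose implementation is not shown in this diff. Judging from its call sites, it assigns consecutive local node IDs to (primary key, example index) pairs and returns -1 for unseen pairs. A minimal sketch under those assumptions:

    import numpy as np
    import pandas as pd

    class Mapper:
        # Sketch only; the shipped implementation may differ.
        def __init__(self, num_examples: int) -> None:
            self.num_examples = num_examples
            self._id_dict: dict[tuple, int] = {}

        def add(self, pkey: pd.Series, batch: np.ndarray) -> None:
            # Register unseen (key, example) pairs under the next free node ID:
            for key, example in zip(pkey.tolist(), batch.tolist()):
                self._id_dict.setdefault((key, example), len(self._id_dict))

        def get(self, pkey: pd.Series, batch: np.ndarray) -> np.ndarray:
            # Unknown pairs map to -1, matching the dangling-key checks above:
            return np.array([
                self._id_dict.get((key, example), -1)
                for key, example in zip(pkey.tolist(), batch.tolist())
            ], dtype=np.int64)
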
@@ -78,7 +367,19 @@ class SQLSampler(Sampler):
     def _by_pkey(
         self,
         table_name: str,
-        pkey: pd.Series,
+        index: pd.Series,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        pass
+
+    @abstractmethod
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
         pass
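
Concrete backends (see the snow and sqlite sampler files in the list above) implement these two hooks. A minimal sketch of _by_pkey for a SQLite-style backend, assuming the subclass keeps a DB-API connection in self._connection; the class name and SQL shape are illustrative, not the package's actual implementation:

    import numpy as np
    import pandas as pd

    from kumoai.experimental.rfm.base.sql_sampler import SQLSampler

    class ExampleSQLiteSampler(SQLSampler):  # Hypothetical subclass.
        def _by_pkey(
            self,
            table_name: str,
            index: pd.Series,
            columns: set[str],
        ) -> tuple[pd.DataFrame, np.ndarray]:
            pkey_ref = self.table_column_ref_dict[table_name][
                self.primary_key_dict[table_name]]
            projs = ', '.join(self.table_column_proj_dict[table_name][col]
                              for col in columns)
            placeholders = ', '.join('?' for _ in range(len(index)))
            sql = (f'SELECT {pkey_ref} AS __pos_key__, {projs} '
                   f'FROM {self.source_name_dict[table_name]} '
                   f'WHERE {pkey_ref} IN ({placeholders})')
            df = pd.read_sql(sql, self._connection, params=list(index))
            # Map each fetched row back to its position in the input `index`:
            pos = pd.Series(np.arange(len(index)), index=list(index))
            batch = pos[df['__pos_key__']].to_numpy()
            return df.drop(columns='__pos_key__'), batch

The returned batch array holds, for each fetched row, its position in the input index, which is what the missing-entity check and the argsort-based reordering in _sample_subgraph expect.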