PyPI - kumoai - Versions diffs - 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl → 2.15.0.dev202601121731__cp313-cp313-macosx_11_0_arm64.whl - Mend

kumoai 2.14.0.dev202512211732__cp313-cp313-macosx_11_0_arm64.whl → 2.15.0.dev202601121731__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

kumoai/__init__.py +23 -26
kumoai/_version.py +1 -1
kumoai/client/client.py +6 -0
kumoai/client/jobs.py +26 -0
kumoai/connector/utils.py +21 -7
kumoai/experimental/rfm/__init__.py +24 -22
kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
kumoai/experimental/rfm/backend/local/sampler.py +0 -3
kumoai/experimental/rfm/backend/local/table.py +24 -25
kumoai/experimental/rfm/backend/snow/sampler.py +190 -71
kumoai/experimental/rfm/backend/snow/table.py +137 -64
kumoai/experimental/rfm/backend/sqlite/sampler.py +192 -87
kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
kumoai/experimental/rfm/base/__init__.py +6 -9
kumoai/experimental/rfm/base/column.py +95 -11
kumoai/experimental/rfm/base/expression.py +44 -0
kumoai/experimental/rfm/base/mapper.py +69 -0
kumoai/experimental/rfm/base/sampler.py +28 -18
kumoai/experimental/rfm/base/source.py +1 -1
kumoai/experimental/rfm/base/sql_sampler.py +320 -19
kumoai/experimental/rfm/base/table.py +256 -109
kumoai/experimental/rfm/base/utils.py +27 -0
kumoai/experimental/rfm/graph.py +115 -107
kumoai/experimental/rfm/infer/dtype.py +4 -1
kumoai/experimental/rfm/infer/multicategorical.py +1 -1
kumoai/experimental/rfm/infer/time_col.py +4 -2
kumoai/experimental/rfm/relbench.py +76 -0
kumoai/experimental/rfm/rfm.py +540 -306
kumoai/experimental/rfm/task_table.py +292 -0
kumoai/pquery/training_table.py +16 -2
kumoai/testing/snow.py +3 -3
kumoai/trainer/distilled_trainer.py +175 -0
kumoai/utils/display.py +87 -0
kumoai/utils/progress_logger.py +13 -1
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/METADATA +2 -2
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/RECORD +39 -34
kumoai/experimental/rfm/base/column_expression.py +0 -50
kumoai/experimental/rfm/base/sql_table.py +0 -229
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/WHEEL +0 -0
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/licenses/LICENSE +0 -0
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601121731.dist-info}/top_level.txt +0 -0

kumoai/experimental/rfm/base/__init__.py CHANGED Viewed

@@ -8,12 +8,9 @@ class DataBackend(StrEnum):
 from .source import SourceColumn, SourceForeignKey  # noqa: E402
-from .column import Column  # noqa: E402
-from .column_expression import ColumnExpressionSpec  # noqa: E402
-from .column_expression import ColumnExpressionType  # noqa: E402
-from .column_expression import ColumnExpression  # noqa: E402
+from .expression import Expression, LocalExpression  # noqa: E402
+from .column import ColumnSpec, ColumnSpecType, Column  # noqa: E402
 from .table import Table  # noqa: E402
-from .sql_table import SQLTable  # noqa: E402
 from .sampler import SamplerOutput, Sampler  # noqa: E402
 from .sql_sampler import SQLSampler  # noqa: E402
@@ -21,12 +18,12 @@ __all__ = [
     'DataBackend',
     'SourceColumn',
     'SourceForeignKey',
+    'Expression',
+    'LocalExpression',
+    'ColumnSpec',
+    'ColumnSpecType',
     'Column',
-    'ColumnExpressionSpec',
-    'ColumnExpressionType',
-    'ColumnExpression',
     'Table',
-    'SQLTable',
     'SamplerOutput',
     'Sampler',
     'SQLSampler',

kumoai/experimental/rfm/base/column.py CHANGED Viewed

@@ -1,15 +1,89 @@
+from __future__ import annotations
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Mapping, TypeAlias
 from kumoapi.typing import Dtype, Stype
+from typing_extensions import Self
+from kumoai.experimental.rfm.base import Expression
+from kumoai.mixin import CastMixin
+@dataclass(init=False)
+class ColumnSpec(CastMixin):
+    r"""A column specification for adding a column to a table.
+    A column specification can either refer to a physical column present in
+    the data source, or be defined logically via an expression.
+    Args:
+        name: The name of the column.
+        expr: A column expression to define logical columns. If ``None``, the
+            specification refers to a physical column of the data source.
+        dtype: The data type of the column.
+        stype: The semantic type of the column.
+    """
+    def __init__(
+        self,
+        name: str,
+        expr: Expression | Mapping[str, str] | str | None = None,
+        dtype: Dtype | str | None = None,
+        stype: Stype | str | None = None,
+    ) -> None:
+        self.name = name
+        self.expr = Expression.coerce(expr)
+        self.dtype = Dtype(dtype) if dtype is not None else None
+        # The semantic type must be built from `stype` itself, not `dtype`:
+        self.stype = Stype(stype) if stype is not None else None
+    @classmethod
+    def coerce(cls, spec: ColumnSpec | Mapping[str, Any] | str) -> Self:
+        r"""Coerces a column specification into a :class:`ColumnSpec`.
+        Args:
+            spec: An existing :class:`ColumnSpec` (returned as-is), a column
+                name, or a mapping of constructor keyword arguments.
+        Raises:
+            TypeError: If ``spec`` cannot be converted.
+        """
+        if isinstance(spec, cls):
+            return spec
+        if isinstance(spec, str):
+            return cls(name=spec)
+        msg = f"Unable to coerce 'ColumnSpec' from '{spec}'"
+        if isinstance(spec, Mapping):
+            try:
+                return cls(**spec)
+            except TypeError as e:
+                # Keep the original failure reachable via `__cause__`:
+                raise TypeError(msg) from e
+        raise TypeError(msg)
+    @property
+    def is_source(self) -> bool:
+        r"""Whether the column specification refers to a physical column
+        present in the data source.
+        """
+        return self.expr is None
+# Any input accepted by `ColumnSpec.coerce`: a ready-made specification, a
+# mapping of `ColumnSpec` constructor arguments, or a plain column name.
+ColumnSpecType: TypeAlias = ColumnSpec | Mapping[str, Any] | str
 @dataclass(init=False, repr=False, eq=False)
 class Column:
+    r"""Column-level metadata information.
+    A column can either refer to a physical column present in the data source,
+    or be defined logically via an expression.
+    Args:
+        name: The name of the column.
+        expr: A column expression to define logical columns.
+        dtype: The data type of the column.
+        stype: The semantic type of the column.
+    """
     stype: Stype
-    def __init__(self, name: str, stype: Stype, dtype: Dtype) -> None:
+    def __init__(
+        self,
+        name: str,
+        expr: Expression | None,
+        dtype: Dtype,
+        stype: Stype,
+    ) -> None:
         self._name = name
+        self._expr = expr
         self._dtype = Dtype(dtype)
         self._is_primary_key = False
@@ -20,19 +94,25 @@ class Column:
     @property
     def name(self) -> str:
+        r"""The name of the column."""
         return self._name
     @property
-    def dtype(self) -> Dtype:
-        return self._dtype
+    def expr(self) -> Expression | None:
+        r"""The expression of the column (if defined logically)."""
+        return self._expr
     @property
-    def is_physical(self) -> bool:
-        return True
+    def dtype(self) -> Dtype:
+        r"""The data type of the column."""
+        return self._dtype
     @property
-    def is_logical(self) -> bool:
-        return not self.is_physical
+    def is_source(self) -> bool:
+        r"""Whether the column refers to a physical column present in the data
+        source.
+        """
+        return self.expr is None
     def __setattr__(self, key: str, val: Any) -> None:
         if key == 'stype':
@@ -56,7 +136,7 @@ class Column:
         super().__setattr__(key, val)
     def __hash__(self) -> int:
-        return hash((self.name, self.stype, self.dtype))
+        return hash((self.name, self.expr, self.dtype, self.stype))
     def __eq__(self, other: Any) -> bool:
         if not isinstance(other, Column):
@@ -64,5 +144,9 @@ class Column:
         return hash(self) == hash(other)
     def __repr__(self) -> str:
-        return (f'{self.__class__.__name__}(name={self.name}, '
-                f'stype={self.stype}, dtype={self.dtype})')
+        parts = [f'name={self.name}']
+        if self.expr is not None:
+            parts.append(f'expr={self.expr}')
+        parts.append(f'dtype={self.dtype}')
+        parts.append(f'stype={self.stype}')
+        return f"{self.__class__.__name__}({', '.join(parts)})"

kumoai/experimental/rfm/base/expression.py ADDED Viewed

@@ -0,0 +1,44 @@
+from __future__ import annotations
+from abc import ABC
+from dataclasses import dataclass
+from typing import Mapping
+class Expression(ABC):
+    """Common base class of all expressions that define logical columns."""
+    @classmethod
+    def coerce(
+        cls,
+        spec: Expression | Mapping[str, str] | str | None,
+    ) -> Expression | None:
+        r"""Turns an expression specification into an :class:`Expression`.
+        ``None`` and existing expressions are passed through unchanged, a
+        string becomes a :class:`LocalExpression`, and a mapping is used as
+        keyword arguments for the first expression class that accepts it.
+        Raises:
+            TypeError: If ``spec`` cannot be converted.
+        """
+        if spec is None or isinstance(spec, Expression):
+            return spec
+        if isinstance(spec, str):
+            return LocalExpression(spec)
+        if isinstance(spec, Mapping):
+            candidates = (LocalExpression, )
+            for candidate in candidates:
+                try:
+                    return candidate(**spec)
+                except TypeError:
+                    # Keyword arguments do not fit; try the next candidate.
+                    continue
+        raise TypeError(f"Unable to coerce 'Expression' from '{spec}'")
+@dataclass(frozen=True, repr=False)
+class LocalExpression(Expression):
+    r"""An expression that derives a row-level logical attribute from the
+    physical columns of the data source within the same row.
+    Args:
+        value: The value of the expression.
+    """
+    value: str
+    def __repr__(self) -> str:
+        # The representation is the bare expression value.
+        return self.value

kumoai/experimental/rfm/base/mapper.py ADDED Viewed

@@ -0,0 +1,69 @@
+import numpy as np
+import pandas as pd
+class Mapper:
+    r"""Maps ``(pkey, batch)`` pairs to contiguous node IDs.
+    Node IDs follow the order in which distinct pairs were first added.
+    Args:
+        num_examples: The maximum number of examples to add/retrieve.
+    """
+    def __init__(self, num_examples: int):
+        self._num_examples = num_examples
+        # Categories of all primary keys seen so far; stays `None` unless
+        # string primary keys were added on first use:
+        self._pkey_dtype: pd.CategoricalDtype | None = None
+        # One array of combined `(pkey, batch)` indices per `add()` call:
+        self._indices: list[np.ndarray] = []
+        # Categories over all combined indices, built lazily in `get()`:
+        self._index_dtype: pd.CategoricalDtype | None = None
+    def _encode(self, pkey: pd.Series, batch: np.ndarray) -> np.ndarray:
+        r"""Combines primary keys and batch vector into a single index."""
+        if self._pkey_dtype is None:
+            codes = pkey.to_numpy()
+        else:
+            codes = pd.Categorical(pkey, dtype=self._pkey_dtype).codes
+            codes = codes.astype('int64')
+        return self._num_examples * codes + batch
+    def add(self, pkey: pd.Series, batch: np.ndarray) -> None:
+        r"""Registers a set of ``(pkey, batch)`` pairs in the mapper.
+        Args:
+            pkey: The primary keys.
+            batch: The batch vector.
+        """
+        known = self._pkey_dtype
+        if known is not None:
+            # Append unseen keys behind the existing categories so that
+            # previously assigned codes are kept:
+            merged = np.concatenate([
+                known.categories.values,
+                pkey,
+            ], axis=0)
+            self._pkey_dtype = pd.CategoricalDtype(pd.unique(merged))
+        elif pd.api.types.is_string_dtype(pkey):
+            self._pkey_dtype = pd.CategoricalDtype(pd.unique(pkey))
+        self._indices.append(self._encode(pkey, batch))
+        # Invalidate the lazily built lookup:
+        self._index_dtype = None
+    def get(self, pkey: pd.Series, batch: np.ndarray) -> np.ndarray:
+        r"""Looks up the node IDs of a set of ``(pkey, batch)`` pairs.
+        Pairs that have not been registered are mapped to ``-1``.
+        Args:
+            pkey: The primary keys.
+            batch: The batch vector.
+        """
+        if not self._indices:
+            return np.full(len(pkey), -1, dtype=np.int64)
+        if self._index_dtype is None:  # Build the lookup on first use:
+            seen = pd.unique(np.concatenate(self._indices))
+            self._index_dtype = pd.CategoricalDtype(seen)
+        query = self._encode(pkey, batch)
+        node_ids = pd.Categorical(query, dtype=self._index_dtype).codes
+        return node_ids.astype('int64')

kumoai/experimental/rfm/base/sampler.py CHANGED Viewed

@@ -13,7 +13,6 @@ from kumoapi.pquery.AST import Aggregation, ASTNode
 from kumoapi.rfm.context import EdgeLayout, Link, Subgraph, Table
 from kumoapi.typing import Stype
-from kumoai.experimental.rfm.base import SourceColumn
 from kumoai.utils import ProgressLogger
 if TYPE_CHECKING:
@@ -53,12 +52,24 @@ class Sampler(ABC):
         graph: 'Graph',
         verbose: bool | ProgressLogger = True,
     ) -> None:
         self._edge_types: list[tuple[str, str, str]] = []
         for edge in graph.edges:
             edge_type = (edge.src_table, edge.fkey, edge.dst_table)
             self._edge_types.append(edge_type)
             self._edge_types.append(Subgraph.rev_edge_type(edge_type))
+        # Source Table -> [(Foreign Key, Destination Table)]
+        self._foreign_key_dict: dict[str, list[tuple[str, str]]] = {}
+        # Destination Table -> [(Source Table, Foreign Key)]
+        self._rev_foreign_key_dict: dict[str, list[tuple[str, str]]] = {}
+        for table in graph.tables.values():
+            self._foreign_key_dict[table.name] = []
+            self._rev_foreign_key_dict[table.name] = []
+        for src_table, fkey, dst_table in graph.edges:
+            self._foreign_key_dict[src_table].append((fkey, dst_table))
+            self._rev_foreign_key_dict[dst_table].append((src_table, fkey))
         self._primary_key_dict: dict[str, str] = {
             table.name: table._primary_key
             for table in graph.tables.values()
@@ -88,10 +99,6 @@ class Sampler(ABC):
                     continue
                 self._table_stype_dict[table.name][column.name] = column.stype
-        self._source_table_dict: dict[str, dict[str, SourceColumn]] = {}
-        for table in graph.tables.values():
-            self._source_table_dict[table.name] = table._source_column_dict
         self._min_time_dict: dict[str, pd.Timestamp] = {}
         self._max_time_dict: dict[str, pd.Timestamp] = {}
@@ -102,6 +109,16 @@ class Sampler(ABC):
         r"""All available edge types in the graph."""
         return self._edge_types
+    @property
+    def foreign_key_dict(self) -> dict[str, list[tuple[str, str]]]:
+        r"""The foreign keys for all tables in the graph."""
+        return self._foreign_key_dict
+    @property
+    def rev_foreign_key_dict(self) -> dict[str, list[tuple[str, str]]]:
+        r"""The foreign key back references for all tables in the graph."""
+        return self._rev_foreign_key_dict
     @property
     def primary_key_dict(self) -> dict[str, str]:
         r"""All available primary keys in the graph."""
@@ -119,16 +136,11 @@ class Sampler(ABC):
     @property
     def table_stype_dict(self) -> dict[str, dict[str, Stype]]:
-        r"""The registered semantic types for all columns in all tables in
-        the graph.
+        r"""The registered semantic types for all feature columns in all tables
+        in the graph.
         """
         return self._table_stype_dict
-    @property
-    def source_table_dict(self) -> dict[str, dict[str, SourceColumn]]:
-        r"""Source column information for all tables in the graph."""
-        return self._source_table_dict
     def get_min_time(
         self,
         table_names: list[str] | None = None,
@@ -189,7 +201,7 @@ class Sampler(ABC):
             exclude_cols_dict: The columns to exclude from the subgraph.
         """
         # Exclude all columns that leak target information:
-        table_stype_dict: dict[str, dict[str, Stype]] = self._table_stype_dict
+        table_stype_dict: dict[str, dict[str, Stype]] = self.table_stype_dict
         if exclude_cols_dict is not None:
             table_stype_dict = copy.deepcopy(table_stype_dict)
             for table_name, exclude_cols in exclude_cols_dict.items():
@@ -237,11 +249,8 @@ class Sampler(ABC):
                 # Set end time to NaT for all values greater than anchor time:
                 assert table_name not in out.inverse_dict
                 ser = df[end_time_column]
-                if ser.dtype != 'datetime64[ns]':
-                    ser = ser.astype('datetime64[ns]')
                 mask = ser.astype(int).to_numpy() > out.anchor_time[batch]
-                ser.iloc[mask] = pd.NaT
-                df[end_time_column] = ser
+                df.loc[mask, end_time_column] = pd.NaT
             stype_dict = table_stype_dict[table_name]
             for column_name, stype in stype_dict.items():
@@ -286,7 +295,8 @@ class Sampler(ABC):
             # Store in compressed representation if more efficient:
             num_cols = subgraph.table_dict[edge_type[2]].num_rows
-            if col is not None and len(col) > num_cols + 1:
+            if (col is not None and len(col) > num_cols + 1
+                    and ((col[1:] - col[:-1]) >= 0).all()):
                 layout = EdgeLayout.CSC
                 colcount = np.bincount(col, minlength=num_cols)
                 col = np.empty(num_cols + 1, dtype=col.dtype)

kumoai/experimental/rfm/base/source.py CHANGED Viewed

@@ -6,7 +6,7 @@ from kumoapi.typing import Dtype
 @dataclass
 class SourceColumn:
     name: str
-    dtype: Dtype
+    dtype: Dtype | None
     is_primary_key: bool
     is_unique_key: bool
     is_nullable: bool