kumoai 2.13.0.dev202511211730__py3-none-any.whl → 2.15.0.dev202601131732__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. kumoai/__init__.py +35 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +26 -0
  5. kumoai/client/pquery.py +6 -2
  6. kumoai/connector/utils.py +44 -9
  7. kumoai/experimental/rfm/__init__.py +70 -68
  8. kumoai/experimental/rfm/authenticate.py +3 -4
  9. kumoai/experimental/rfm/backend/__init__.py +0 -0
  10. kumoai/experimental/rfm/backend/local/__init__.py +42 -0
  11. kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +65 -127
  12. kumoai/experimental/rfm/backend/local/sampler.py +312 -0
  13. kumoai/experimental/rfm/backend/local/table.py +113 -0
  14. kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
  15. kumoai/experimental/rfm/backend/snow/sampler.py +407 -0
  16. kumoai/experimental/rfm/backend/snow/table.py +245 -0
  17. kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
  18. kumoai/experimental/rfm/backend/sqlite/sampler.py +454 -0
  19. kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
  20. kumoai/experimental/rfm/base/__init__.py +30 -0
  21. kumoai/experimental/rfm/base/column.py +152 -0
  22. kumoai/experimental/rfm/base/expression.py +44 -0
  23. kumoai/experimental/rfm/base/mapper.py +69 -0
  24. kumoai/experimental/rfm/base/sampler.py +783 -0
  25. kumoai/experimental/rfm/base/source.py +19 -0
  26. kumoai/experimental/rfm/base/sql_sampler.py +385 -0
  27. kumoai/experimental/rfm/base/table.py +722 -0
  28. kumoai/experimental/rfm/base/utils.py +36 -0
  29. kumoai/experimental/rfm/{local_graph.py → graph.py} +581 -154
  30. kumoai/experimental/rfm/infer/__init__.py +8 -0
  31. kumoai/experimental/rfm/infer/dtype.py +84 -0
  32. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  33. kumoai/experimental/rfm/infer/pkey.py +128 -0
  34. kumoai/experimental/rfm/infer/stype.py +35 -0
  35. kumoai/experimental/rfm/infer/time_col.py +63 -0
  36. kumoai/experimental/rfm/pquery/executor.py +27 -27
  37. kumoai/experimental/rfm/pquery/pandas_executor.py +30 -32
  38. kumoai/experimental/rfm/relbench.py +76 -0
  39. kumoai/experimental/rfm/rfm.py +783 -481
  40. kumoai/experimental/rfm/sagemaker.py +15 -7
  41. kumoai/experimental/rfm/task_table.py +292 -0
  42. kumoai/pquery/predictive_query.py +10 -6
  43. kumoai/pquery/training_table.py +16 -2
  44. kumoai/testing/decorators.py +1 -1
  45. kumoai/testing/snow.py +50 -0
  46. kumoai/trainer/distilled_trainer.py +175 -0
  47. kumoai/utils/__init__.py +3 -2
  48. kumoai/utils/display.py +87 -0
  49. kumoai/utils/progress_logger.py +192 -13
  50. kumoai/utils/sql.py +3 -0
  51. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.15.0.dev202601131732.dist-info}/METADATA +10 -8
  52. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.15.0.dev202601131732.dist-info}/RECORD +55 -30
  53. kumoai/experimental/rfm/local_graph_sampler.py +0 -182
  54. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  55. kumoai/experimental/rfm/local_table.py +0 -545
  56. kumoai/experimental/rfm/utils.py +0 -344
  57. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.15.0.dev202601131732.dist-info}/WHEEL +0 -0
  58. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.15.0.dev202601131732.dist-info}/licenses/LICENSE +0 -0
  59. {kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.15.0.dev202601131732.dist-info}/top_level.txt +0 -0
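The headline change is visible from the file list alone: the local-first API drops its `Local*` prefixes (`local_graph.py` → `graph.py`), and sampling is split into pluggable data backends (`backend/local`, `backend/sqlite`, `backend/snow`), each shipping its own `sampler.py` and `table.py`. A minimal orientation sketch of the new import surface, using only names that appear in this diff:

    # New public surface (renamed from LocalGraph/local_* modules):
    from kumoai.experimental.rfm import Graph, KumoRFM, TaskTable

    # Backend-specific samplers now live in dedicated packages:
    from kumoai.experimental.rfm.backend.local import LocalSampler
    from kumoai.experimental.rfm.backend.sqlite import SQLiteSampler
    from kumoai.experimental.rfm.backend.snow import SnowSampler

The detailed diff of `kumoai/experimental/rfm/rfm.py` follows.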
@@ -1,26 +1,23 @@
  import json
+ import math
  import time
  import warnings
  from collections import defaultdict
- from collections.abc import Generator
+ from collections.abc import Generator, Iterator
  from contextlib import contextmanager
  from dataclasses import dataclass, replace
- from typing import (
-     Any,
-     Dict,
-     Iterator,
-     List,
-     Literal,
-     Optional,
-     Tuple,
-     Union,
-     overload,
- )
+ from typing import Any, Literal, overload

- import numpy as np
  import pandas as pd
  from kumoapi.model_plan import RunMode
  from kumoapi.pquery import QueryType, ValidatedPredictiveQuery
+ from kumoapi.pquery.AST import (
+     Aggregation,
+     Column,
+     Condition,
+     Join,
+     LogicalOperation,
+ )
  from kumoapi.rfm import Context
  from kumoapi.rfm import Explanation as ExplanationConfig
  from kumoapi.rfm import (
@@ -29,35 +26,38 @@ from kumoapi.rfm import (
      RFMPredictRequest,
  )
  from kumoapi.task import TaskType
+ from kumoapi.typing import AggregationType, Stype
+ from rich.console import Console
+ from rich.markdown import Markdown

+ from kumoai import in_notebook
  from kumoai.client.rfm import RFMAPI
  from kumoai.exceptions import HTTPException
- from kumoai.experimental.rfm import LocalGraph
- from kumoai.experimental.rfm.local_graph_sampler import LocalGraphSampler
- from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
- from kumoai.experimental.rfm.local_pquery_driver import (
-     LocalPQueryDriver,
-     date_offset_to_seconds,
- )
+ from kumoai.experimental.rfm import Graph, TaskTable
+ from kumoai.experimental.rfm.base import DataBackend, Sampler
  from kumoai.mixin import CastMixin
- from kumoai.utils import InteractiveProgressLogger, ProgressLogger
+ from kumoai.utils import ProgressLogger, display

  _RANDOM_SEED = 42

  _MAX_PRED_SIZE: dict[TaskType, int] = defaultdict(lambda: 1_000)
  _MAX_PRED_SIZE[TaskType.TEMPORAL_LINK_PREDICTION] = 200

+ _MAX_TEST_SIZE: dict[TaskType, int] = defaultdict(lambda: 2_000)
+ _MAX_TEST_SIZE[TaskType.TEMPORAL_LINK_PREDICTION] = 400
+
  _MAX_CONTEXT_SIZE = {
      RunMode.DEBUG: 100,
      RunMode.FAST: 1_000,
      RunMode.NORMAL: 5_000,
      RunMode.BEST: 10_000,
  }
- _MAX_TEST_SIZE = {  # Share test set size across run modes for fair comparison:
-     RunMode.DEBUG: 100,
-     RunMode.FAST: 2_000,
-     RunMode.NORMAL: 2_000,
-     RunMode.BEST: 2_000,
+
+ _DEFAULT_NUM_NEIGHBORS = {
+     RunMode.DEBUG: [16, 16, 4, 4, 1, 1],
+     RunMode.FAST: [32, 32, 8, 8, 4, 4],
+     RunMode.NORMAL: [64, 64, 8, 8, 4, 4],
+     RunMode.BEST: [64, 64, 8, 8, 4, 4],
  }

  _MAX_SIZE = 30 * 1024 * 1024
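A note on the constants above: `_MAX_TEST_SIZE` moves from a per-run-mode dict to a per-task-type `defaultdict`, and the old run-mode branching for neighbor fan-outs is collapsed into the `_DEFAULT_NUM_NEIGHBORS` table, which the new code simply slices to the requested hop count (`_DEFAULT_NUM_NEIGHBORS[key][:num_hops]`, see `predict_task` further down). A small sketch of that lookup, assuming `RunMode` from `kumoapi`:

    from kumoapi.model_plan import RunMode

    _DEFAULT_NUM_NEIGHBORS = {
        RunMode.DEBUG: [16, 16, 4, 4, 1, 1],
        RunMode.FAST: [32, 32, 8, 8, 4, 4],
        RunMode.NORMAL: [64, 64, 8, 8, 4, 4],
        RunMode.BEST: [64, 64, 8, 8, 4, 4],
    }

    # Two-hop sampling under RunMode.FAST keeps 32 neighbors per hop:
    num_hops = 2
    assert _DEFAULT_NUM_NEIGHBORS[RunMode.FAST][:num_hops] == [32, 32]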
@@ -95,24 +95,36 @@ class Explanation:
      def __getitem__(self, index: Literal[1]) -> str:
          pass

-     def __getitem__(self, index: int) -> Union[pd.DataFrame, str]:
+     def __getitem__(self, index: int) -> pd.DataFrame | str:
          if index == 0:
              return self.prediction
          if index == 1:
              return self.summary
          raise IndexError("Index out of range")

-     def __iter__(self) -> Iterator[Union[pd.DataFrame, str]]:
+     def __iter__(self) -> Iterator[pd.DataFrame | str]:
          return iter((self.prediction, self.summary))

      def __repr__(self) -> str:
          return str((self.prediction, self.summary))

-     def _ipython_display_(self) -> None:
-         from IPython.display import Markdown, display
+     def __str__(self) -> str:
+         console = Console(soft_wrap=True)
+         with console.capture() as cap:
+             console.print(display.to_rich_table(self.prediction))
+             console.print(Markdown(self.summary))
+         return cap.get()[:-1]
+
+     def print(self) -> None:
+         r"""Prints the explanation."""
+         if in_notebook():
+             display.dataframe(self.prediction)
+             display.message(self.summary)
+         else:
+             print(self)

-         display(self.prediction)
-         display(Markdown(self.summary))
+     def _ipython_display_(self) -> None:
+         self.print()


  class KumoRFM:
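With the hunk above, `Explanation` renders outside notebooks too: `__str__` captures a rich table plus the Markdown summary via `rich`, and `print()` switches on `in_notebook()`, while tuple-style access is unchanged. A short usage sketch; `model` and `query` are placeholders:

    exp = model.predict(query, indices=[42], explain=True)

    prediction, summary = exp   # tuple unpacking via __iter__
    df = exp[0]                 # same as exp.prediction
    exp.print()                 # rich output in both terminals and notebooks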
@@ -123,17 +135,17 @@ class KumoRFM:
      :class:`KumoRFM` is a foundation model to generate predictions for any
      relational dataset without training.
      The model is pre-trained and the class provides an interface to query the
-     model from a :class:`LocalGraph` object.
+     model from a :class:`Graph` object.

      .. code-block:: python

-         from kumoai.experimental.rfm import LocalGraph, KumoRFM
+         from kumoai.experimental.rfm import Graph, KumoRFM

          df_users = pd.DataFrame(...)
          df_items = pd.DataFrame(...)
          df_orders = pd.DataFrame(...)

-         graph = LocalGraph.from_data({
+         graph = Graph.from_data({
              'users': df_users,
              'items': df_items,
              'orders': df_orders,
@@ -150,40 +162,78 @@ class KumoRFM:
      Args:
          graph: The graph.
-         preprocess: Whether to pre-process the data in advance during graph
-             materialization.
-             This is a runtime trade-off between graph materialization and model
-             processing speed.
-             It can be benefical to preprocess your data once and then run many
-             queries on top to achieve maximum model speed.
-             However, if activiated, graph materialization can take potentially
-             much longer, especially on graphs with many large text columns.
-             Best to tune this option manually.
          verbose: Whether to print verbose output.
+         optimize: If set to ``True``, will optimize the underlying data backend
+             for optimal querying. For example, for transactional database
+             backends, will create any missing indices. Requires write-access to
+             the data backend.
      """
      def __init__(
          self,
-         graph: LocalGraph,
-         preprocess: bool = False,
-         verbose: Union[bool, ProgressLogger] = True,
+         graph: Graph,
+         verbose: bool | ProgressLogger = True,
+         optimize: bool = False,
      ) -> None:
          graph = graph.validate()
          self._graph_def = graph._to_api_graph_definition()
-         self._graph_store = LocalGraphStore(graph, preprocess, verbose)
-         self._graph_sampler = LocalGraphSampler(self._graph_store)

-         self._batch_size: Optional[int | Literal['max']] = None
-         self.num_retries: int = 0
+         if graph.backend == DataBackend.LOCAL:
+             from kumoai.experimental.rfm.backend.local import LocalSampler
+             self._sampler: Sampler = LocalSampler(graph, verbose)
+         elif graph.backend == DataBackend.SQLITE:
+             from kumoai.experimental.rfm.backend.sqlite import SQLiteSampler
+             self._sampler = SQLiteSampler(graph, verbose, optimize)
+         elif graph.backend == DataBackend.SNOWFLAKE:
+             from kumoai.experimental.rfm.backend.snow import SnowSampler
+             self._sampler = SnowSampler(graph, verbose)
+         else:
+             raise NotImplementedError
+
+         self._client: RFMAPI | None = None
+
+         self._batch_size: int | Literal['max'] | None = None
+         self._num_retries: int = 0
+
+     @property
+     def _api_client(self) -> RFMAPI:
+         if self._client is not None:
+             return self._client
+
          from kumoai.experimental.rfm import global_state
-         self._api_client = RFMAPI(global_state.client)
+         self._client = RFMAPI(global_state.client)
+         return self._client

      def __repr__(self) -> str:
          return f'{self.__class__.__name__}()'

+     @contextmanager
+     def retry(
+         self,
+         num_retries: int = 1,
+     ) -> Generator[None, None, None]:
+         """Context manager to retry failed queries due to unexpected server
+         issues.
+
+         .. code-block:: python
+
+             with model.retry(num_retries=1):
+                 df = model.predict(query, indices=...)
+
+         Args:
+             num_retries: The maximum number of retries.
+         """
+         if num_retries < 0:
+             raise ValueError(f"'num_retries' must be greater than or equal to "
+                              f"zero (got {num_retries})")
+
+         self._num_retries = num_retries
+         yield
+         self._num_retries = 0
+
      @contextmanager
      def batch_mode(
          self,
-         batch_size: Union[int, Literal['max']] = 'max',
+         batch_size: int | Literal['max'] = 'max',
          num_retries: int = 1,
      ) -> Generator[None, None, None]:
          """Context manager to predict in batches.
@@ -203,31 +253,26 @@ class KumoRFM:
              raise ValueError(f"'batch_size' must be greater than zero "
                               f"(got {batch_size})")

-         if num_retries < 0:
-             raise ValueError(f"'num_retries' must be greater than or equal to "
-                              f"zero (got {num_retries})")
-
          self._batch_size = batch_size
-         self.num_retries = num_retries
-         yield
+         with self.retry(self._num_retries or num_retries):
+             yield
          self._batch_size = None
-         self.num_retries = 0

      @overload
      def predict(
          self,
          query: str,
-         indices: Union[List[str], List[float], List[int], None] = None,
+         indices: list[str] | list[float] | list[int] | None = None,
          *,
          explain: Literal[False] = False,
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
-         context_anchor_time: Union[pd.Timestamp, None] = None,
-         run_mode: Union[RunMode, str] = RunMode.FAST,
-         num_neighbors: Optional[List[int]] = None,
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         context_anchor_time: pd.Timestamp | None = None,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
          num_hops: int = 2,
-         max_pq_iterations: int = 20,
-         random_seed: Optional[int] = _RANDOM_SEED,
-         verbose: Union[bool, ProgressLogger] = True,
+         max_pq_iterations: int = 10,
+         random_seed: int | None = _RANDOM_SEED,
+         verbose: bool | ProgressLogger = True,
          use_prediction_time: bool = False,
      ) -> pd.DataFrame:
          pass
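The `@overload` stubs (the `explain=False` one above, plus the `explain=True` and catch-all stubs in the next hunk) encode the return-type contract for type checkers: a literal `explain=False` yields a `pd.DataFrame`, a truthy `explain` yields an `Explanation`, and the new catch-all stub covers `explain` values only known as `bool` statically. In practice:

    df = model.predict(query, indices=[1, 2, 3])            # -> pd.DataFrame
    exp = model.predict(query, indices=[1], explain=True)   # -> Explanation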
@@ -236,37 +281,56 @@ class KumoRFM:
      def predict(
          self,
          query: str,
-         indices: Union[List[str], List[float], List[int], None] = None,
+         indices: list[str] | list[float] | list[int] | None = None,
          *,
-         explain: Union[Literal[True], ExplainConfig, Dict[str, Any]],
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
-         context_anchor_time: Union[pd.Timestamp, None] = None,
-         run_mode: Union[RunMode, str] = RunMode.FAST,
-         num_neighbors: Optional[List[int]] = None,
+         explain: Literal[True] | ExplainConfig | dict[str, Any],
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         context_anchor_time: pd.Timestamp | None = None,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
          num_hops: int = 2,
-         max_pq_iterations: int = 20,
-         random_seed: Optional[int] = _RANDOM_SEED,
-         verbose: Union[bool, ProgressLogger] = True,
+         max_pq_iterations: int = 10,
+         random_seed: int | None = _RANDOM_SEED,
+         verbose: bool | ProgressLogger = True,
          use_prediction_time: bool = False,
      ) -> Explanation:
          pass

+     @overload
+     def predict(
+         self,
+         query: str,
+         indices: list[str] | list[float] | list[int] | None = None,
+         *,
+         explain: bool | ExplainConfig | dict[str, Any] = False,
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         context_anchor_time: pd.Timestamp | None = None,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
+         num_hops: int = 2,
+         max_pq_iterations: int = 10,
+         random_seed: int | None = _RANDOM_SEED,
+         verbose: bool | ProgressLogger = True,
+         use_prediction_time: bool = False,
+     ) -> pd.DataFrame | Explanation:
+         pass
+
      def predict(
          self,
          query: str,
-         indices: Union[List[str], List[float], List[int], None] = None,
+         indices: list[str] | list[float] | list[int] | None = None,
          *,
-         explain: Union[bool, ExplainConfig, Dict[str, Any]] = False,
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
-         context_anchor_time: Union[pd.Timestamp, None] = None,
-         run_mode: Union[RunMode, str] = RunMode.FAST,
-         num_neighbors: Optional[List[int]] = None,
+         explain: bool | ExplainConfig | dict[str, Any] = False,
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         context_anchor_time: pd.Timestamp | None = None,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
          num_hops: int = 2,
-         max_pq_iterations: int = 20,
-         random_seed: Optional[int] = _RANDOM_SEED,
-         verbose: Union[bool, ProgressLogger] = True,
+         max_pq_iterations: int = 10,
+         random_seed: int | None = _RANDOM_SEED,
+         verbose: bool | ProgressLogger = True,
          use_prediction_time: bool = False,
-     ) -> Union[pd.DataFrame, Explanation]:
+     ) -> pd.DataFrame | Explanation:
          """Returns predictions for a predictive query.

          Args:
@@ -274,8 +338,7 @@ class KumoRFM:
              indices: The entity primary keys to predict for. Will override the
                  indices given as part of the predictive query. Predictions will
                  be generated for all indices, independent of whether they
-                 fulfill entity filter constraints. To pre-filter entities, use
-                 :meth:`~KumoRFM.is_valid_entity`.
+                 fulfill entity filter constraints.
              explain: Configuration for explainability.
                  If set to ``True``, will additionally explain the prediction.
                  Passing in an :class:`ExplainConfig` instance provides control
@@ -308,18 +371,152 @@ class KumoRFM:
              If ``explain`` is provided, returns an :class:`Explanation` object
              containing the prediction, summary, and details.
          """
-         explain_config: Optional[ExplainConfig] = None
-         if explain is True:
-             explain_config = ExplainConfig()
-         elif explain is not False:
-             explain_config = ExplainConfig._cast(explain)
-
          query_def = self._parse_query(query)
-         query_str = query_def.to_string()

+         if indices is None:
+             if query_def.rfm_entity_ids is None:
+                 raise ValueError("Cannot find entities to predict for. Please "
+                                  "pass them via `predict(query, indices=...)`")
+             indices = query_def.get_rfm_entity_id_list()
+         query_def = replace(
+             query_def,
+             for_each='FOR EACH',
+             rfm_entity_ids=None,
+         )
+
+         if not isinstance(verbose, ProgressLogger):
+             query_repr = query_def.to_string(rich=True, exclude_predict=True)
+             if explain is not False:
+                 msg = f'[bold]EXPLAIN[/bold] {query_repr}'
+             else:
+                 msg = f'[bold]PREDICT[/bold] {query_repr}'
+             verbose = ProgressLogger.default(msg=msg, verbose=verbose)
+
+         with verbose as logger:
+             task_table = self._get_task_table(
+                 query=query_def,
+                 indices=indices,
+                 anchor_time=anchor_time,
+                 context_anchor_time=context_anchor_time,
+                 run_mode=run_mode,
+                 max_pq_iterations=max_pq_iterations,
+                 random_seed=random_seed,
+                 logger=logger,
+             )
+             task_table._query = query_def.to_string()
+
+             return self.predict_task(
+                 task_table,
+                 explain=explain,
+                 run_mode=run_mode,
+                 num_neighbors=num_neighbors,
+                 num_hops=num_hops,
+                 verbose=verbose,
+                 exclude_cols_dict=query_def.get_exclude_cols_dict(),
+                 use_prediction_time=use_prediction_time,
+                 top_k=query_def.top_k,
+             )
+
+     @overload
+     def predict_task(
+         self,
+         task: TaskTable,
+         *,
+         explain: Literal[False] = False,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
+         num_hops: int = 2,
+         verbose: bool | ProgressLogger = True,
+         exclude_cols_dict: dict[str, list[str]] | None = None,
+         use_prediction_time: bool = False,
+         top_k: int | None = None,
+     ) -> pd.DataFrame:
+         pass
+
+     @overload
+     def predict_task(
+         self,
+         task: TaskTable,
+         *,
+         explain: Literal[True] | ExplainConfig | dict[str, Any],
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
+         num_hops: int = 2,
+         verbose: bool | ProgressLogger = True,
+         exclude_cols_dict: dict[str, list[str]] | None = None,
+         use_prediction_time: bool = False,
+         top_k: int | None = None,
+     ) -> Explanation:
+         pass
+
+     @overload
+     def predict_task(
+         self,
+         task: TaskTable,
+         *,
+         explain: bool | ExplainConfig | dict[str, Any] = False,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
+         num_hops: int = 2,
+         verbose: bool | ProgressLogger = True,
+         exclude_cols_dict: dict[str, list[str]] | None = None,
+         use_prediction_time: bool = False,
+         top_k: int | None = None,
+     ) -> pd.DataFrame | Explanation:
+         pass
+
+     def predict_task(
+         self,
+         task: TaskTable,
+         *,
+         explain: bool | ExplainConfig | dict[str, Any] = False,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
+         num_hops: int = 2,
+         verbose: bool | ProgressLogger = True,
+         exclude_cols_dict: dict[str, list[str]] | None = None,
+         use_prediction_time: bool = False,
+         top_k: int | None = None,
+     ) -> pd.DataFrame | Explanation:
+         """Returns predictions for a custom task specification.
+
+         Args:
+             task: The custom :class:`TaskTable`.
+             explain: Configuration for explainability.
+                 If set to ``True``, will additionally explain the prediction.
+                 Passing in an :class:`ExplainConfig` instance provides control
+                 over which parts of explanation are generated.
+                 Explainability is currently only supported for single entity
+                 predictions with ``run_mode="FAST"``.
+             run_mode: The :class:`RunMode` for the query.
+             num_neighbors: The number of neighbors to sample for each hop.
+                 If specified, the ``num_hops`` option will be ignored.
+             num_hops: The number of hops to sample when generating the context.
+             verbose: Whether to print verbose output.
+             exclude_cols_dict: Any column in any table to exclude from the
+                 model input.
+             use_prediction_time: Whether to use the anchor timestamp as an
+                 additional feature during prediction. This is typically
+                 beneficial for time series forecasting tasks.
+             top_k: The number of predictions to return per entity.
+
+         Returns:
+             The predictions as a :class:`pandas.DataFrame`.
+             If ``explain`` is provided, returns an :class:`Explanation` object
+             containing the prediction, summary, and details.
+         """
          if num_hops != 2 and num_neighbors is not None:
              warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
                            f"custom 'num_hops={num_hops}' option")
+         if num_neighbors is None:
+             key = RunMode.FAST if task.task_type.is_link_pred else run_mode
+             num_neighbors = _DEFAULT_NUM_NEIGHBORS[key][:num_hops]
+
+         explain_config: ExplainConfig | None = None
+         if explain is True:
+             explain_config = ExplainConfig()
+         elif explain is not False:
+             explain_config = ExplainConfig._cast(explain)

          if explain_config is not None and run_mode in {
              RunMode.NORMAL, RunMode.BEST
@@ -328,83 +525,82 @@ class KumoRFM:
                            f"run mode 'FAST' (got '{run_mode}'). Provided run "
                            f"mode has been reset. Please lower the run mode to "
                            f"suppress this warning.")
+             run_mode = RunMode.FAST

-         if indices is None:
-             if query_def.rfm_entity_ids is None:
-                 raise ValueError("Cannot find entities to predict for. Please "
-                                  "pass them via `predict(query, indices=...)`")
-             indices = query_def.get_rfm_entity_id_list()
-         else:
-             query_def = replace(query_def, rfm_entity_ids=None)
-
-         if len(indices) == 0:
-             raise ValueError("At least one entity is required")
-
-         if explain_config is not None and len(indices) > 1:
-             raise ValueError(
-                 f"Cannot explain predictions for more than a single entity "
-                 f"(got {len(indices)})")
-
-         query_repr = query_def.to_string(rich=True, exclude_predict=True)
-         if explain_config is not None:
-             msg = f'[bold]EXPLAIN[/bold] {query_repr}'
-         else:
-             msg = f'[bold]PREDICT[/bold] {query_repr}'
+         if explain_config is not None and task.num_prediction_examples > 1:
+             raise ValueError(f"Cannot explain predictions for more than a "
+                              f"single entity "
+                              f"(got {task.num_prediction_examples:,})")

          if not isinstance(verbose, ProgressLogger):
-             verbose = InteractiveProgressLogger(msg, verbose=verbose)
-
-         with verbose as logger:
-
-             batch_size: Optional[int] = None
-             if self._batch_size == 'max':
-                 task_type = LocalPQueryDriver.get_task_type(
-                     query_def,
-                     edge_types=self._graph_store.edge_types,
-                 )
-                 batch_size = _MAX_PRED_SIZE[task_type]
+             if task.task_type == TaskType.BINARY_CLASSIFICATION:
+                 task_type_repr = 'binary classification'
+             elif task.task_type == TaskType.MULTICLASS_CLASSIFICATION:
+                 task_type_repr = 'multi-class classification'
+             elif task.task_type == TaskType.REGRESSION:
+                 task_type_repr = 'regression'
+             elif task.task_type == TaskType.TEMPORAL_LINK_PREDICTION:
+                 task_type_repr = 'link prediction'
              else:
-                 batch_size = self._batch_size
+                 task_type_repr = str(task.task_type)

-             if batch_size is not None:
-                 offsets = range(0, len(indices), batch_size)
-                 batches = [indices[step:step + batch_size] for step in offsets]
+             if explain_config is not None:
+                 msg = f"Explaining {task_type_repr} task"
              else:
-                 batches = [indices]
+                 msg = f"Predicting {task_type_repr} task"
+             verbose = ProgressLogger.default(msg=msg, verbose=verbose)

-             if len(batches) > 1:
-                 logger.log(f"Splitting {len(indices):,} entities into "
-                            f"{len(batches):,} batches of size {batch_size:,}")
+         with verbose as logger:
+             if task.num_context_examples > _MAX_CONTEXT_SIZE[run_mode]:
+                 logger.log(f"Sub-sampled {_MAX_CONTEXT_SIZE[run_mode]:,} "
+                            f"out of {task.num_context_examples:,} in-context "
+                            f"examples")
+                 task = task.narrow_context(0, _MAX_CONTEXT_SIZE[run_mode])
+
+             if self._batch_size is None:
+                 batch_size = task.num_prediction_examples
+             elif self._batch_size == 'max':
+                 batch_size = _MAX_PRED_SIZE[task.task_type]
+             else:
+                 batch_size = self._batch_size

-             predictions: List[pd.DataFrame] = []
-             summary: Optional[str] = None
-             details: Optional[Explanation] = None
-             for i, batch in enumerate(batches):
-                 # TODO Re-use the context for subsequent predictions.
+             if batch_size > _MAX_PRED_SIZE[task.task_type]:
+                 raise ValueError(f"Cannot predict for more than "
+                                  f"{_MAX_PRED_SIZE[task.task_type]:,} "
+                                  f"entities at once (got {batch_size:,}). Use "
+                                  f"`KumoRFM.batch_mode` to process entities "
+                                  f"in batches with a sufficient batch size.")
+
+             if task.num_prediction_examples > batch_size:
+                 num = math.ceil(task.num_prediction_examples / batch_size)
+                 logger.log(f"Splitting {task.num_prediction_examples:,} "
+                            f"entities into {num:,} batches of size "
+                            f"{batch_size:,}")
+
+             predictions: list[pd.DataFrame] = []
+             summary: str | None = None
+             details: Explanation | None = None
+             for start in range(0, task.num_prediction_examples, batch_size):
                  context = self._get_context(
-                     query=query_def,
-                     indices=batch,
-                     anchor_time=anchor_time,
-                     context_anchor_time=context_anchor_time,
-                     run_mode=RunMode(run_mode),
+                     task=task.narrow_prediction(start, length=batch_size),
+                     run_mode=run_mode,
                      num_neighbors=num_neighbors,
-                     num_hops=num_hops,
-                     max_pq_iterations=max_pq_iterations,
-                     evaluate=False,
-                     random_seed=random_seed,
-                     logger=logger if i == 0 else None,
+                     exclude_cols_dict=exclude_cols_dict,
+                     top_k=top_k,
                  )
+                 context.y_test = None
+
                  request = RFMPredictRequest(
                      context=context,
                      run_mode=RunMode(run_mode),
-                     query=query_str,
+                     query=task._query,
                      use_prediction_time=use_prediction_time,
                  )
                  with warnings.catch_warnings():
                      warnings.filterwarnings('ignore', message='gencode')
                      request_msg = request.to_protobuf()
                  _bytes = request_msg.SerializeToString()
-                 if i == 0:
+                 if start == 0:
                      logger.log(f"Generated context of size "
                                 f"{len(_bytes) / (1024*1024):.2f}MB")

@@ -412,14 +608,11 @@ class KumoRFM:
                      stats = Context.get_memory_stats(request_msg.context)
                      raise ValueError(_SIZE_LIMIT_MSG.format(stats=stats))

-                 if (isinstance(verbose, InteractiveProgressLogger) and i == 0
-                         and len(batches) > 1):
-                     verbose.init_progress(
-                         total=len(batches),
-                         description='Predicting',
-                     )
+                 if start == 0 and task.num_prediction_examples > batch_size:
+                     num = math.ceil(task.num_prediction_examples / batch_size)
+                     verbose.init_progress(total=num, description='Predicting')

-                 for attempt in range(self.num_retries + 1):
+                 for attempt in range(self._num_retries + 1):
                      try:
                          if explain_config is not None:
                              resp = self._api_client.explain(
@@ -434,10 +627,10 @@ class KumoRFM:

                          # Cast 'ENTITY' to correct data type:
                          if 'ENTITY' in df:
-                             entity = query_def.entity_table
-                             pkey_map = self._graph_store.pkey_map_dict[entity]
-                             df['ENTITY'] = df['ENTITY'].astype(
-                                 type(pkey_map.index[0]))
+                             table_dict = context.subgraph.table_dict
+                             table = table_dict[context.entity_table_names[0]]
+                             ser = table.df[table.primary_key]
+                             df['ENTITY'] = df['ENTITY'].astype(ser.dtype)

                          # Cast 'ANCHOR_TIMESTAMP' to correct data type:
                          if 'ANCHOR_TIMESTAMP' in df:
@@ -452,13 +645,12 @@ class KumoRFM:

                          predictions.append(df)

-                         if (isinstance(verbose, InteractiveProgressLogger)
-                                 and len(batches) > 1):
+                         if task.num_prediction_examples > batch_size:
                              verbose.step()

                          break
                      except HTTPException as e:
-                         if attempt == self.num_retries:
+                         if attempt == self._num_retries:
                              try:
                                  msg = json.loads(e.detail)['detail']
                              except Exception:
@@ -488,69 +680,19 @@ class KumoRFM:

          return prediction

-     def is_valid_entity(
-         self,
-         query: str,
-         indices: Union[List[str], List[float], List[int], None] = None,
-         *,
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
-     ) -> np.ndarray:
-         r"""Returns a mask that denotes which entities are valid for the
-         given predictive query, *i.e.*, which entities fulfill (temporal)
-         entity filter constraints.
-
-         Args:
-             query: The predictive query.
-             indices: The entity primary keys to predict for. Will override the
-                 indices given as part of the predictive query.
-             anchor_time: The anchor timestamp for the prediction. If set to
-                 ``None``, will use the maximum timestamp in the data.
-                 If set to ``"entity"``, will use the timestamp of the entity.
-         """
-         query_def = self._parse_query(query)
-
-         if indices is None:
-             if query_def.rfm_entity_ids is None:
-                 raise ValueError("Cannot find entities to predict for. Please "
-                                  "pass them via "
-                                  "`is_valid_entity(query, indices=...)`")
-             indices = query_def.get_rfm_entity_id_list()
-
-         if len(indices) == 0:
-             raise ValueError("At least one entity is required")
-
-         if anchor_time is None:
-             anchor_time = self._graph_store.max_time
-
-         if isinstance(anchor_time, pd.Timestamp):
-             self._validate_time(query_def, anchor_time, None, False)
-         else:
-             assert anchor_time == 'entity'
-             if (query_def.entity_table not in self._graph_store.time_dict):
-                 raise ValueError(f"Anchor time 'entity' requires the entity "
-                                  f"table '{query_def.entity_table}' "
-                                  f"to have a time column.")
-
-         node = self._graph_store.get_node_id(
-             table_name=query_def.entity_table,
-             pkey=pd.Series(indices),
-         )
-         query_driver = LocalPQueryDriver(self._graph_store, query_def)
-         return query_driver.is_valid(node, anchor_time)
-
      def evaluate(
          self,
          query: str,
          *,
-         metrics: Optional[List[str]] = None,
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
-         context_anchor_time: Union[pd.Timestamp, None] = None,
-         run_mode: Union[RunMode, str] = RunMode.FAST,
-         num_neighbors: Optional[List[int]] = None,
+         metrics: list[str] | None = None,
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         context_anchor_time: pd.Timestamp | None = None,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
          num_hops: int = 2,
-         max_pq_iterations: int = 20,
-         random_seed: Optional[int] = _RANDOM_SEED,
-         verbose: Union[bool, ProgressLogger] = True,
+         max_pq_iterations: int = 10,
+         random_seed: int | None = _RANDOM_SEED,
+         verbose: bool | ProgressLogger = True,
          use_prediction_time: bool = False,
      ) -> pd.DataFrame:
          """Evaluates a predictive query.
@@ -582,41 +724,120 @@ class KumoRFM:
          Returns:
              The metrics as a :class:`pandas.DataFrame`
          """
-         query_def = self._parse_query(query)
+         query_def = replace(
+             self._parse_query(query),
+             for_each='FOR EACH',
+             rfm_entity_ids=None,
+         )
+
+         if not isinstance(verbose, ProgressLogger):
+             query_repr = query_def.to_string(rich=True, exclude_predict=True)
+             msg = f'[bold]EVALUATE[/bold] {query_repr}'
+             verbose = ProgressLogger.default(msg=msg, verbose=verbose)
+
+         with verbose as logger:
+             task_table = self._get_task_table(
+                 query=query_def,
+                 indices=None,
+                 anchor_time=anchor_time,
+                 context_anchor_time=context_anchor_time,
+                 run_mode=run_mode,
+                 max_pq_iterations=max_pq_iterations,
+                 random_seed=random_seed,
+                 logger=logger,
+             )
+
+             return self.evaluate_task(
+                 task_table,
+                 metrics=metrics,
+                 run_mode=run_mode,
+                 num_neighbors=num_neighbors,
+                 num_hops=num_hops,
+                 verbose=verbose,
+                 exclude_cols_dict=query_def.get_exclude_cols_dict(),
+                 use_prediction_time=use_prediction_time,
+             )
+
+     def evaluate_task(
+         self,
+         task: TaskTable,
+         *,
+         metrics: list[str] | None = None,
+         run_mode: RunMode | str = RunMode.FAST,
+         num_neighbors: list[int] | None = None,
+         num_hops: int = 2,
+         verbose: bool | ProgressLogger = True,
+         exclude_cols_dict: dict[str, list[str]] | None = None,
+         use_prediction_time: bool = False,
+     ) -> pd.DataFrame:
+         """Evaluates a custom task specification.

+         Args:
+             task: The custom :class:`TaskTable`.
+             metrics: The metrics to use.
+             run_mode: The :class:`RunMode` for the query.
+             num_neighbors: The number of neighbors to sample for each hop.
+                 If specified, the ``num_hops`` option will be ignored.
+             num_hops: The number of hops to sample when generating the context.
+             verbose: Whether to print verbose output.
+             exclude_cols_dict: Any column in any table to exclude from the
+                 model input.
+             use_prediction_time: Whether to use the anchor timestamp as an
+                 additional feature during prediction. This is typically
+                 beneficial for time series forecasting tasks.
+
+         Returns:
+             The metrics as a :class:`pandas.DataFrame`
+         """
          if num_hops != 2 and num_neighbors is not None:
              warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
                            f"custom 'num_hops={num_hops}' option")
+         if num_neighbors is None:
+             key = RunMode.FAST if task.task_type.is_link_pred else run_mode
+             num_neighbors = _DEFAULT_NUM_NEIGHBORS[key][:num_hops]

-         if query_def.rfm_entity_ids is not None:
-             query_def = replace(
-                 query_def,
-                 rfm_entity_ids=None,
-             )
-
-         query_repr = query_def.to_string(rich=True, exclude_predict=True)
-         msg = f'[bold]EVALUATE[/bold] {query_repr}'
+         if metrics is not None and len(metrics) > 0:
+             self._validate_metrics(metrics, task.task_type)
+             metrics = list(dict.fromkeys(metrics))

          if not isinstance(verbose, ProgressLogger):
-             verbose = InteractiveProgressLogger(msg, verbose=verbose)
+             if task.task_type == TaskType.BINARY_CLASSIFICATION:
+                 task_type_repr = 'binary classification'
+             elif task.task_type == TaskType.MULTICLASS_CLASSIFICATION:
+                 task_type_repr = 'multi-class classification'
+             elif task.task_type == TaskType.REGRESSION:
+                 task_type_repr = 'regression'
+             elif task.task_type == TaskType.TEMPORAL_LINK_PREDICTION:
+                 task_type_repr = 'link prediction'
+             else:
+                 task_type_repr = str(task.task_type)
+
+             msg = f"Evaluating {task_type_repr} task"
+             verbose = ProgressLogger.default(msg=msg, verbose=verbose)

          with verbose as logger:
+             if task.num_context_examples > _MAX_CONTEXT_SIZE[run_mode]:
+                 logger.log(f"Sub-sampled {_MAX_CONTEXT_SIZE[run_mode]:,} "
+                            f"out of {task.num_context_examples:,} in-context "
+                            f"examples")
+                 task = task.narrow_context(0, _MAX_CONTEXT_SIZE[run_mode])
+
+             if task.num_prediction_examples > _MAX_TEST_SIZE[task.task_type]:
+                 logger.log(f"Sub-sampled {_MAX_TEST_SIZE[task.task_type]:,} "
+                            f"out of {task.num_prediction_examples:,} test "
+                            f"examples")
+                 task = task.narrow_prediction(
+                     start=0,
+                     length=_MAX_TEST_SIZE[task.task_type],
+                 )
+
              context = self._get_context(
-                 query=query_def,
-                 indices=None,
-                 anchor_time=anchor_time,
-                 context_anchor_time=context_anchor_time,
-                 run_mode=RunMode(run_mode),
+                 task=task,
+                 run_mode=run_mode,
                  num_neighbors=num_neighbors,
-                 num_hops=num_hops,
-                 max_pq_iterations=max_pq_iterations,
-                 evaluate=True,
-                 random_seed=random_seed,
-                 logger=logger if verbose else None,
+                 exclude_cols_dict=exclude_cols_dict,
              )
-             if metrics is not None and len(metrics) > 0:
-                 self._validate_metrics(metrics, context.task_type)
-                 metrics = list(dict.fromkeys(metrics))
+
              request = RFMEvaluateRequest(
                  context=context,
                  run_mode=RunMode(run_mode),
@@ -634,17 +855,23 @@ class KumoRFM:
                  stats_msg = Context.get_memory_stats(request_msg.context)
                  raise ValueError(_SIZE_LIMIT_MSG.format(stats=stats_msg))

-             try:
-                 resp = self._api_client.evaluate(request_bytes)
-             except HTTPException as e:
+             for attempt in range(self._num_retries + 1):
                  try:
-                     msg = json.loads(e.detail)['detail']
-                 except Exception:
-                     msg = e.detail
-                 raise RuntimeError(f"An unexpected exception occurred. "
-                                    f"Please create an issue at "
-                                    f"'https://github.com/kumo-ai/kumo-rfm'. "
-                                    f"{msg}") from None
+                     resp = self._api_client.evaluate(request_bytes)
+                     break
+                 except HTTPException as e:
+                     if attempt == self._num_retries:
+                         try:
+                             msg = json.loads(e.detail)['detail']
+                         except Exception:
+                             msg = e.detail
+                         raise RuntimeError(
+                             f"An unexpected exception occurred. Please create "
+                             f"an issue at "
+                             f"'https://github.com/kumo-ai/kumo-rfm'. {msg}"
+                         ) from None
+
+                     time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...

              return pd.DataFrame.from_dict(
                  resp.metrics,
@@ -657,9 +884,9 @@ class KumoRFM:
          query: str,
          size: int,
          *,
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
-         random_seed: Optional[int] = _RANDOM_SEED,
-         max_iterations: int = 20,
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         random_seed: int | None = _RANDOM_SEED,
+         max_iterations: int = 10,
      ) -> pd.DataFrame:
          """Returns the labels of a predictive query for a specified anchor
          time.
@@ -679,40 +906,37 @@ class KumoRFM:
          query_def = self._parse_query(query)

          if anchor_time is None:
-             anchor_time = self._graph_store.max_time
+             anchor_time = self._get_default_anchor_time(query_def)
              if query_def.target_ast.date_offset_range is not None:
-                 anchor_time = anchor_time - (
-                     query_def.target_ast.date_offset_range.end_date_offset *
-                     query_def.num_forecasts)
+                 offset = query_def.target_ast.date_offset_range.end_date_offset
+                 offset *= query_def.num_forecasts
+                 anchor_time -= offset

          assert anchor_time is not None
          if isinstance(anchor_time, pd.Timestamp):
              self._validate_time(query_def, anchor_time, None, evaluate=True)
          else:
              assert anchor_time == 'entity'
-             if (query_def.entity_table not in self._graph_store.time_dict):
+             if query_def.entity_table not in self._sampler.time_column_dict:
                  raise ValueError(f"Anchor time 'entity' requires the entity "
                                   f"table '{query_def.entity_table}' "
                                   f"to have a time column")

-         query_driver = LocalPQueryDriver(self._graph_store, query_def,
-                                          random_seed)
-
-         node, time, y = query_driver.collect_test(
-             size=size,
-             anchor_time=anchor_time,
-             batch_size=min(10_000, size),
-             max_iterations=max_iterations,
-             guarantee_train_examples=False,
+         train, test = self._sampler.sample_target(
+             query=query_def,
+             num_train_examples=0,
+             train_anchor_time=anchor_time,
+             num_train_trials=0,
+             num_test_examples=size,
+             test_anchor_time=anchor_time,
+             num_test_trials=max_iterations * size,
+             random_seed=random_seed,
          )

-         entity = self._graph_store.pkey_map_dict[
-             query_def.entity_table].index[node]
-
          return pd.DataFrame({
-             'ENTITY': entity,
-             'ANCHOR_TIMESTAMP': time,
-             'TARGET': y,
+             'ENTITY': test.entity_pkey,
+             'ANCHOR_TIMESTAMP': test.anchor_time,
+             'TARGET': test.target,
          })

      # Helpers #################################################################
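The label-collection helper above now routes through the backend-agnostic `Sampler.sample_target()` (zero train examples, `size` test examples) instead of `LocalPQueryDriver.collect_test()`, and the named `test` split exposes `entity_pkey`, `anchor_time`, and `target`. The returned frame keeps its previous shape; a toy sketch of consuming it, with made-up values:

    import pandas as pd

    # Shape of the frame returned by the helper above (toy values):
    labels_df = pd.DataFrame({
        'ENTITY': [1, 2, 3],
        'ANCHOR_TIMESTAMP': pd.to_datetime(['2024-01-01'] * 3),
        'TARGET': [0, 1, 1],
    })

    positive_rate = (labels_df['TARGET'] > 0).mean()  # 2/3 here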
@@ -727,63 +951,128 @@ class KumoRFM:
                               "`predict()` or `evaluate()` methods to perform "
                               "predictions or evaluations.")

-         try:
-             request = RFMParseQueryRequest(
-                 query=query,
-                 graph_definition=self._graph_def,
-             )
+         request = RFMParseQueryRequest(
+             query=query,
+             graph_definition=self._graph_def,
+         )

-             resp = self._api_client.parse_query(request)
+         for attempt in range(self._num_retries + 1):
+             try:
+                 resp = self._api_client.parse_query(request)
+                 break
+             except HTTPException as e:
+                 if attempt == self._num_retries:
+                     try:
+                         msg = json.loads(e.detail)['detail']
+                     except Exception:
+                         msg = e.detail
+                     raise ValueError(f"Failed to parse query '{query}'. {msg}")

-             # TODO Expose validation warnings.
+                 time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...

-             if len(resp.validation_response.warnings) > 0:
-                 msg = '\n'.join([
-                     f'{i+1}. {warning.title}: {warning.message}' for i, warning
-                     in enumerate(resp.validation_response.warnings)
-                 ])
-                 warnings.warn(f"Encountered the following warnings during "
-                               f"parsing:\n{msg}")
+         if len(resp.validation_response.warnings) > 0:
+             msg = '\n'.join([
+                 f'{i+1}. {warning.title}: {warning.message}'
+                 for i, warning in enumerate(resp.validation_response.warnings)
+             ])
+             warnings.warn(f"Encountered the following warnings during "
+                           f"parsing:\n{msg}")

-             return resp.query
-         except HTTPException as e:
-             try:
-                 msg = json.loads(e.detail)['detail']
-             except Exception:
-                 msg = e.detail
-             raise ValueError(f"Failed to parse query '{query}'. "
-                              f"{msg}") from None
+         return resp.query
+
+     @staticmethod
+     def _get_task_type(
+         query: ValidatedPredictiveQuery,
+         edge_types: list[tuple[str, str, str]],
+     ) -> TaskType:
+         if isinstance(query.target_ast, (Condition, LogicalOperation)):
+             return TaskType.BINARY_CLASSIFICATION
+
+         target = query.target_ast
+         if isinstance(target, Join):
+             target = target.rhs_target
+         if isinstance(target, Aggregation):
+             if target.aggr == AggregationType.LIST_DISTINCT:
+                 table_name, col_name = target._get_target_column_name().split(
+                     '.')
+                 target_edge_types = [
+                     edge_type for edge_type in edge_types
+                     if edge_type[0] == table_name and edge_type[1] == col_name
+                 ]
+                 if len(target_edge_types) != 1:
+                     raise NotImplementedError(
+                         f"Multilabel-classification queries based on "
+                         f"'LIST_DISTINCT' are not supported yet. If you "
+                         f"planned to write a link prediction query instead, "
+                         f"make sure to register '{col_name}' as a "
+                         f"foreign key.")
+                 return TaskType.TEMPORAL_LINK_PREDICTION
+
+             return TaskType.REGRESSION
+
+         assert isinstance(target, Column)
+
+         if target.stype in {Stype.ID, Stype.categorical}:
+             return TaskType.MULTICLASS_CLASSIFICATION
+
+         if target.stype in {Stype.numerical}:
+             return TaskType.REGRESSION
+
+         raise NotImplementedError("Task type not yet supported")
+
+     def _get_default_anchor_time(
+         self,
+         query: ValidatedPredictiveQuery | None = None,
+     ) -> pd.Timestamp:
+         if query is not None and query.query_type == QueryType.TEMPORAL:
+             aggr_table_names = [
+                 aggr._get_target_column_name().split('.')[0]
+                 for aggr in query.get_all_target_aggregations()
+             ]
+             return self._sampler.get_max_time(aggr_table_names)
+
+         return self._sampler.get_max_time()

      def _validate_time(
          self,
          query: ValidatedPredictiveQuery,
          anchor_time: pd.Timestamp,
-         context_anchor_time: Union[pd.Timestamp, None],
+         context_anchor_time: pd.Timestamp | None,
          evaluate: bool,
      ) -> None:

-         if self._graph_store.min_time == pd.Timestamp.max:
+         if len(self._sampler.time_column_dict) == 0:
              return  # Graph without timestamps

-         if anchor_time < self._graph_store.min_time:
+         if query.query_type == QueryType.TEMPORAL:
+             aggr_table_names = [
+                 aggr._get_target_column_name().split('.')[0]
+                 for aggr in query.get_all_target_aggregations()
+             ]
+             min_time = self._sampler.get_min_time(aggr_table_names)
+             max_time = self._sampler.get_max_time(aggr_table_names)
+         else:
+             min_time = self._sampler.get_min_time()
+             max_time = self._sampler.get_max_time()
+
+         if anchor_time < min_time:
              raise ValueError(f"Anchor timestamp '{anchor_time}' is before "
-                              f"the earliest timestamp "
-                              f"'{self._graph_store.min_time}' in the data.")
+                              f"the earliest timestamp '{min_time}' in the "
+                              f"data.")

-         if (context_anchor_time is not None
-                 and context_anchor_time < self._graph_store.min_time):
+         if context_anchor_time is not None and context_anchor_time < min_time:
              raise ValueError(f"Context anchor timestamp is too early or "
                               f"aggregation time range is too large. To make "
                               f"this prediction, we would need data back to "
                               f"'{context_anchor_time}', however, your data "
-                              f"only contains data back to "
-                              f"'{self._graph_store.min_time}'.")
+                              f"only contains data back to '{min_time}'.")

          if query.target_ast.date_offset_range is not None:
              end_offset = query.target_ast.date_offset_range.end_date_offset
          else:
              end_offset = pd.DateOffset(0)
-         forecast_end_offset = end_offset * query.num_forecasts
+         end_offset = end_offset * query.num_forecasts
+
          if (context_anchor_time is not None
                  and context_anchor_time > anchor_time):
              warnings.warn(f"Context anchor timestamp "
@@ -793,7 +1082,7 @@ class KumoRFM:
                            f"intended.")
          elif (query.query_type == QueryType.TEMPORAL
                and context_anchor_time is not None
-               and context_anchor_time + forecast_end_offset > anchor_time):
+               and context_anchor_time + end_offset > anchor_time):
              warnings.warn(f"Aggregation for context examples at timestamp "
                            f"'{context_anchor_time}' will leak information "
                            f"from the prediction anchor timestamp "
@@ -801,62 +1090,44 @@ class KumoRFM:
                            f"intended.")

          elif (context_anchor_time is not None
-               and context_anchor_time - forecast_end_offset
-               < self._graph_store.min_time):
-             _time = context_anchor_time - forecast_end_offset
+               and context_anchor_time - end_offset < min_time):
+             _time = context_anchor_time - end_offset
              warnings.warn(f"Context anchor timestamp is too early or "
                            f"aggregation time range is too large. To form "
                            f"proper input data, we would need data back to "
                            f"'{_time}', however, your data only contains "
-                           f"data back to '{self._graph_store.min_time}'.")
+                           f"data back to '{min_time}'.")

-         if (not evaluate and anchor_time
-                 > self._graph_store.max_time + pd.DateOffset(days=1)):
+         if not evaluate and anchor_time > max_time + pd.DateOffset(days=1):
              warnings.warn(f"Anchor timestamp '{anchor_time}' is after the "
-                           f"latest timestamp '{self._graph_store.max_time}' "
-                           f"in the data. Please make sure this is intended.")
+                           f"latest timestamp '{max_time}' in the data. Please "
+                           f"make sure this is intended.")

-         max_eval_time = self._graph_store.max_time - forecast_end_offset
-         if evaluate and anchor_time > max_eval_time:
+         if evaluate and anchor_time > max_time - end_offset:
              raise ValueError(
                  f"Anchor timestamp for evaluation is after the latest "
-                 f"supported timestamp '{max_eval_time}'.")
+                 f"supported timestamp '{max_time - end_offset}'.")

-     def _get_context(
+     def _get_task_table(
          self,
          query: ValidatedPredictiveQuery,
-         indices: Union[List[str], List[float], List[int], None],
-         anchor_time: Union[pd.Timestamp, Literal['entity'], None],
-         context_anchor_time: Union[pd.Timestamp, None],
-         run_mode: RunMode,
-         num_neighbors: Optional[List[int]],
-         num_hops: int,
-         max_pq_iterations: int,
-         evaluate: bool,
-         random_seed: Optional[int] = _RANDOM_SEED,
-         logger: Optional[ProgressLogger] = None,
-     ) -> Context:
-
-         if num_neighbors is not None:
-             num_hops = len(num_neighbors)
-
-         if num_hops < 0:
-             raise ValueError(f"'num_hops' must be non-negative "
-                              f"(got {num_hops})")
-         if num_hops > 6:
-             raise ValueError(f"Cannot predict on subgraphs with more than 6 "
-                              f"hops (got {num_hops}). Please reduce the "
-                              f"number of hops and try again. Please create a "
-                              f"feature request at "
-                              f"'https://github.com/kumo-ai/kumo-rfm' if you "
-                              f"must go beyond this for your use-case.")
-
-         query_driver = LocalPQueryDriver(self._graph_store, query, random_seed)
-         task_type = LocalPQueryDriver.get_task_type(
-             query,
-             edge_types=self._graph_store.edge_types,
+         indices: list[str] | list[float] | list[int] | None,
+         anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+         context_anchor_time: pd.Timestamp | None = None,
+         run_mode: RunMode = RunMode.FAST,
+         max_pq_iterations: int = 10,
+         random_seed: int | None = _RANDOM_SEED,
+         logger: ProgressLogger | None = None,
+     ) -> TaskTable:
+
+         task_type = self._get_task_type(
+             query=query,
+             edge_types=self._sampler.edge_types,
          )

+         num_train_examples = _MAX_CONTEXT_SIZE[run_mode]
+         num_test_examples = _MAX_TEST_SIZE[task_type] if indices is None else 0
+
          if logger is not None:
              if task_type == TaskType.BINARY_CLASSIFICATION:
                  task_type_repr = 'binary classification'
@@ -870,30 +1141,17 @@ class KumoRFM:
              else:
                  task_type_repr = str(task_type)
              logger.log(f"Identified {query.query_type} {task_type_repr} task")

-         if task_type.is_link_pred and num_hops < 2:
-             raise ValueError(f"Cannot perform link prediction on subgraphs "
-                              f"with less than 2 hops (got {num_hops}) since "
-                              f"historical target entities need to be part of "
-                              f"the context. Please increase the number of "
-                              f"hops and try again.")
-
-         if num_neighbors is None:
-             if run_mode == RunMode.DEBUG:
-                 num_neighbors = [16, 16, 4, 4, 1, 1][:num_hops]
-             elif run_mode == RunMode.FAST or task_type.is_link_pred:
-                 num_neighbors = [32, 32, 8, 8, 4, 4][:num_hops]
-             else:
-                 num_neighbors = [64, 64, 8, 8, 4, 4][:num_hops]
-
          if query.target_ast.date_offset_range is None:
-             end_offset = pd.DateOffset(0)
+             step_offset = pd.DateOffset(0)
          else:
-             end_offset = query.target_ast.date_offset_range.end_date_offset
-         forecast_end_offset = end_offset * query.num_forecasts
+             step_offset = query.target_ast.date_offset_range.end_date_offset
+         end_offset = step_offset * query.num_forecasts
+
          if anchor_time is None:
-             anchor_time = self._graph_store.max_time
-             if evaluate:
-                 anchor_time = anchor_time - forecast_end_offset
+             anchor_time = self._get_default_anchor_time(query)
+             if num_test_examples > 0:
+                 anchor_time = anchor_time - end_offset
+
          if logger is not None:
              assert isinstance(anchor_time, pd.Timestamp)
              if anchor_time == pd.Timestamp.min:
@@ -905,114 +1163,98 @@ class KumoRFM:
905
1163
  else:
906
1164
  logger.log(f"Derived anchor time {anchor_time}")
907
1165
 
908
- assert anchor_time is not None
909
1166
  if isinstance(anchor_time, pd.Timestamp):
1167
+ if context_anchor_time == 'entity':
1168
+ raise ValueError("Anchor time 'entity' needs to be shared "
1169
+ "for context and prediction examples")
910
1170
  if context_anchor_time is None:
911
- context_anchor_time = anchor_time - forecast_end_offset
1171
+ context_anchor_time = anchor_time - end_offset
912
1172
  self._validate_time(query, anchor_time, context_anchor_time,
913
- evaluate)
1173
+ evaluate=num_test_examples > 0)
914
1174
  else:
915
1175
  assert anchor_time == 'entity'
916
- if query.entity_table not in self._graph_store.time_dict:
1176
+ if query.query_type != QueryType.STATIC:
1177
+ raise ValueError("Anchor time 'entity' is only valid for "
1178
+ "static predictive queries")
1179
+ if query.entity_table not in self._sampler.time_column_dict:
917
1180
  raise ValueError(f"Anchor time 'entity' requires the entity "
918
1181
  f"table '{query.entity_table}' to "
919
1182
  f"have a time column")
920
- if context_anchor_time is not None:
921
- warnings.warn("Ignoring option 'context_anchor_time' for "
922
- "`anchor_time='entity'`")
923
- context_anchor_time = None
924
-
925
- y_test: Optional[pd.Series] = None
926
- if evaluate:
927
- max_test_size = _MAX_TEST_SIZE[run_mode]
928
- if task_type.is_link_pred:
929
- max_test_size = max_test_size // 5
930
-
931
- test_node, test_time, y_test = query_driver.collect_test(
932
- size=max_test_size,
933
- anchor_time=anchor_time,
934
- max_iterations=max_pq_iterations,
935
- guarantee_train_examples=True,
936
- )
937
- if logger is not None:
938
- if task_type == TaskType.BINARY_CLASSIFICATION:
939
- pos = 100 * int((y_test > 0).sum()) / len(y_test)
940
- msg = (f"Collected {len(y_test):,} test examples with "
941
- f"{pos:.2f}% positive cases")
942
- elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
943
- msg = (f"Collected {len(y_test):,} test examples "
944
- f"holding {y_test.nunique()} classes")
945
- elif task_type == TaskType.REGRESSION:
946
- _min, _max = float(y_test.min()), float(y_test.max())
947
- msg = (f"Collected {len(y_test):,} test examples with "
948
- f"targets between {format_value(_min)} and "
949
- f"{format_value(_max)}")
950
- elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
951
- num_rhs = y_test.explode().nunique()
952
- msg = (f"Collected {len(y_test):,} test examples with "
953
- f"{num_rhs:,} unique items")
954
- else:
955
- raise NotImplementedError
956
- logger.log(msg)
957
-
958
- else:
959
- assert indices is not None
960
-
961
- if len(indices) > _MAX_PRED_SIZE[task_type]:
962
- raise ValueError(f"Cannot predict for more than "
963
- f"{_MAX_PRED_SIZE[task_type]:,} entities at "
964
- f"once (got {len(indices):,}). Use "
965
- f"`KumoRFM.batch_mode` to process entities "
966
- f"in batches")
1183
+ if isinstance(context_anchor_time, pd.Timestamp):
1184
+ raise ValueError("Anchor time 'entity' needs to be shared "
1185
+ "for context and prediction examples")
1186
+ context_anchor_time = 'entity'
1187
+
1188
+ train, test = self._sampler.sample_target(
1189
+ query=query,
1190
+ num_train_examples=num_train_examples,
1191
+ train_anchor_time=context_anchor_time,
1192
+ num_train_trials=max_pq_iterations * num_train_examples,
1193
+ num_test_examples=num_test_examples,
1194
+ test_anchor_time=anchor_time,
1195
+ num_test_trials=max_pq_iterations * num_test_examples,
1196
+ random_seed=random_seed,
1197
+ )
1198
+ train_pkey, train_time, train_y = train
1199
+ test_pkey, test_time, test_y = test
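The old `collect_train`/`collect_test` driver calls are folded into a single `sample_target` call that returns a `(pkey, time, target)` triplet per split, and the internal iteration loop is replaced by an explicit trial budget of `max_pq_iterations` draws per requested example. That signature suggests a budgeted rejection-sampling loop; a toy sketch of the pattern (my illustration under that assumption, not the package's sampler):

    import random

    def sample_with_budget(candidates, accept, num_examples, num_trials):
        # Draw until enough examples are accepted or the budget is spent.
        out = []
        for _ in range(num_trials):
            if len(out) == num_examples:
                break
            candidate = random.choice(candidates)
            if accept(candidate):
                out.append(candidate)
        return out

    # Budget of 20 trials per example, mirroring num_trials =
    # max_pq_iterations * num_examples above; accept() stands in for the
    # predicate that checks whether a valid label exists at the anchor time.
    rows = sample_with_budget(list(range(1000)), lambda r: r % 3 == 0,
                              num_examples=50, num_trials=20 * 50)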
 
-            test_node = self._graph_store.get_node_id(
-                table_name=query.entity_table,
-                pkey=pd.Series(indices),
-            )
+        if num_test_examples > 0 and logger is not None:
+            if task_type == TaskType.BINARY_CLASSIFICATION:
+                pos = 100 * int((test_y > 0).sum()) / len(test_y)
+                msg = (f"Collected {len(test_y):,} test examples with "
+                       f"{pos:.2f}% positive cases")
+            elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
+                msg = (f"Collected {len(test_y):,} test examples holding "
+                       f"{test_y.nunique()} classes")
+            elif task_type == TaskType.REGRESSION:
+                _min, _max = float(test_y.min()), float(test_y.max())
+                msg = (f"Collected {len(test_y):,} test examples with targets "
+                       f"between {format_value(_min)} and "
+                       f"{format_value(_max)}")
+            elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
+                num_rhs = test_y.explode().nunique()
+                msg = (f"Collected {len(test_y):,} test examples with "
+                       f"{num_rhs:,} unique items")
+            else:
+                raise NotImplementedError
+            logger.log(msg)
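One pandas idiom worth noting here: for temporal link prediction, each entry of `test_y` is a list of target keys, so counting distinct items requires flattening the lists first, which is exactly what `explode().nunique()` does:

    import pandas as pd

    # Each row holds the list of items an entity interacted with:
    test_y = pd.Series([[1, 2], [2, 3], [4]])
    assert test_y.explode().nunique() == 4  # distinct items: {1, 2, 3, 4}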
 
+        if num_test_examples == 0:
+            assert indices is not None
+            test_pkey = pd.Series(indices, dtype=train_pkey.dtype)
             if isinstance(anchor_time, pd.Timestamp):
-                test_time = pd.Series(anchor_time).repeat(
-                    len(test_node)).reset_index(drop=True)
+                test_time = pd.Series([anchor_time]).repeat(
+                    len(indices)).reset_index(drop=True)
             else:
-                time = self._graph_store.time_dict[query.entity_table]
-                time = time[test_node] * 1000**3
-                test_time = pd.Series(time, dtype='datetime64[ns]')
-
-        train_node, train_time, y_train = query_driver.collect_train(
-            size=_MAX_CONTEXT_SIZE[run_mode],
-            anchor_time=context_anchor_time or 'entity',
-            exclude_node=test_node if (query.query_type == QueryType.STATIC
-                                       or anchor_time == 'entity') else None,
-            max_iterations=max_pq_iterations,
-        )
+                train_time = test_time = 'entity'
 
         if logger is not None:
             if task_type == TaskType.BINARY_CLASSIFICATION:
-                pos = 100 * int((y_train > 0).sum()) / len(y_train)
-                msg = (f"Collected {len(y_train):,} in-context examples with "
+                pos = 100 * int((train_y > 0).sum()) / len(train_y)
+                msg = (f"Collected {len(train_y):,} in-context examples with "
                        f"{pos:.2f}% positive cases")
             elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
-                msg = (f"Collected {len(y_train):,} in-context examples "
-                       f"holding {y_train.nunique()} classes")
+                msg = (f"Collected {len(train_y):,} in-context examples "
+                       f"holding {train_y.nunique()} classes")
             elif task_type == TaskType.REGRESSION:
-                _min, _max = float(y_train.min()), float(y_train.max())
-                msg = (f"Collected {len(y_train):,} in-context examples with "
+                _min, _max = float(train_y.min()), float(train_y.max())
+                msg = (f"Collected {len(train_y):,} in-context examples with "
                        f"targets between {format_value(_min)} and "
                        f"{format_value(_max)}")
             elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
-                num_rhs = y_train.explode().nunique()
-                msg = (f"Collected {len(y_train):,} in-context examples with "
+                num_rhs = train_y.explode().nunique()
+                msg = (f"Collected {len(train_y):,} in-context examples with "
                        f"{num_rhs:,} unique items")
             else:
                 raise NotImplementedError
             logger.log(msg)
 
-        entity_table_names: Tuple[str, ...]
+        entity_table_names: tuple[str] | tuple[str, str]
         if task_type.is_link_pred:
             final_aggr = query.get_final_target_aggregation()
             assert final_aggr is not None
             edge_fkey = final_aggr._get_target_column_name()
-            for edge_type in self._graph_store.edge_types:
+            for edge_type in self._sampler.edge_types:
                 if edge_fkey == f'{edge_type[0]}.{edge_type[1]}':
                     entity_table_names = (
                         query.entity_table,
@@ -1021,23 +1263,80 @@ class KumoRFM:
         else:
             entity_table_names = (query.entity_table, )
 
+        context_df = pd.DataFrame({'ENTITY': train_pkey, 'TARGET': train_y})
+        if isinstance(train_time, pd.Series):
+            context_df['ANCHOR_TIMESTAMP'] = train_time
+        pred_df = pd.DataFrame({'ENTITY': test_pkey})
+        if num_test_examples > 0:
+            pred_df['TARGET'] = test_y
+        if isinstance(test_time, pd.Series):
+            pred_df['ANCHOR_TIMESTAMP'] = test_time
+
+        return TaskTable(
+            task_type=task_type,
+            context_df=context_df,
+            pred_df=pred_df,
+            entity_table_name=entity_table_names,
+            entity_column='ENTITY',
+            target_column='TARGET',
+            time_column='ANCHOR_TIMESTAMP' if isinstance(
+                train_time, pd.Series) else TaskTable.ENTITY_TIME,
+        )
+
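The returned `TaskTable` wraps two plain DataFrames: the context frame always carries entities and targets, the prediction frame carries targets only when test examples were sampled, and the `ANCHOR_TIMESTAMP` column is present only for timestamp anchors (with `anchor_time='entity'`, the table falls back to `TaskTable.ENTITY_TIME` instead). Schematically, with made-up values:

    import pandas as pd

    context_df = pd.DataFrame({
        'ENTITY': [101, 205, 317],  # in-context entity primary keys
        'TARGET': [1, 0, 1],        # labels observed at the anchor time
        'ANCHOR_TIMESTAMP': pd.to_datetime(['2024-05-01'] * 3),
    })
    pred_df = pd.DataFrame({
        'ENTITY': [412, 523],       # entities to predict for
        'ANCHOR_TIMESTAMP': pd.to_datetime(['2024-06-01'] * 2),
    })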
+    def _get_context(
+        self,
+        task: TaskTable,
+        run_mode: RunMode | str = RunMode.FAST,
+        num_neighbors: list[int] | None = None,
+        exclude_cols_dict: dict[str, list[str]] | None = None,
+        top_k: int | None = None,
+    ) -> Context:
+
+        if num_neighbors is None:
+            key = RunMode.FAST if task.task_type.is_link_pred else run_mode
+            num_neighbors = _DEFAULT_NUM_NEIGHBORS[key][:2]
+
+        if len(num_neighbors) > 6:
+            raise ValueError(f"Cannot predict on subgraphs with more than 6 "
+                             f"hops (got {len(num_neighbors)}). Reduce the "
+                             f"number of hops and try again. Please create a "
+                             f"feature request at "
+                             f"'https://github.com/kumo-ai/kumo-rfm' if you "
+                             f"must go beyond this for your use-case.")
+
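`num_neighbors` is a per-hop fan-out list: entry i caps how many neighbors are sampled at hop i, so the list length is the number of hops and the running product bounds the subgraph size, which is why more than 6 hops is rejected. A sketch of the growth, assuming the new `_DEFAULT_NUM_NEIGHBORS` table mirrors the per-run-mode lists removed above (keyed by `RunMode` in the real code; plain strings here):

    # Hypothetical defaults, mirroring the removed per-run-mode lists:
    _DEFAULT_NUM_NEIGHBORS = {
        'debug': [16, 16, 4, 4, 1, 1],
        'fast': [32, 32, 8, 8, 4, 4],
        'best': [64, 64, 8, 8, 4, 4],
    }
    num_neighbors = _DEFAULT_NUM_NEIGHBORS['fast'][:2]  # two hops: [32, 32]

    # Worst-case number of sampled nodes per entity grows multiplicatively:
    total, frontier = 0, 1
    for fanout in num_neighbors:
        frontier *= fanout
        total += frontier
    print(total)  # 32 + 32 * 32 = 1056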
         # Exclude the entity anchor time from the feature set to prevent
         # running out-of-distribution between in-context and test examples:
-        exclude_cols_dict = query.get_exclude_cols_dict()
-        if anchor_time == 'entity':
-            if entity_table_names[0] not in exclude_cols_dict:
-                exclude_cols_dict[entity_table_names[0]] = []
-            time_column_dict = self._graph_store.time_column_dict
-            time_column = time_column_dict[entity_table_names[0]]
-            exclude_cols_dict[entity_table_names[0]].append(time_column)
-
-        subgraph = self._graph_sampler(
-            entity_table_names=entity_table_names,
-            node=np.concatenate([train_node, test_node]),
-            time=np.concatenate([
-                train_time.astype('datetime64[ns]').astype(int).to_numpy(),
-                test_time.astype('datetime64[ns]').astype(int).to_numpy(),
-            ]),
+        exclude_cols_dict = exclude_cols_dict or {}
+        if task.entity_table_name in self._sampler.time_column_dict:
+            if task.entity_table_name not in exclude_cols_dict:
+                exclude_cols_dict[task.entity_table_name] = []
+            time_col = self._sampler.time_column_dict[task.entity_table_name]
+            exclude_cols_dict[task.entity_table_name].append(time_col)
+
+        entity_pkey = pd.concat([
+            task._context_df[task._entity_column],
+            task._pred_df[task._entity_column],
+        ], axis=0, ignore_index=True)
+
+        if task.use_entity_time:
+            if task.entity_table_name not in self._sampler.time_column_dict:
+                raise ValueError(f"The given anchor time requires the entity "
+                                 f"table '{task.entity_table_name}' to have a "
+                                 f"time column")
+            anchor_time = 'entity'
+        elif task._time_column is not None:
+            anchor_time = pd.concat([
+                task._context_df[task._time_column],
+                task._pred_df[task._time_column],
+            ], axis=0, ignore_index=True)
+        else:
+            anchor_time = pd.Series(self._get_default_anchor_time()).repeat(
+                len(entity_pkey)).reset_index(drop=True)
+
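Three anchor-time sources are handled above: the entity's own time column (`'entity'`), an explicit per-row anchor-time column from the task, or a single default timestamp broadcast to every row. The broadcast uses the same `Series.repeat(...).reset_index(drop=True)` idiom as the prediction path earlier, for example:

    import pandas as pd

    default = pd.Timestamp('2024-06-30')  # stand-in for the derived default
    anchor_time = pd.Series(default).repeat(4).reset_index(drop=True)
    # 0   2024-06-30
    # 1   2024-06-30
    # 2   2024-06-30
    # 3   2024-06-30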
+        subgraph = self._sampler.sample_subgraph(
+            entity_table_names=task.entity_table_names,
+            entity_pkey=entity_pkey,
+            anchor_time=anchor_time,
             num_neighbors=num_neighbors,
             exclude_cols_dict=exclude_cols_dict,
         )
@@ -1049,23 +1348,26 @@ class KumoRFM:
                              f"'https://github.com/kumo-ai/kumo-rfm' if you "
                              f"must go beyond this for your use-case.")
 
-        step_size: Optional[int] = None
-        if query.query_type == QueryType.TEMPORAL:
-            step_size = date_offset_to_seconds(end_offset)
+        if (task.task_type.is_link_pred
+                and task.entity_table_names[-1] not in subgraph.table_dict):
+            raise ValueError("Cannot perform link prediction on subgraphs "
+                             "without any historical target entities. Please "
+                             "increase the number of hops and try again.")
 
         return Context(
-            task_type=task_type,
-            entity_table_names=entity_table_names,
+            task_type=task.task_type,
+            entity_table_names=task.entity_table_names,
             subgraph=subgraph,
-            y_train=y_train,
-            y_test=y_test,
-            top_k=query.top_k,
-            step_size=step_size,
+            y_train=task._context_df[task.target_column.name],
+            y_test=task._pred_df[task.target_column.name]
+            if task.evaluate else None,
+            top_k=top_k,
+            step_size=None,
         )
 
     @staticmethod
     def _validate_metrics(
-        metrics: List[str],
+        metrics: list[str],
         task_type: TaskType,
     ) -> None:
 
@@ -1122,7 +1424,7 @@ class KumoRFM:
                              f"'https://github.com/kumo-ai/kumo-rfm'.")
 
 
-def format_value(value: Union[int, float]) -> str:
+def format_value(value: int | float) -> str:
     if value == int(value):
         return f'{int(value):,}'
     if abs(value) >= 1000: