kumoai 2.9.0.dev202509061830__cp311-cp311-macosx_11_0_arm64.whl → 2.12.0.dev202511031731__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +4 -2
- kumoai/_version.py +1 -1
- kumoai/client/client.py +10 -5
- kumoai/client/rfm.py +3 -2
- kumoai/connector/file_upload_connector.py +71 -102
- kumoai/connector/utils.py +1367 -236
- kumoai/experimental/rfm/__init__.py +2 -2
- kumoai/experimental/rfm/authenticate.py +8 -5
- kumoai/experimental/rfm/infer/timestamp.py +7 -4
- kumoai/experimental/rfm/local_graph.py +90 -80
- kumoai/experimental/rfm/local_graph_sampler.py +16 -8
- kumoai/experimental/rfm/local_graph_store.py +22 -6
- kumoai/experimental/rfm/local_pquery_driver.py +129 -28
- kumoai/experimental/rfm/local_table.py +100 -22
- kumoai/experimental/rfm/pquery/__init__.py +4 -0
- kumoai/experimental/rfm/pquery/backend.py +4 -0
- kumoai/experimental/rfm/pquery/executor.py +102 -0
- kumoai/experimental/rfm/pquery/pandas_backend.py +71 -30
- kumoai/experimental/rfm/pquery/pandas_executor.py +506 -0
- kumoai/experimental/rfm/rfm.py +442 -94
- kumoai/jobs.py +1 -0
- kumoai/trainer/trainer.py +19 -10
- kumoai/utils/progress_logger.py +62 -0
- {kumoai-2.9.0.dev202509061830.dist-info → kumoai-2.12.0.dev202511031731.dist-info}/METADATA +4 -5
- {kumoai-2.9.0.dev202509061830.dist-info → kumoai-2.12.0.dev202511031731.dist-info}/RECORD +28 -26
- {kumoai-2.9.0.dev202509061830.dist-info → kumoai-2.12.0.dev202511031731.dist-info}/WHEEL +0 -0
- {kumoai-2.9.0.dev202509061830.dist-info → kumoai-2.12.0.dev202511031731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.9.0.dev202509061830.dist-info → kumoai-2.12.0.dev202511031731.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/rfm.py
CHANGED
```diff
@@ -1,13 +1,19 @@
 import json
+import time
 import warnings
-from
+from collections import defaultdict
+from collections.abc import Generator
+from contextlib import contextmanager
+from dataclasses import dataclass, replace
+from typing import Iterator, List, Literal, Optional, Union, overload
 
 import numpy as np
 import pandas as pd
 from kumoapi.model_plan import RunMode
 from kumoapi.pquery import QueryType
+from kumoapi.rfm import Context
+from kumoapi.rfm import Explanation as ExplanationConfig
 from kumoapi.rfm import (
-    Context,
     PQueryDefinition,
     RFMEvaluateRequest,
     RFMPredictRequest,
```
```diff
@@ -20,11 +26,17 @@ from kumoai.exceptions import HTTPException
 from kumoai.experimental.rfm import LocalGraph
 from kumoai.experimental.rfm.local_graph_sampler import LocalGraphSampler
 from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
-from kumoai.experimental.rfm.local_pquery_driver import
+from kumoai.experimental.rfm.local_pquery_driver import (
+    LocalPQueryDriver,
+    date_offset_to_seconds,
+)
 from kumoai.utils import InteractiveProgressLogger, ProgressLogger
 
 _RANDOM_SEED = 42
 
+_MAX_PRED_SIZE: dict[TaskType, int] = defaultdict(lambda: 1_000)
+_MAX_PRED_SIZE[TaskType.TEMPORAL_LINK_PREDICTION] = 200
+
 _MAX_CONTEXT_SIZE = {
     RunMode.DEBUG: 100,
     RunMode.FAST: 1_000,
```
```diff
@@ -39,7 +51,7 @@ _MAX_TEST_SIZE = { # Share test set size across run modes for fair comparison:
 }
 
 _MAX_SIZE = 30 * 1024 * 1024
-_SIZE_LIMIT_MSG = ("Context size exceeds the 30MB limit. {stats_msg}\nPlease "
+_SIZE_LIMIT_MSG = ("Context size exceeds the 30MB limit. {stats}\nPlease "
                    "reduce either the number of tables in the graph, their "
                    "number of columns (e.g., large text columns), "
                    "neighborhood configuration, or the run mode. If none of "
```
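The `{stats_msg}` placeholder was renamed to `{stats}` to match the updated call site (`_SIZE_LIMIT_MSG.format(stats=stats)` in `predict`). `str.format` raises a `KeyError` when the keyword and the placeholder disagree, so the template and its call sites have to move together; a minimal sketch with a made-up stats string:

```python
template = "Context size exceeds the 30MB limit. {stats}"
print(template.format(stats="tables: 18.2MB, edges: 9.4MB"))  # hypothetical stats
# template.format(stats_msg=...) would raise KeyError: 'stats'
```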
```diff
@@ -48,6 +60,34 @@ _SIZE_LIMIT_MSG = ("Context size exceeds the 30MB limit. {stats_msg}\nPlease "
                    "beyond this for your use-case.")
 
 
+@dataclass(repr=False)
+class Explanation:
+    prediction: pd.DataFrame
+    summary: str
+    details: ExplanationConfig
+
+    @overload
+    def __getitem__(self, index: Literal[0]) -> pd.DataFrame:
+        pass
+
+    @overload
+    def __getitem__(self, index: Literal[1]) -> str:
+        pass
+
+    def __getitem__(self, index: int) -> Union[pd.DataFrame, str]:
+        if index == 0:
+            return self.prediction
+        if index == 1:
+            return self.summary
+        raise IndexError("Index out of range")
+
+    def __iter__(self) -> Iterator[Union[pd.DataFrame, str]]:
+        return iter((self.prediction, self.summary))
+
+    def __repr__(self) -> str:
+        return str((self.prediction, self.summary))
+
+
 class KumoRFM:
     r"""The Kumo Relational Foundation model (RFM) from the `KumoRFM: A
     Foundation Model for In-Context Learning on Relational Data
```
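The new `Explanation` container is deliberately tuple-like: indexing and iteration expose only the `(prediction, summary)` pair, so callers that unpack two values keep working, while the structured `details` stay available as an attribute. A minimal sketch (all values are stand-ins, not real API output):

```python
import pandas as pd

explanation = Explanation(
    prediction=pd.DataFrame({'ENTITY': [1], 'TARGET_PRED': [0.87]}),  # made up
    summary='Driven by recent purchase activity.',                    # made up
    details=None,  # an `ExplanationConfig` in real usage
)

df, summary = explanation          # unpacking goes through __iter__
assert df is explanation[0]        # Literal[0] overload -> pd.DataFrame
assert summary == explanation[1]   # Literal[1] overload -> str
```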
```diff
@@ -105,28 +145,117 @@ class KumoRFM:
         self._graph_store = LocalGraphStore(graph, preprocess, verbose)
         self._graph_sampler = LocalGraphSampler(self._graph_store)
 
+        self._batch_size: Optional[int | Literal['max']] = None
+        self.num_retries: int = 0
+
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}()'
 
+    @contextmanager
+    def batch_mode(
+        self,
+        batch_size: Union[int, Literal['max']] = 'max',
+        num_retries: int = 1,
+    ) -> Generator[None, None, None]:
+        """Context manager to predict in batches.
+
+        .. code-block:: python
+
+            with model.batch_mode(batch_size='max', num_retries=1):
+                df = model.predict(query, indices=...)
+
+        Args:
+            batch_size: The batch size. If set to ``"max"``, will use the
+                maximum applicable batch size for the given task.
+            num_retries: The maximum number of retries for failed queries due
+                to unexpected server issues.
+        """
+        if batch_size != 'max' and batch_size <= 0:
+            raise ValueError(f"'batch_size' must be greater than zero "
+                             f"(got {batch_size})")
+
+        if num_retries < 0:
+            raise ValueError(f"'num_retries' must be greater than or equal to "
+                             f"zero (got {num_retries})")
+
+        self._batch_size = batch_size
+        self.num_retries = num_retries
+        yield
+        self._batch_size = None
+        self.num_retries = 0
+
+    @overload
     def predict(
         self,
         query: str,
+        indices: Union[List[str], List[float], List[int], None] = None,
         *,
+        explain: Literal[False] = False,
         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
+        context_anchor_time: Union[pd.Timestamp, None] = None,
         run_mode: Union[RunMode, str] = RunMode.FAST,
         num_neighbors: Optional[List[int]] = None,
         num_hops: int = 2,
         max_pq_iterations: int = 20,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
     ) -> pd.DataFrame:
+        pass
+
+    @overload
+    def predict(
+        self,
+        query: str,
+        indices: Union[List[str], List[float], List[int], None] = None,
+        *,
+        explain: Literal[True],
+        anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
+        context_anchor_time: Union[pd.Timestamp, None] = None,
+        run_mode: Union[RunMode, str] = RunMode.FAST,
+        num_neighbors: Optional[List[int]] = None,
+        num_hops: int = 2,
+        max_pq_iterations: int = 20,
+        random_seed: Optional[int] = _RANDOM_SEED,
+        verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
+    ) -> Explanation:
+        pass
+
+    def predict(
+        self,
+        query: str,
+        indices: Union[List[str], List[float], List[int], None] = None,
+        *,
+        explain: bool = False,
+        anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
+        context_anchor_time: Union[pd.Timestamp, None] = None,
+        run_mode: Union[RunMode, str] = RunMode.FAST,
+        num_neighbors: Optional[List[int]] = None,
+        num_hops: int = 2,
+        max_pq_iterations: int = 20,
+        random_seed: Optional[int] = _RANDOM_SEED,
+        verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
+    ) -> Union[pd.DataFrame, Explanation]:
         """Returns predictions for a predictive query.
 
         Args:
             query: The predictive query.
-
-
-
+            indices: The entity primary keys to predict for. Will override the
+                indices given as part of the predictive query. Predictions will
+                be generated for all indices, independent of whether they
+                fulfill entity filter constraints. To pre-filter entities, use
+                :meth:`~KumoRFM.is_valid_entity`.
+            explain: If set to ``True``, will additionally explain the
+                prediction. Explainability is currently only supported for
+                single entity predictions with ``run_mode="FAST"``.
+            anchor_time: The anchor timestamp for the prediction. If set to
+                ``None``, will use the maximum timestamp in the data.
+                If set to ``"entity"``, will use the timestamp of the entity.
+            context_anchor_time: The maximum anchor timestamp for context
+                examples. If set to ``None``, ``anchor_time`` will
+                determine the anchor time for context examples.
             run_mode: The :class:`RunMode` for the query.
             num_neighbors: The number of neighbors to sample for each hop.
                 If specified, the ``num_hops`` option will be ignored.
```
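The three `predict` signatures are two `@overload` stubs plus the implementation: the static return type narrows to `Explanation` only for a literal `explain=True`. Together with `batch_mode`, an oversized `indices` list is split into chunks capped by `_MAX_PRED_SIZE` (1,000 entities per request by default, 200 for temporal link prediction). A usage sketch, in which the graph, query string, and primary keys are assumptions:

```python
model = KumoRFM(graph)  # assumes an already-built LocalGraph

query = 'PREDICT COUNT(orders.*, 0, 30, days) > 0 FOR EACH users.user_id'
user_ids = [1, 2, 3, 42, 1337]  # hypothetical primary keys

# Split `indices` into max-sized batches; retry each failed batch once:
with model.batch_mode(batch_size='max', num_retries=1):
    df = model.predict(query, indices=user_ids)
```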
```diff
@@ -138,11 +267,15 @@ class KumoRFM:
                 entities to find valid labels.
             random_seed: A manual seed for generating pseudo-random numbers.
             verbose: Whether to print verbose output.
+            use_prediction_time: Whether to use the anchor timestamp as an
+                additional feature during prediction. This is typically
+                beneficial for time series forecasting tasks.
 
         Returns:
-            The predictions as a :class:`pandas.DataFrame
+            The predictions as a :class:`pandas.DataFrame`.
+            If ``explain=True``, additionally returns a textual summary that
+            explains the prediction.
         """
-        explain = False
         query_def = self._parse_query(query)
 
         if num_hops != 2 and num_neighbors is not None:
```
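Explainability reuses the same entry point but is restricted to a single entity under the default `run_mode="FAST"`, so a typical call looks like this sketch (continuing the assumed names above):

```python
result = model.predict(query, indices=[42], explain=True)

print(result.summary)   # textual explanation of the single prediction
result.details          # structured ExplanationConfig
df, summary = result    # tuple-style unpacking still works
```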
```diff
@@ -155,12 +288,24 @@ class KumoRFM:
                           f"mode has been reset. Please lower the run mode to "
                           f"suppress this warning.")
 
-        if
-
-
-
-
-
+        if indices is None:
+            if query_def.entity.ids is None:
+                raise ValueError("Cannot find entities to predict for. Please "
+                                 "pass them via `predict(query, indices=...)`")
+            indices = query_def.entity.ids.value
+        else:
+            query_def = replace(
+                query_def,
+                entity=replace(query_def.entity, ids=None),
+            )
+
+        if len(indices) == 0:
+            raise ValueError("At least one entity is required")
+
+        if explain and len(indices) > 1:
+            raise ValueError(
+                f"Cannot explain predictions for more than a single entity "
+                f"(got {len(indices)})")
 
         query_repr = query_def.to_string(rich=True, exclude_predict=True)
         if explain:
```
```diff
@@ -172,48 +317,185 @@ class KumoRFM:
             verbose = InteractiveProgressLogger(msg, verbose=verbose)
 
         with verbose as logger:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            batch_size: Optional[int] = None
+            if self._batch_size == 'max':
+                task_type = query_def.get_task_type(
+                    stypes=self._graph_store.stype_dict,
+                    edge_types=self._graph_store.edge_types,
+                )
+                batch_size = _MAX_PRED_SIZE[task_type]
+            else:
+                batch_size = self._batch_size
+
+            if batch_size is not None:
+                offsets = range(0, len(indices), batch_size)
+                batches = [indices[step:step + batch_size] for step in offsets]
+            else:
+                batches = [indices]
+
+            if len(batches) > 1:
+                logger.log(f"Splitting {len(indices):,} entities into "
+                           f"{len(batches):,} batches of size {batch_size:,}")
+
+            predictions: List[pd.DataFrame] = []
+            summary: Optional[str] = None
+            details: Optional[Explanation] = None
+            for i, batch in enumerate(batches):
+                # TODO Re-use the context for subsequent predictions.
+                context = self._get_context(
+                    query=query_def,
+                    indices=batch,
+                    anchor_time=anchor_time,
+                    context_anchor_time=context_anchor_time,
+                    run_mode=RunMode(run_mode),
+                    num_neighbors=num_neighbors,
+                    num_hops=num_hops,
+                    max_pq_iterations=max_pq_iterations,
+                    evaluate=False,
+                    random_seed=random_seed,
+                    logger=logger if i == 0 else None,
+                )
+                request = RFMPredictRequest(
+                    context=context,
+                    run_mode=RunMode(run_mode),
+                    use_prediction_time=use_prediction_time,
+                )
+                with warnings.catch_warnings():
+                    warnings.filterwarnings('ignore', message='gencode')
+                    request_msg = request.to_protobuf()
+                _bytes = request_msg.SerializeToString()
+                if i == 0:
+                    logger.log(f"Generated context of size "
+                               f"{len(_bytes) / (1024*1024):.2f}MB")
+
+                if len(_bytes) > _MAX_SIZE:
+                    stats = Context.get_memory_stats(request_msg.context)
+                    raise ValueError(_SIZE_LIMIT_MSG.format(stats=stats))
+
+                if (isinstance(verbose, InteractiveProgressLogger) and i == 0
+                        and len(batches) > 1):
+                    verbose.init_progress(
+                        total=len(batches),
+                        description='Predicting',
+                    )
+
+                for attempt in range(self.num_retries + 1):
+                    try:
+                        if explain:
+                            resp = global_state.client.rfm_api.explain(_bytes)
+                            summary = resp.summary
+                            details = resp.details
+                        else:
+                            resp = global_state.client.rfm_api.predict(_bytes)
+                        df = pd.DataFrame(**resp.prediction)
+
+                        # Cast 'ENTITY' to correct data type:
+                        if 'ENTITY' in df:
+                            entity = query_def.entity.pkey.table_name
+                            pkey_map = self._graph_store.pkey_map_dict[entity]
+                            df['ENTITY'] = df['ENTITY'].astype(
+                                type(pkey_map.index[0]))
+
+                        # Cast 'ANCHOR_TIMESTAMP' to correct data type:
+                        if 'ANCHOR_TIMESTAMP' in df:
+                            ser = df['ANCHOR_TIMESTAMP']
+                            if not pd.api.types.is_datetime64_any_dtype(ser):
+                                if isinstance(ser.iloc[0], str):
+                                    unit = None
+                                else:
+                                    unit = 'ms'
+                                df['ANCHOR_TIMESTAMP'] = pd.to_datetime(
+                                    ser, errors='coerce', unit=unit)
+
+                        predictions.append(df)
+
+                        if (isinstance(verbose, InteractiveProgressLogger)
+                                and len(batches) > 1):
+                            verbose.step()
+
+                        break
+                    except HTTPException as e:
+                        if attempt == self.num_retries:
+                            try:
+                                msg = json.loads(e.detail)['detail']
+                            except Exception:
+                                msg = e.detail
+                            raise RuntimeError(
+                                f"An unexpected exception occurred. Please "
+                                f"create an issue at "
+                                f"'https://github.com/kumo-ai/kumo-rfm'. {msg}"
+                            ) from None
+
+                        time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...
+
+            if len(predictions) == 1:
+                prediction = predictions[0]
+            else:
+                prediction = pd.concat(predictions, ignore_index=True)
+
+            if explain:
+                assert len(predictions) == 1
+                assert summary is not None
+                assert details is not None
+                return Explanation(
+                    prediction=prediction,
+                    summary=summary,
+                    details=details,
                 )
-            with warnings.catch_warnings():
-                warnings.filterwarnings('ignore', message='Protobuf gencode')
-                request_msg = request.to_protobuf()
-                request_bytes = request_msg.SerializeToString()
-                logger.log(f"Generated context of size "
-                           f"{len(request_bytes) / (1024*1024):.2f}MB")
 
-
-                stats_msg = Context.get_memory_stats(request_msg.context)
-                raise ValueError(_SIZE_LIMIT_MSG.format(stats_msg=stats_msg))
+            return prediction
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def is_valid_entity(
+        self,
+        query: str,
+        indices: Union[List[str], List[float], List[int], None] = None,
+        *,
+        anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
+    ) -> np.ndarray:
+        r"""Returns a mask that denotes which entities are valid for the
+        given predictive query, *i.e.*, which entities fulfill (temporal)
+        entity filter constraints.
+
+        Args:
+            query: The predictive query.
+            indices: The entity primary keys to predict for. Will override the
+                indices given as part of the predictive query.
+            anchor_time: The anchor timestamp for the prediction. If set to
+                ``None``, will use the maximum timestamp in the data.
+                If set to ``"entity"``, will use the timestamp of the entity.
+        """
+        query_def = self._parse_query(query)
+
+        if indices is None:
+            if query_def.entity.ids is None:
+                raise ValueError("Cannot find entities to predict for. Please "
+                                 "pass them via "
+                                 "`is_valid_entity(query, indices=...)`")
+            indices = query_def.entity.ids.value
+
+        if len(indices) == 0:
+            raise ValueError("At least one entity is required")
+
+        if anchor_time is None:
+            anchor_time = self._graph_store.max_time
+
+        if isinstance(anchor_time, pd.Timestamp):
+            self._validate_time(query_def, anchor_time, None, False)
+        else:
+            assert anchor_time == 'entity'
+            if (query_def.entity.pkey.table_name
+                    not in self._graph_store.time_dict):
+                raise ValueError(f"Anchor time 'entity' requires the entity "
+                                 f"table '{query_def.entity.pkey.table_name}' "
+                                 f"to have a time column")
 
-
+        node = self._graph_store.get_node_id(
+            table_name=query_def.entity.pkey.table_name,
+            pkey=pd.Series(indices),
+        )
+        query_driver = LocalPQueryDriver(self._graph_store, query_def)
+        return query_driver.is_valid(node, anchor_time)
 
     def evaluate(
         self,
```
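Since `predict(indices=...)` deliberately skips entity-filter constraints, `is_valid_entity` is the companion pre-filter: it returns a boolean mask aligned with `indices`. A sketch, again with assumed names:

```python
mask = model.is_valid_entity(query, indices=user_ids)  # boolean np.ndarray

valid_ids = [pkey for pkey, ok in zip(user_ids, mask) if ok]
df = model.predict(query, indices=valid_ids)
```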
```diff
@@ -221,21 +503,26 @@ class KumoRFM:
         *,
         metrics: Optional[List[str]] = None,
         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
+        context_anchor_time: Union[pd.Timestamp, None] = None,
         run_mode: Union[RunMode, str] = RunMode.FAST,
         num_neighbors: Optional[List[int]] = None,
         num_hops: int = 2,
         max_pq_iterations: int = 20,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
     ) -> pd.DataFrame:
         """Evaluates a predictive query.
 
         Args:
             query: The predictive query.
             metrics: The metrics to use.
-            anchor_time: The anchor timestamp for the
-
-            If set to
+            anchor_time: The anchor timestamp for the prediction. If set to
+                ``None``, will use the maximum timestamp in the data.
+                If set to ``"entity"``, will use the timestamp of the entity.
+            context_anchor_time: The maximum anchor timestamp for context
+                examples. If set to ``None``, ``anchor_time`` will
+                determine the anchor time for context examples.
             run_mode: The :class:`RunMode` for the query.
             num_neighbors: The number of neighbors to sample for each hop.
                 If specified, the ``num_hops`` option will be ignored.
```
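`context_anchor_time` decouples the in-context examples from the evaluation anchor, e.g. to mimic a deployment in which labeled context always lags predictions by one week. A sketch with illustrative timestamps:

```python
import pandas as pd

metrics_df = model.evaluate(
    query,
    anchor_time=pd.Timestamp('2025-06-01'),
    context_anchor_time=pd.Timestamp('2025-05-25'),  # context ends a week earlier
)
```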
```diff
@@ -247,6 +534,9 @@ class KumoRFM:
                 entities to find valid labels.
             random_seed: A manual seed for generating pseudo-random numbers.
             verbose: Whether to print verbose output.
+            use_prediction_time: Whether to use the anchor timestamp as an
+                additional feature during prediction. This is typically
+                beneficial for time series forecasting tasks.
 
         Returns:
             The metrics as a :class:`pandas.DataFrame`
```
```diff
@@ -257,6 +547,12 @@ class KumoRFM:
             warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
                           f"custom 'num_hops={num_hops}' option")
 
+        if query_def.entity.ids is not None:
+            query_def = replace(
+                query_def,
+                entity=replace(query_def.entity, ids=None),
+            )
+
         query_repr = query_def.to_string(rich=True, exclude_predict=True)
         msg = f'[bold]EVALUATE[/bold] {query_repr}'
 
```
```diff
@@ -265,8 +561,10 @@ class KumoRFM:
 
         with verbose as logger:
             context = self._get_context(
-                query_def,
+                query=query_def,
+                indices=None,
                 anchor_time=anchor_time,
+                context_anchor_time=context_anchor_time,
                 run_mode=RunMode(run_mode),
                 num_neighbors=num_neighbors,
                 num_hops=num_hops,
```
```diff
@@ -282,6 +580,7 @@ class KumoRFM:
                 context=context,
                 run_mode=RunMode(run_mode),
                 metrics=metrics,
+                use_prediction_time=use_prediction_time,
             )
             with warnings.catch_warnings():
                 warnings.filterwarnings('ignore', message='Protobuf gencode')
```
```diff
@@ -340,11 +639,12 @@ class KumoRFM:
 
         if anchor_time is None:
             anchor_time = self._graph_store.max_time
-            anchor_time = anchor_time - query_def.target.end_offset
+            anchor_time = anchor_time - (query_def.target.end_offset *
+                                         query_def.num_forecasts)
 
         assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
-            self._validate_time(query_def, anchor_time, evaluate=True)
+            self._validate_time(query_def, anchor_time, None, evaluate=True)
         else:
             assert anchor_time == 'entity'
             if (query_def.entity.pkey.table_name
```
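For forecasting queries the default evaluation anchor now steps back by the full horizon `end_offset * num_forecasts` instead of a single step, so that every forecast step still has ground-truth labels. A worked example with illustrative values:

```python
import pandas as pd

max_time = pd.Timestamp('2025-06-30')  # latest timestamp in the data
end_offset = pd.DateOffset(days=7)     # one forecast step
num_forecasts = 4                      # steps per forecast

anchor_time = max_time - end_offset * num_forecasts
print(anchor_time)  # 2025-06-02 00:00:00, leaving 4 x 7 days of labels
```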
```diff
@@ -361,10 +661,14 @@ class KumoRFM:
             anchor_time=anchor_time,
             batch_size=min(10_000, size),
             max_iterations=max_iterations,
+            guarantee_train_examples=False,
         )
 
+        entity = self._graph_store.pkey_map_dict[
+            query_def.entity.pkey.table_name].index[node]
+
         return pd.DataFrame({
-            'ENTITY':
+            'ENTITY': entity,
             'ANCHOR_TIMESTAMP': time,
             'TARGET': y,
         })
```
```diff
@@ -411,6 +715,7 @@ class KumoRFM:
         self,
         query: PQueryDefinition,
         anchor_time: pd.Timestamp,
+        context_anchor_time: Union[pd.Timestamp, None],
         evaluate: bool,
     ) -> None:
 
```
```diff
@@ -422,22 +727,41 @@ class KumoRFM:
                              f"the earliest timestamp "
                              f"'{self._graph_store.min_time}' in the data.")
 
-        if
-
-
-            f"
-            f"
-            f"however, your data
+        if (context_anchor_time is not None
+                and context_anchor_time < self._graph_store.min_time):
+            raise ValueError(f"Context anchor timestamp is too early or "
+                             f"aggregation time range is too large. To make "
+                             f"this prediction, we would need data back to "
+                             f"'{context_anchor_time}', however, your data "
+                             f"only contains data back to "
                              f"'{self._graph_store.min_time}'.")
 
-        if (
-
-            warnings.warn(f"
-                          f"
-                          f"
-                          f"'{anchor_time
-                          f"
-
+        if (context_anchor_time is not None
+                and context_anchor_time > anchor_time):
+            warnings.warn(f"Context anchor timestamp "
+                          f"(got '{context_anchor_time}') is set to a later "
+                          f"date than the prediction anchor timestamp "
+                          f"(got '{anchor_time}'). Please make sure this is "
+                          f"intended.")
+        elif (query.query_type == QueryType.TEMPORAL
+              and context_anchor_time is not None and context_anchor_time +
+              query.target.end_offset * query.num_forecasts > anchor_time):
+            warnings.warn(f"Aggregation for context examples at timestamp "
+                          f"'{context_anchor_time}' will leak information "
+                          f"from the prediction anchor timestamp "
+                          f"'{anchor_time}'. Please make sure this is "
+                          f"intended.")
+
+        elif (context_anchor_time is not None and context_anchor_time -
+              query.target.end_offset * query.num_forecasts
+              < self._graph_store.min_time):
+            _time = context_anchor_time - (query.target.end_offset *
+                                           query.num_forecasts)
+            warnings.warn(f"Context anchor timestamp is too early or "
+                          f"aggregation time range is too large. To form "
+                          f"proper input data, we would need data back to "
+                          f"'{_time}', however, your data only contains "
+                          f"data back to '{self._graph_store.min_time}'.")
 
         if (not evaluate and anchor_time
                 > self._graph_store.max_time + pd.DateOffset(days=1)):
```
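The new `context_anchor_time` checks reject a context anchor before the earliest data, then warn when the context anchor lies after the prediction anchor, when context aggregation would leak information past the prediction anchor, and when the context window would extend before the earliest data. The leakage condition in isolation, with illustrative values:

```python
import pandas as pd

anchor_time = pd.Timestamp('2025-06-30')
context_anchor_time = pd.Timestamp('2025-06-27')
horizon = pd.DateOffset(days=7) * 1  # end_offset * num_forecasts

# Mirrors the temporal leakage check in _validate_time:
print(context_anchor_time + horizon > anchor_time)  # True -> a warning fires
```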
```diff
@@ -445,17 +769,19 @@ class KumoRFM:
                           f"latest timestamp '{self._graph_store.max_time}' "
                           f"in the data. Please make sure this is intended.")
 
-
-
+        max_eval_time = (self._graph_store.max_time -
+                         query.target.end_offset * query.num_forecasts)
+        if evaluate and anchor_time > max_eval_time:
             raise ValueError(
                 f"Anchor timestamp for evaluation is after the latest "
-                f"supported timestamp "
-                f"'{self._graph_store.max_time - query.target.end_offset}'.")
+                f"supported timestamp '{max_eval_time}'.")
 
     def _get_context(
         self,
         query: PQueryDefinition,
+        indices: Union[List[str], List[float], List[int], None],
         anchor_time: Union[pd.Timestamp, Literal['entity'], None],
+        context_anchor_time: Union[pd.Timestamp, None],
         run_mode: RunMode,
         num_neighbors: Optional[List[int]],
         num_hops: int,
```
```diff
@@ -516,25 +842,36 @@ class KumoRFM:
         if anchor_time is None:
             anchor_time = self._graph_store.max_time
             if evaluate:
-                anchor_time = anchor_time - query.target.end_offset
+                anchor_time = anchor_time - (query.target.end_offset *
+                                             query.num_forecasts)
             if logger is not None:
                 assert isinstance(anchor_time, pd.Timestamp)
-                if
-
-
+                if anchor_time == pd.Timestamp.min:
+                    pass  # Static graph
+                elif (anchor_time.hour == 0 and anchor_time.minute == 0
+                      and anchor_time.second == 0
+                      and anchor_time.microsecond == 0):
                     logger.log(f"Derived anchor time {anchor_time.date()}")
                 else:
                     logger.log(f"Derived anchor time {anchor_time}")
 
         assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
-
+            if context_anchor_time is None:
+                context_anchor_time = anchor_time - (query.target.end_offset *
+                                                     query.num_forecasts)
+            self._validate_time(query, anchor_time, context_anchor_time,
+                                evaluate)
         else:
             assert anchor_time == 'entity'
             if query.entity.pkey.table_name not in self._graph_store.time_dict:
                 raise ValueError(f"Anchor time 'entity' requires the entity "
                                  f"table '{query.entity.pkey.table_name}' to "
                                  f"have a time column")
+            if context_anchor_time is not None:
+                warnings.warn("Ignoring option 'context_anchor_time' for "
+                              "`anchor_time='entity'`")
+                context_anchor_time = None
 
         y_test: Optional[pd.Series] = None
         if evaluate:
```
```diff
@@ -546,6 +883,7 @@ class KumoRFM:
                 size=max_test_size,
                 anchor_time=anchor_time,
                 max_iterations=max_pq_iterations,
+                guarantee_train_examples=True,
             )
             if logger is not None:
                 if task_type == TaskType.BINARY_CLASSIFICATION:
```
```diff
@@ -569,20 +907,18 @@ class KumoRFM:
                 logger.log(msg)
 
         else:
-            assert
+            assert indices is not None
 
-
-            if len(query.entity.ids.value) > max_num_test:
+            if len(indices) > _MAX_PRED_SIZE[task_type]:
                 raise ValueError(f"Cannot predict for more than "
-                                 f"{
-                                 f"(got {len(
+                                 f"{_MAX_PRED_SIZE[task_type]:,} entities at "
+                                 f"once (got {len(indices):,}). Use "
+                                 f"`KumoRFM.batch_mode` to process entities "
+                                 f"in batches")
 
             test_node = self._graph_store.get_node_id(
                 table_name=query.entity.pkey.table_name,
-                pkey=pd.Series(
-                    query.entity.ids.value,
-                    dtype=query.entity.ids.dtype,
-                ),
+                pkey=pd.Series(indices),
             )
 
         if isinstance(anchor_time, pd.Timestamp):
```
```diff
@@ -596,7 +932,7 @@ class KumoRFM:
 
         train_node, train_time, y_train = query_driver.collect_train(
             size=_MAX_CONTEXT_SIZE[run_mode],
-            anchor_time=
+            anchor_time=context_anchor_time or 'entity',
             exclude_node=test_node if (query.query_type == QueryType.STATIC
                                        or anchor_time == 'entity') else None,
             max_iterations=max_pq_iterations,
```
```diff
@@ -648,6 +984,17 @@ class KumoRFM:
             exclude_cols_dict=exclude_cols_dict,
         )
 
+        if len(subgraph.table_dict) >= 15:
+            raise ValueError(f"Cannot query from a graph with more than 15 "
+                             f"tables (got {len(subgraph.table_dict)}). "
+                             f"Please create a feature request at "
+                             f"'https://github.com/kumo-ai/kumo-rfm' if you "
+                             f"must go beyond this for your use-case.")
+
+        step_size: Optional[int] = None
+        if query.query_type == QueryType.TEMPORAL:
+            step_size = date_offset_to_seconds(query.target.end_offset)
+
         return Context(
             task_type=task_type,
             entity_table_names=entity_table_names,
```
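`step_size` serializes the forecast step into the `Context` as a number of seconds through the newly exported `date_offset_to_seconds`. The helper itself lives in `local_pquery_driver` and may differ; a hypothetical stand-in for fixed-size offsets illustrates the conversion:

```python
import pandas as pd

def date_offset_to_seconds(offset: pd.DateOffset) -> int:
    # Hypothetical stand-in: measure the offset against a fixed reference.
    ref = pd.Timestamp('1970-01-01')
    return int(((ref + offset) - ref).total_seconds())

print(date_offset_to_seconds(pd.DateOffset(days=7)))  # 604800
```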
```diff
@@ -655,6 +1002,7 @@ class KumoRFM:
             y_train=y_train,
             y_test=y_test,
             top_k=query.top_k,
+            step_size=step_size,
         )
 
     @staticmethod
```
```diff
@@ -670,7 +1018,7 @@ class KumoRFM:
         elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
             supported_metrics = ['acc', 'precision', 'recall', 'f1', 'mrr']
         elif task_type == TaskType.REGRESSION:
-            supported_metrics = ['mae', 'mape', 'mse', 'rmse', 'smape']
+            supported_metrics = ['mae', 'mape', 'mse', 'rmse', 'smape', 'r2']
         elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
            supported_metrics = [
                'map@', 'ndcg@', 'mrr@', 'precision@', 'recall@', 'f1@',
```