PyPI - kumoai - Versions diffs - 2.14.0.dev202512191731__cp311-cp311-macosx_11_0_arm64.whl → 2.14.0.dev202601051732__cp311-cp311-macosx_11_0_arm64.whl - Mend

kumoai 2.14.0.dev202512191731__cp311-cp311-macosx_11_0_arm64.whl → 2.14.0.dev202601051732__cp311-cp311-macosx_11_0_arm64.whl

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (36)

kumoai/__init__.py +23 -26
kumoai/_version.py +1 -1
kumoai/client/client.py +6 -0
kumoai/client/jobs.py +24 -0
kumoai/connector/utils.py +21 -7
kumoai/experimental/rfm/__init__.py +24 -22
kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
kumoai/experimental/rfm/backend/local/sampler.py +0 -3
kumoai/experimental/rfm/backend/local/table.py +24 -25
kumoai/experimental/rfm/backend/snow/sampler.py +106 -61
kumoai/experimental/rfm/backend/snow/table.py +137 -64
kumoai/experimental/rfm/backend/sqlite/sampler.py +127 -78
kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
kumoai/experimental/rfm/base/__init__.py +6 -9
kumoai/experimental/rfm/base/column.py +95 -11
kumoai/experimental/rfm/base/expression.py +44 -0
kumoai/experimental/rfm/base/sampler.py +5 -17
kumoai/experimental/rfm/base/source.py +1 -1
kumoai/experimental/rfm/base/sql_sampler.py +69 -9
kumoai/experimental/rfm/base/table.py +258 -97
kumoai/experimental/rfm/graph.py +106 -98
kumoai/experimental/rfm/infer/dtype.py +4 -1
kumoai/experimental/rfm/infer/multicategorical.py +1 -1
kumoai/experimental/rfm/relbench.py +76 -0
kumoai/experimental/rfm/rfm.py +394 -241
kumoai/experimental/rfm/task_table.py +290 -0
kumoai/trainer/distilled_trainer.py +175 -0
kumoai/utils/display.py +51 -0
kumoai/utils/progress_logger.py +13 -1
{kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.14.0.dev202601051732.dist-info}/METADATA +1 -1
{kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.14.0.dev202601051732.dist-info}/RECORD +34 -31
kumoai/experimental/rfm/base/column_expression.py +0 -50
kumoai/experimental/rfm/base/sql_table.py +0 -229
{kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.14.0.dev202601051732.dist-info}/WHEEL +0 -0
{kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.14.0.dev202601051732.dist-info}/licenses/LICENSE +0 -0
{kumoai-2.14.0.dev202512191731.dist-info → kumoai-2.14.0.dev202601051732.dist-info}/top_level.txt +0 -0

kumoai/experimental/rfm/rfm.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
+import math
 import time
 import warnings
 from collections import defaultdict
@@ -28,30 +29,33 @@ from kumoapi.rfm import (
 from kumoapi.task import TaskType
 from kumoapi.typing import AggregationType, Stype
-from kumoai import in_notebook, in_snowflake_notebook
 from kumoai.client.rfm import RFMAPI
 from kumoai.exceptions import HTTPException
-from kumoai.experimental.rfm import Graph
+from kumoai.experimental.rfm import Graph, TaskTable
 from kumoai.experimental.rfm.base import DataBackend, Sampler
 from kumoai.mixin import CastMixin
-from kumoai.utils import ProgressLogger
+from kumoai.utils import ProgressLogger, display
 _RANDOM_SEED = 42
 _MAX_PRED_SIZE: dict[TaskType, int] = defaultdict(lambda: 1_000)
 _MAX_PRED_SIZE[TaskType.TEMPORAL_LINK_PREDICTION] = 200
+_MAX_TEST_SIZE: dict[TaskType, int] = defaultdict(lambda: 2_000)
+_MAX_TEST_SIZE[TaskType.TEMPORAL_LINK_PREDICTION] = 400
 _MAX_CONTEXT_SIZE = {
     RunMode.DEBUG: 100,
     RunMode.FAST: 1_000,
     RunMode.NORMAL: 5_000,
     RunMode.BEST: 10_000,
 }
-_MAX_TEST_SIZE = {  # Share test set size across run modes for fair comparison:
-    RunMode.DEBUG: 100,
-    RunMode.FAST: 2_000,
-    RunMode.NORMAL: 2_000,
-    RunMode.BEST: 2_000,
+_DEFAULT_NUM_NEIGHBORS = {
+    RunMode.DEBUG: [16, 16, 4, 4, 1, 1],
+    RunMode.FAST: [32, 32, 8, 8, 4, 4],
+    RunMode.NORMAL: [64, 64, 8, 8, 4, 4],
+    RunMode.BEST: [64, 64, 8, 8, 4, 4],
 }
 _MAX_SIZE = 30 * 1024 * 1024
@@ -104,23 +108,8 @@ class Explanation:
     def print(self) -> None:
         r"""Prints the explanation."""
-        if in_snowflake_notebook():
-            import streamlit as st
-            st.dataframe(self.prediction, hide_index=True)
-            st.markdown(self.summary)
-        elif in_notebook():
-            from IPython.display import Markdown, display
-            try:
-                if hasattr(self.prediction.style, 'hide'):
-                    display(self.prediction.hide(axis='index'))  # pandas=2
-                else:
-                    display(self.prediction.hide_index())  # pandas <1.3
-            except ImportError:
-                print(self.prediction.to_string(index=False))  # missing jinja2
-            display(Markdown(self.summary))
-        else:
-            print(self.prediction.to_string(index=False))
-            print(self.summary)
+        display.dataframe(self.prediction)
+        display.message(self.summary)
     def _ipython_display_(self) -> None:
         self.print()
@@ -333,18 +322,133 @@ class KumoRFM:
             If ``explain`` is provided, returns an :class:`Explanation` object
             containing the prediction, summary, and details.
         """
-        explain_config: ExplainConfig | None = None
-        if explain is True:
-            explain_config = ExplainConfig()
-        elif explain is not False:
-            explain_config = ExplainConfig._cast(explain)
         query_def = self._parse_query(query)
-        query_str = query_def.to_string()
+        if indices is None:
+            if query_def.rfm_entity_ids is None:
+                raise ValueError("Cannot find entities to predict for. Please "
+                                 "pass them via `predict(query, indices=...)`")
+            indices = query_def.get_rfm_entity_id_list()
+        else:
+            query_def = replace(query_def, rfm_entity_ids=None)
+        if not isinstance(verbose, ProgressLogger):
+            query_repr = query_def.to_string(rich=True, exclude_predict=True)
+            if explain is not False:
+                msg = f'[bold]EXPLAIN[/bold] {query_repr}'
+            else:
+                msg = f'[bold]PREDICT[/bold] {query_repr}'
+            verbose = ProgressLogger.default(msg=msg, verbose=verbose)
+        with verbose as logger:
+            task_table = self._get_task_table(
+                query=query_def,
+                indices=indices,
+                anchor_time=anchor_time,
+                context_anchor_time=context_anchor_time,
+                run_mode=run_mode,
+                max_pq_iterations=max_pq_iterations,
+                random_seed=random_seed,
+                logger=logger,
+            )
+            task_table._query = query_def.to_string()  # type: ignore
+            return self.predict_task(
+                task_table,
+                explain=explain,  # type: ignore
+                run_mode=run_mode,
+                num_neighbors=num_neighbors,
+                num_hops=num_hops,
+                verbose=verbose,
+                exclude_cols_dict=query_def.get_exclude_cols_dict(),
+                use_prediction_time=use_prediction_time,
+                top_k=query_def.top_k,
+            )
+    @overload
+    def predict_task(
+        self,
+        task: TaskTable,
+        *,
+        explain: Literal[False] = False,
+        run_mode: RunMode | str = RunMode.FAST,
+        num_neighbors: list[int] | None = None,
+        num_hops: int = 2,
+        verbose: bool | ProgressLogger = True,
+        exclude_cols_dict: dict[str, list[str]] | None = None,
+        use_prediction_time: bool = False,
+        top_k: int | None = None,
+    ) -> pd.DataFrame:
+        pass
+    @overload
+    def predict_task(
+        self,
+        task: TaskTable,
+        *,
+        explain: Literal[True] | ExplainConfig | dict[str, Any],
+        run_mode: RunMode | str = RunMode.FAST,
+        num_neighbors: list[int] | None = None,
+        num_hops: int = 2,
+        verbose: bool | ProgressLogger = True,
+        exclude_cols_dict: dict[str, list[str]] | None = None,
+        use_prediction_time: bool = False,
+        top_k: int | None = None,
+    ) -> Explanation:
+        pass
+    def predict_task(
+        self,
+        task: TaskTable,
+        *,
+        explain: bool | ExplainConfig | dict[str, Any] = False,
+        run_mode: RunMode | str = RunMode.FAST,
+        num_neighbors: list[int] | None = None,
+        num_hops: int = 2,
+        verbose: bool | ProgressLogger = True,
+        exclude_cols_dict: dict[str, list[str]] | None = None,
+        use_prediction_time: bool = False,
+        top_k: int | None = None,
+    ) -> pd.DataFrame | Explanation:
+        """Returns predictions for a custom task specification.
+        Args:
+            task: The custom :class:`TaskTable`.
+            explain: Configuration for explainability.
+                If set to ``True``, will additionally explain the prediction.
+                Passing in an :class:`ExplainConfig` instance provides control
+                over which parts of explanation are generated.
+                Explainability is currently only supported for single entity
+                predictions with ``run_mode="FAST"``.
+            run_mode: The :class:`RunMode` for the query.
+            num_neighbors: The number of neighbors to sample for each hop.
+                If specified, the ``num_hops`` option will be ignored.
+            num_hops: The number of hops to sample when generating the context.
+            verbose: Whether to print verbose output.
+            exclude_cols_dict: Any column in any table to exclude from the
+                model input.
+            use_prediction_time: Whether to use the anchor timestamp as an
+                additional feature during prediction. This is typically
+                beneficial for time series forecasting tasks.
+            top_k: The number of predictions to return per entity.
+        Returns:
+            The predictions as a :class:`pandas.DataFrame`.
+            If ``explain`` is provided, returns an :class:`Explanation` object
+            containing the prediction, summary, and details.
+        """
         if num_hops != 2 and num_neighbors is not None:
             warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
                           f"custom 'num_hops={num_hops}' option")
+        if num_neighbors is None:
+            key = RunMode.FAST if task.task_type.is_link_pred else run_mode
+            num_neighbors = _DEFAULT_NUM_NEIGHBORS[key][:num_hops]
+        explain_config: ExplainConfig | None = None
+        if explain is True:
+            explain_config = ExplainConfig()
+        elif explain is not False:
+            explain_config = ExplainConfig._cast(explain)
         if explain_config is not None and run_mode in {
                 RunMode.NORMAL, RunMode.BEST
@@ -353,83 +457,82 @@ class KumoRFM:
                           f"run mode 'FAST' (got '{run_mode}'). Provided run "
                           f"mode has been reset. Please lower the run mode to "
                           f"suppress this warning.")
+            run_mode = RunMode.FAST
-        if indices is None:
-            if query_def.rfm_entity_ids is None:
-                raise ValueError("Cannot find entities to predict for. Please "
-                                 "pass them via `predict(query, indices=...)`")
-            indices = query_def.get_rfm_entity_id_list()
-        else:
-            query_def = replace(query_def, rfm_entity_ids=None)
-        if len(indices) == 0:
-            raise ValueError("At least one entity is required")
-        if explain_config is not None and len(indices) > 1:
-            raise ValueError(
-                f"Cannot explain predictions for more than a single entity "
-                f"(got {len(indices)})")
-        query_repr = query_def.to_string(rich=True, exclude_predict=True)
-        if explain_config is not None:
-            msg = f'[bold]EXPLAIN[/bold] {query_repr}'
-        else:
-            msg = f'[bold]PREDICT[/bold] {query_repr}'
+        if explain_config is not None and task.num_prediction_examples > 1:
+            raise ValueError(f"Cannot explain predictions for more than a "
+                             f"single entity "
+                             f"(got {task.num_prediction_examples:,})")
         if not isinstance(verbose, ProgressLogger):
+            if task.task_type == TaskType.BINARY_CLASSIFICATION:
+                task_type_repr = 'binary classification'
+            elif task.task_type == TaskType.MULTICLASS_CLASSIFICATION:
+                task_type_repr = 'multi-class classification'
+            elif task.task_type == TaskType.REGRESSION:
+                task_type_repr = 'regression'
+            elif task.task_type == TaskType.TEMPORAL_LINK_PREDICTION:
+                task_type_repr = 'link prediction'
+            else:
+                task_type_repr = str(task.task_type)
+            if explain_config is not None:
+                msg = f'Explain {task_type_repr} task'
+            else:
+                msg = f'Predict {task_type_repr} task'
             verbose = ProgressLogger.default(msg=msg, verbose=verbose)
         with verbose as logger:
-            batch_size: int | None = None
-            if self._batch_size == 'max':
-                task_type = self._get_task_type(
-                    query=query_def,
-                    edge_types=self._sampler.edge_types,
-                )
-                batch_size = _MAX_PRED_SIZE[task_type]
+            if task.num_context_examples > _MAX_CONTEXT_SIZE[run_mode]:
+                logger.log(f"Sub-sampled {_MAX_CONTEXT_SIZE[run_mode]:,} "
+                           f"out of {task.num_context_examples:,} in-context "
+                           f"examples")
+                task = task.narrow_context(0, _MAX_CONTEXT_SIZE[run_mode])
+            if self._batch_size is None:
+                batch_size = task.num_prediction_examples
+            elif self._batch_size == 'max':
+                batch_size = _MAX_PRED_SIZE[task.task_type]
             else:
                 batch_size = self._batch_size
-            if batch_size is not None:
-                offsets = range(0, len(indices), batch_size)
-                batches = [indices[step:step + batch_size] for step in offsets]
-            else:
-                batches = [indices]
+            if batch_size > _MAX_PRED_SIZE[task.task_type]:
+                raise ValueError(f"Cannot predict for more than "
+                                 f"{_MAX_PRED_SIZE[task.task_type]:,} "
+                                 f"entities at once (got {batch_size:,}). Use "
+                                 f"`KumoRFM.batch_mode` to process entities "
+                                 f"in batches with a sufficient batch size.")
-            if len(batches) > 1:
-                logger.log(f"Splitting {len(indices):,} entities into "
-                           f"{len(batches):,} batches of size {batch_size:,}")
+            if task.num_prediction_examples > batch_size:
+                num = math.ceil(task.num_prediction_examples / batch_size)
+                logger.log(f"Splitting {task.num_prediction_examples:,} "
+                           f"entities into {num:,} batches of size "
+                           f"{batch_size:,}")
             predictions: list[pd.DataFrame] = []
             summary: str | None = None
             details: Explanation | None = None
-            for i, batch in enumerate(batches):
-                # TODO Re-use the context for subsequent predictions.
+            for start in range(0, task.num_prediction_examples, batch_size):
                 context = self._get_context(
-                    query=query_def,
-                    indices=batch,
-                    anchor_time=anchor_time,
-                    context_anchor_time=context_anchor_time,
-                    run_mode=RunMode(run_mode),
+                    task=task.narrow_prediction(start, length=batch_size),
+                    run_mode=run_mode,
                     num_neighbors=num_neighbors,
-                    num_hops=num_hops,
-                    max_pq_iterations=max_pq_iterations,
-                    evaluate=False,
-                    random_seed=random_seed,
-                    logger=logger if i == 0 else None,
+                    exclude_cols_dict=exclude_cols_dict,
+                    top_k=top_k,
                 )
+                context.y_test = None
                 request = RFMPredictRequest(
                     context=context,
                     run_mode=RunMode(run_mode),
-                    query=query_str,
+                    query=getattr(task, '_query', ''),
                     use_prediction_time=use_prediction_time,
                 )
                 with warnings.catch_warnings():
                     warnings.filterwarnings('ignore', message='gencode')
                     request_msg = request.to_protobuf()
                     _bytes = request_msg.SerializeToString()
-                if i == 0:
+                if start == 0:
                     logger.log(f"Generated context of size "
                                f"{len(_bytes) / (1024*1024):.2f}MB")
@@ -437,11 +540,9 @@ class KumoRFM:
                     stats = Context.get_memory_stats(request_msg.context)
                     raise ValueError(_SIZE_LIMIT_MSG.format(stats=stats))
-                if i == 0 and len(batches) > 1:
-                    verbose.init_progress(
-                        total=len(batches),
-                        description='Predicting',
-                    )
+                if start == 0 and task.num_prediction_examples > batch_size:
+                    num = math.ceil(task.num_prediction_examples / batch_size)
+                    verbose.init_progress(total=num, description='Predicting')
                 for attempt in range(self.num_retries + 1):
                     try:
@@ -459,7 +560,7 @@ class KumoRFM:
                         # Cast 'ENTITY' to correct data type:
                         if 'ENTITY' in df:
                             table_dict = context.subgraph.table_dict
-                            table = table_dict[query_def.entity_table]
+                            table = table_dict[context.entity_table_names[0]]
                             ser = table.df[table.primary_key]
                             df['ENTITY'] = df['ENTITY'].astype(ser.dtype)
@@ -476,7 +577,7 @@ class KumoRFM:
                         predictions.append(df)
-                        if len(batches) > 1:
+                        if task.num_prediction_examples > batch_size:
                             verbose.step()
                         break
@@ -601,40 +702,51 @@ class KumoRFM:
             The metrics as a :class:`pandas.DataFrame`
         """
         query_def = self._parse_query(query)
-        if num_hops != 2 and num_neighbors is not None:
-            warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
-                          f"custom 'num_hops={num_hops}' option")
         if query_def.rfm_entity_ids is not None:
             query_def = replace(
                 query_def,
                 rfm_entity_ids=None,
             )
-        query_repr = query_def.to_string(rich=True, exclude_predict=True)
-        msg = f'[bold]EVALUATE[/bold] {query_repr}'
+        task_type = self._get_task_type(
+            query=query_def,
+            edge_types=self._sampler.edge_types,
+        )
+        if num_hops != 2 and num_neighbors is not None:
+            warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
+                          f"custom 'num_hops={num_hops}' option")
+        if num_neighbors is None:
+            key = RunMode.FAST if task_type.is_link_pred else run_mode
+            num_neighbors = _DEFAULT_NUM_NEIGHBORS[key][:num_hops]
+        if metrics is not None and len(metrics) > 0:
+            self._validate_metrics(metrics, task_type)
+            metrics = list(dict.fromkeys(metrics))
         if not isinstance(verbose, ProgressLogger):
+            query_repr = query_def.to_string(rich=True, exclude_predict=True)
+            msg = f'[bold]EVALUATE[/bold] {query_repr}'
             verbose = ProgressLogger.default(msg=msg, verbose=verbose)
         with verbose as logger:
-            context = self._get_context(
+            task_table = self._get_task_table(
                 query=query_def,
                 indices=None,
                 anchor_time=anchor_time,
                 context_anchor_time=context_anchor_time,
-                run_mode=RunMode(run_mode),
-                num_neighbors=num_neighbors,
-                num_hops=num_hops,
+                run_mode=run_mode,
                 max_pq_iterations=max_pq_iterations,
-                evaluate=True,
                 random_seed=random_seed,
-                logger=logger if verbose else None,
+                logger=logger,
             )
-            if metrics is not None and len(metrics) > 0:
-                self._validate_metrics(metrics, context.task_type)
-                metrics = list(dict.fromkeys(metrics))
+            context = self._get_context(
+                task=task_table,
+                run_mode=run_mode,
+                num_neighbors=num_neighbors,
+                exclude_cols_dict=query_def.get_exclude_cols_dict(),
+            )
             request = RFMEvaluateRequest(
                 context=context,
                 run_mode=RunMode(run_mode),
@@ -652,17 +764,23 @@ class KumoRFM:
                 stats_msg = Context.get_memory_stats(request_msg.context)
                 raise ValueError(_SIZE_LIMIT_MSG.format(stats=stats_msg))
-            try:
-                resp = self._api_client.evaluate(request_bytes)
-            except HTTPException as e:
+            for attempt in range(self.num_retries + 1):
                 try:
-                    msg = json.loads(e.detail)['detail']
-                except Exception:
-                    msg = e.detail
-                raise RuntimeError(f"An unexpected exception occurred. "
-                                   f"Please create an issue at "
-                                   f"'https://github.com/kumo-ai/kumo-rfm'. "
-                                   f"{msg}") from None
+                    resp = self._api_client.evaluate(request_bytes)
+                    break
+                except HTTPException as e:
+                    if attempt == self.num_retries:
+                        try:
+                            msg = json.loads(e.detail)['detail']
+                        except Exception:
+                            msg = e.detail
+                        raise RuntimeError(
+                            f"An unexpected exception occurred. Please create "
+                            f"an issue at "
+                            f"'https://github.com/kumo-ai/kumo-rfm'. {msg}"
+                        ) from None
+                    time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...
         return pd.DataFrame.from_dict(
             resp.metrics,
@@ -714,7 +832,7 @@ class KumoRFM:
                                  f"to have a time column")
         train, test = self._sampler.sample_target(
-            query=query,
+            query=query_def,
             num_train_examples=0,
             train_anchor_time=anchor_time,
             num_train_trials=0,
@@ -742,30 +860,34 @@ class KumoRFM:
                              "`predict()` or `evaluate()` methods to perform "
                              "predictions or evaluations.")
-        try:
-            request = RFMParseQueryRequest(
-                query=query,
-                graph_definition=self._graph_def,
-            )
+        request = RFMParseQueryRequest(
+            query=query,
+            graph_definition=self._graph_def,
+        )
+        for attempt in range(self.num_retries + 1):
+            try:
+                resp = self._api_client.parse_query(request)
+                break
+            except HTTPException as e:
+                if attempt == self.num_retries:
+                    try:
+                        msg = json.loads(e.detail)['detail']
+                    except Exception:
+                        msg = e.detail
+                    raise ValueError(f"Failed to parse query '{query}'. {msg}")
-            resp = self._api_client.parse_query(request)
+                time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...
-            if len(resp.validation_response.warnings) > 0:
-                msg = '\n'.join([
-                    f'{i+1}. {warning.title}: {warning.message}' for i, warning
-                    in enumerate(resp.validation_response.warnings)
-                ])
-                warnings.warn(f"Encountered the following warnings during "
-                              f"parsing:\n{msg}")
+        if len(resp.validation_response.warnings) > 0:
+            msg = '\n'.join([
+                f'{i+1}. {warning.title}: {warning.message}'
+                for i, warning in enumerate(resp.validation_response.warnings)
+            ])
+            warnings.warn(f"Encountered the following warnings during "
+                          f"parsing:\n{msg}")
-            return resp.query
-        except HTTPException as e:
-            try:
-                msg = json.loads(e.detail)['detail']
-            except Exception:
-                msg = e.detail
-            raise ValueError(f"Failed to parse query '{query}'. "
-                             f"{msg}") from None
+        return resp.query
     @staticmethod
     def _get_task_type(
@@ -809,16 +931,15 @@ class KumoRFM:
     def _get_default_anchor_time(
         self,
-        query: ValidatedPredictiveQuery,
+        query: ValidatedPredictiveQuery | None = None,
     ) -> pd.Timestamp:
-        if query.query_type == QueryType.TEMPORAL:
+        if query is not None and query.query_type == QueryType.TEMPORAL:
             aggr_table_names = [
                 aggr._get_target_column_name().split('.')[0]
                 for aggr in query.get_all_target_aggregations()
             ]
             return self._sampler.get_max_time(aggr_table_names)
-        assert query.query_type == QueryType.STATIC
         return self._sampler.get_max_time()
     def _validate_time(
@@ -888,40 +1009,26 @@ class KumoRFM:
                 f"Anchor timestamp for evaluation is after the latest "
                 f"supported timestamp '{max_time - end_offset}'.")
-    def _get_context(
+    def _get_task_table(
         self,
         query: ValidatedPredictiveQuery,
         indices: list[str] | list[float] | list[int] | None,
-        anchor_time: pd.Timestamp | Literal['entity'] | None,
-        context_anchor_time: pd.Timestamp | None,
-        run_mode: RunMode,
-        num_neighbors: list[int] | None,
-        num_hops: int,
-        max_pq_iterations: int,
-        evaluate: bool,
+        anchor_time: pd.Timestamp | Literal['entity'] | None = None,
+        context_anchor_time: pd.Timestamp | None = None,
+        run_mode: RunMode = RunMode.FAST,
+        max_pq_iterations: int = 10,
         random_seed: int | None = _RANDOM_SEED,
         logger: ProgressLogger | None = None,
-    ) -> Context:
-        if num_neighbors is not None:
-            num_hops = len(num_neighbors)
-        if num_hops < 0:
-            raise ValueError(f"'num_hops' must be non-negative "
-                             f"(got {num_hops})")
-        if num_hops > 6:
-            raise ValueError(f"Cannot predict on subgraphs with more than 6 "
-                             f"hops (got {num_hops}). Please reduce the "
-                             f"number of hops and try again. Please create a "
-                             f"feature request at "
-                             f"'https://github.com/kumo-ai/kumo-rfm' if you "
-                             f"must go beyond this for your use-case.")
+    ) -> TaskTable:
         task_type = self._get_task_type(
             query=query,
             edge_types=self._sampler.edge_types,
         )
+        num_train_examples = _MAX_CONTEXT_SIZE[run_mode]
+        num_test_examples = _MAX_TEST_SIZE[task_type] if indices is None else 0
         if logger is not None:
             if task_type == TaskType.BINARY_CLASSIFICATION:
                 task_type_repr = 'binary classification'
@@ -935,21 +1042,6 @@ class KumoRFM:
                 task_type_repr = str(task_type)
             logger.log(f"Identified {query.query_type} {task_type_repr} task")
-        if task_type.is_link_pred and num_hops < 2:
-            raise ValueError(f"Cannot perform link prediction on subgraphs "
-                             f"with less than 2 hops (got {num_hops}) since "
-                             f"historical target entities need to be part of "
-                             f"the context. Please increase the number of "
-                             f"hops and try again.")
-        if num_neighbors is None:
-            if run_mode == RunMode.DEBUG:
-                num_neighbors = [16, 16, 4, 4, 1, 1][:num_hops]
-            elif run_mode == RunMode.FAST or task_type.is_link_pred:
-                num_neighbors = [32, 32, 8, 8, 4, 4][:num_hops]
-            else:
-                num_neighbors = [64, 64, 8, 8, 4, 4][:num_hops]
         if query.target_ast.date_offset_range is None:
             step_offset = pd.DateOffset(0)
         else:
@@ -958,8 +1050,7 @@ class KumoRFM:
         if anchor_time is None:
             anchor_time = self._get_default_anchor_time(query)
-            if evaluate:
+            if num_test_examples > 0:
                 anchor_time = anchor_time - end_offset
             if logger is not None:
@@ -973,7 +1064,6 @@ class KumoRFM:
                 else:
                     logger.log(f"Derived anchor time {anchor_time}")
-        assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
             if context_anchor_time == 'entity':
                 raise ValueError("Anchor time 'entity' needs to be shared "
@@ -981,7 +1071,7 @@ class KumoRFM:
             if context_anchor_time is None:
                 context_anchor_time = anchor_time - end_offset
             self._validate_time(query, anchor_time, context_anchor_time,
-                                evaluate)
+                                evaluate=num_test_examples > 0)
         else:
             assert anchor_time == 'entity'
             if query.query_type != QueryType.STATIC:
@@ -996,14 +1086,6 @@ class KumoRFM:
                                  "for context and prediction examples")
             context_anchor_time = 'entity'
-        num_train_examples = _MAX_CONTEXT_SIZE[run_mode]
-        if evaluate:
-            num_test_examples = _MAX_TEST_SIZE[run_mode]
-            if task_type.is_link_pred:
-                num_test_examples = num_test_examples // 5
-        else:
-            num_test_examples = 0
         train, test = self._sampler.sample_target(
             query=query,
             num_train_examples=num_train_examples,
@@ -1014,39 +1096,32 @@ class KumoRFM:
             num_test_trials=max_pq_iterations * num_test_examples,
             random_seed=random_seed,
         )
-        train_pkey, train_time, y_train = train
-        test_pkey, test_time, y_test = test
+        train_pkey, train_time, train_y = train
+        test_pkey, test_time, test_y = test
-        if evaluate and logger is not None:
+        if num_test_examples > 0 and logger is not None:
             if task_type == TaskType.BINARY_CLASSIFICATION:
-                pos = 100 * int((y_test > 0).sum()) / len(y_test)
-                msg = (f"Collected {len(y_test):,} test examples with "
+                pos = 100 * int((test_y > 0).sum()) / len(test_y)
+                msg = (f"Collected {len(test_y):,} test examples with "
                        f"{pos:.2f}% positive cases")
             elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
-                msg = (f"Collected {len(y_test):,} test examples holding "
-                       f"{y_test.nunique()} classes")
+                msg = (f"Collected {len(test_y):,} test examples holding "
+                       f"{test_y.nunique()} classes")
             elif task_type == TaskType.REGRESSION:
-                _min, _max = float(y_test.min()), float(y_test.max())
-                msg = (f"Collected {len(y_test):,} test examples with targets "
+                _min, _max = float(test_y.min()), float(test_y.max())
+                msg = (f"Collected {len(test_y):,} test examples with targets "
                        f"between {format_value(_min)} and "
                        f"{format_value(_max)}")
             elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
-                num_rhs = y_test.explode().nunique()
-                msg = (f"Collected {len(y_test):,} test examples with "
+                num_rhs = test_y.explode().nunique()
+                msg = (f"Collected {len(test_y):,} test examples with "
                        f"{num_rhs:,} unique items")
             else:
                 raise NotImplementedError
             logger.log(msg)
-        if not evaluate:
+        if num_test_examples == 0:
             assert indices is not None
-            if len(indices) > _MAX_PRED_SIZE[task_type]:
-                raise ValueError(f"Cannot predict for more than "
-                                 f"{_MAX_PRED_SIZE[task_type]:,} entities at "
-                                 f"once (got {len(indices):,}). Use "
-                                 f"`KumoRFM.batch_mode` to process entities "
-                                 f"in batches")
             test_pkey = pd.Series(indices, dtype=train_pkey.dtype)
             if isinstance(anchor_time, pd.Timestamp):
                 test_time = pd.Series([anchor_time]).repeat(
@@ -1056,26 +1131,26 @@ class KumoRFM:
         if logger is not None:
             if task_type == TaskType.BINARY_CLASSIFICATION:
-                pos = 100 * int((y_train > 0).sum()) / len(y_train)
-                msg = (f"Collected {len(y_train):,} in-context examples with "
+                pos = 100 * int((train_y > 0).sum()) / len(train_y)
+                msg = (f"Collected {len(train_y):,} in-context examples with "
                        f"{pos:.2f}% positive cases")
             elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
-                msg = (f"Collected {len(y_train):,} in-context examples "
-                       f"holding {y_train.nunique()} classes")
+                msg = (f"Collected {len(train_y):,} in-context examples "
+                       f"holding {train_y.nunique()} classes")
             elif task_type == TaskType.REGRESSION:
-                _min, _max = float(y_train.min()), float(y_train.max())
-                msg = (f"Collected {len(y_train):,} in-context examples with "
+                _min, _max = float(train_y.min()), float(train_y.max())
+                msg = (f"Collected {len(train_y):,} in-context examples with "
                        f"targets between {format_value(_min)} and "
                        f"{format_value(_max)}")
             elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
-                num_rhs = y_train.explode().nunique()
-                msg = (f"Collected {len(y_train):,} in-context examples with "
+                num_rhs = train_y.explode().nunique()
+                msg = (f"Collected {len(train_y):,} in-context examples with "
                        f"{num_rhs:,} unique items")
             else:
                 raise NotImplementedError
             logger.log(msg)
-        entity_table_names: tuple[str, ...]
+        entity_table_names: tuple[str] | tuple[str, str]
         if task_type.is_link_pred:
             final_aggr = query.get_final_target_aggregation()
             assert final_aggr is not None
@@ -1089,27 +1164,98 @@ class KumoRFM:
         else:
             entity_table_names = (query.entity_table, )
+        context_df = pd.DataFrame({'ENTITY': train_pkey, 'TARGET': train_y})
+        if isinstance(train_time, pd.Series):
+            context_df['ANCHOR_TIMESTAMP'] = train_time
+        pred_df = pd.DataFrame({'ENTITY': test_pkey})
+        if num_test_examples > 0:
+            pred_df['TARGET'] = test_y
+        if isinstance(test_time, pd.Series):
+            pred_df['ANCHOR_TIMESTAMP'] = test_time
+        return TaskTable(
+            task_type=task_type,
+            context_df=context_df,
+            pred_df=pred_df,
+            entity_table_name=entity_table_names,
+            entity_column='ENTITY',
+            target_column='TARGET',
+            time_column='ANCHOR_TIMESTAMP' if isinstance(
+                train_time, pd.Series) else TaskTable.ENTITY_TIME,
+        )
+    def _get_context(
+        self,
+        task: TaskTable,
+        run_mode: RunMode | str = RunMode.FAST,
+        num_neighbors: list[int] | None = None,
+        exclude_cols_dict: dict[str, list[str]] | None = None,
+        top_k: int | None = None,
+    ) -> Context:
+        # TODO Remove all
+        if task.num_context_examples > max(_MAX_CONTEXT_SIZE.values()):
+            raise ValueError(f"Cannot process a context with more than "
+                             f"{max(_MAX_CONTEXT_SIZE.values()):,} samples "
+                             f"(got {task.num_context_examples:,})")
+        if task.evaluate:
+            if task.num_prediction_examples > _MAX_TEST_SIZE[task.task_type]:
+                raise ValueError(f"Cannot process a test set with more than "
+                                 f"{_MAX_TEST_SIZE[task.task_type]:,} samples "
+                                 f"for evaluation "
+                                 f"(got {task.num_prediction_examples:,})")
+        else:
+            if task.num_prediction_examples > _MAX_PRED_SIZE[task.task_type]:
+                raise ValueError(f"Cannot predict for more than "
+                                 f"{_MAX_PRED_SIZE[task.task_type]:,} "
+                                 f"entities at once "
+                                 f"(got {task.num_prediction_examples:,})")
+        if num_neighbors is None:
+            key = RunMode.FAST if task.task_type.is_link_pred else run_mode
+            num_neighbors = _DEFAULT_NUM_NEIGHBORS[key][:2]
+        if len(num_neighbors) > 6:
+            raise ValueError(f"Cannot predict on subgraphs with more than 6 "
+                             f"hops (got {len(num_neighbors)}). Reduce the "
+                             f"number of hops and try again. Please create a "
+                             f"feature request at "
+                             f"'https://github.com/kumo-ai/kumo-rfm' if you "
+                             f"must go beyond this for your use-case.")
         # Exclude the entity anchor time from the feature set to prevent
         # running out-of-distribution between in-context and test examples:
-        exclude_cols_dict = query.get_exclude_cols_dict()
-        if entity_table_names[0] in self._sampler.time_column_dict:
-            if entity_table_names[0] not in exclude_cols_dict:
-                exclude_cols_dict[entity_table_names[0]] = []
-            time_column = self._sampler.time_column_dict[entity_table_names[0]]
-            exclude_cols_dict[entity_table_names[0]].append(time_column)
+        exclude_cols_dict = exclude_cols_dict or {}
+        if task.entity_table_name in self._sampler.time_column_dict:
+            if task.entity_table_name not in exclude_cols_dict:
+                exclude_cols_dict[task.entity_table_name] = []
+            time_col = self._sampler.time_column_dict[task.entity_table_name]
+            exclude_cols_dict[task.entity_table_name].append(time_col)
+        entity_pkey = pd.concat([
+            task._context_df[task._entity_column],
+            task._pred_df[task._entity_column],
+        ], axis=0, ignore_index=True)
+        if task.use_entity_time:
+            if task.entity_table_name not in self._sampler.time_column_dict:
+                raise ValueError(f"The given annchor time requires the entity "
+                                 f"table '{task.entity_table_name}' to have a "
+                                 f"time column")
+            anchor_time = 'entity'
+        elif task._time_column is not None:
+            anchor_time = pd.concat([
+                task._context_df[task._time_column],
+                task._pred_df[task._time_column],
+            ], axis=0, ignore_index=True)
+        else:
+            anchor_time = pd.Series(self._get_default_anchor_time()).repeat(
+                (len(entity_pkey))).reset_index(drop=True)
         subgraph = self._sampler.sample_subgraph(
-            entity_table_names=entity_table_names,
-            entity_pkey=pd.concat(
-                [train_pkey, test_pkey],
-                axis=0,
-                ignore_index=True,
-            ),
-            anchor_time=pd.concat(
-                [train_time, test_time],
-                axis=0,
-                ignore_index=True,
-            ) if isinstance(train_time, pd.Series) else 'entity',
+            entity_table_names=task.entity_table_names,
+            entity_pkey=entity_pkey,
+            anchor_time=anchor_time,
             num_neighbors=num_neighbors,
             exclude_cols_dict=exclude_cols_dict,
         )
@@ -1121,13 +1267,20 @@ class KumoRFM:
                              f"'https://github.com/kumo-ai/kumo-rfm' if you "
                              f"must go beyond this for your use-case.")
+        if (task.task_type.is_link_pred
+                and task.entity_table_names[-1] not in subgraph.table_dict):
+            raise ValueError("Cannot perform link prediction on subgraphs "
+                             "without any historical target entities. Please "
+                             "increase the number of hops and try again.")
         return Context(
-            task_type=task_type,
-            entity_table_names=entity_table_names,
+            task_type=task.task_type,
+            entity_table_names=task.entity_table_names,
             subgraph=subgraph,
-            y_train=y_train,
-            y_test=y_test if evaluate else None,
-            top_k=query.top_k,
+            y_train=task._context_df[task.target_column.name],
+            y_test=task._pred_df[task.target_column.name]
+            if task.evaluate else None,
+            top_k=top_k,
             step_size=None,
         )