PyPI - kumoai - Versions diffs - 2.13.0.dev202511211730__py3-none-any.whl → 2.14.0.dev202512141732__py3-none-any.whl - Mend

kumoai 2.13.0.dev202511211730__py3-none-any.whl → 2.14.0.dev202512141732__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

kumoai/__init__.py +12 -0
kumoai/_version.py +1 -1
kumoai/client/pquery.py +6 -2
kumoai/connector/utils.py +23 -2
kumoai/experimental/rfm/__init__.py +20 -45
kumoai/experimental/rfm/backend/__init__.py +0 -0
kumoai/experimental/rfm/backend/local/__init__.py +42 -0
kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +37 -90
kumoai/experimental/rfm/backend/local/sampler.py +313 -0
kumoai/experimental/rfm/backend/local/table.py +119 -0
kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
kumoai/experimental/rfm/backend/snow/sampler.py +119 -0
kumoai/experimental/rfm/backend/snow/table.py +135 -0
kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
kumoai/experimental/rfm/backend/sqlite/sampler.py +112 -0
kumoai/experimental/rfm/backend/sqlite/table.py +115 -0
kumoai/experimental/rfm/base/__init__.py +23 -0
kumoai/experimental/rfm/base/column.py +66 -0
kumoai/experimental/rfm/base/sampler.py +773 -0
kumoai/experimental/rfm/base/source.py +19 -0
kumoai/experimental/rfm/{local_table.py → base/table.py} +152 -141
kumoai/experimental/rfm/{local_graph.py → graph.py} +352 -80
kumoai/experimental/rfm/infer/__init__.py +6 -0
kumoai/experimental/rfm/infer/dtype.py +79 -0
kumoai/experimental/rfm/infer/pkey.py +126 -0
kumoai/experimental/rfm/infer/time_col.py +62 -0
kumoai/experimental/rfm/pquery/pandas_executor.py +1 -1
kumoai/experimental/rfm/rfm.py +224 -167
kumoai/experimental/rfm/sagemaker.py +11 -3
kumoai/pquery/predictive_query.py +10 -6
kumoai/testing/decorators.py +1 -1
kumoai/testing/snow.py +50 -0
kumoai/utils/__init__.py +2 -0
kumoai/utils/sql.py +3 -0
{kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/METADATA +9 -8
{kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/RECORD +39 -23
kumoai/experimental/rfm/local_graph_sampler.py +0 -182
kumoai/experimental/rfm/local_pquery_driver.py +0 -689
kumoai/experimental/rfm/utils.py +0 -344
{kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/WHEEL +0 -0
{kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/licenses/LICENSE +0 -0
{kumoai-2.13.0.dev202511211730.dist-info → kumoai-2.14.0.dev202512141732.dist-info}/top_level.txt +0 -0

kumoai/experimental/rfm/rfm.py CHANGED Viewed

@@ -21,6 +21,13 @@ import numpy as np
 import pandas as pd
 from kumoapi.model_plan import RunMode
 from kumoapi.pquery import QueryType, ValidatedPredictiveQuery
+from kumoapi.pquery.AST import (
+    Aggregation,
+    Column,
+    Condition,
+    Join,
+    LogicalOperation,
+)
 from kumoapi.rfm import Context
 from kumoapi.rfm import Explanation as ExplanationConfig
 from kumoapi.rfm import (
@@ -29,16 +36,12 @@ from kumoapi.rfm import (
     RFMPredictRequest,
 )
 from kumoapi.task import TaskType
+from kumoapi.typing import AggregationType, Stype
 from kumoai.client.rfm import RFMAPI
 from kumoai.exceptions import HTTPException
-from kumoai.experimental.rfm import LocalGraph
-from kumoai.experimental.rfm.local_graph_sampler import LocalGraphSampler
-from kumoai.experimental.rfm.local_graph_store import LocalGraphStore
-from kumoai.experimental.rfm.local_pquery_driver import (
-    LocalPQueryDriver,
-    date_offset_to_seconds,
-)
+from kumoai.experimental.rfm import Graph
+from kumoai.experimental.rfm.base import DataBackend, Sampler
 from kumoai.mixin import CastMixin
 from kumoai.utils import InteractiveProgressLogger, ProgressLogger
@@ -123,17 +126,17 @@ class KumoRFM:
     :class:`KumoRFM` is a foundation model to generate predictions for any
     relational dataset without training.
     The model is pre-trained and the class provides an interface to query the
-    model from a :class:`LocalGraph` object.
+    model from a :class:`Graph` object.
     .. code-block:: python
-        from kumoai.experimental.rfm import LocalGraph, KumoRFM
+        from kumoai.experimental.rfm import Graph, KumoRFM
         df_users = pd.DataFrame(...)
         df_items = pd.DataFrame(...)
         df_orders = pd.DataFrame(...)
-        graph = LocalGraph.from_data({
+        graph = Graph.from_data({
             'users': df_users,
             'items': df_items,
             'orders': df_orders,
@@ -150,32 +153,41 @@ class KumoRFM:
     Args:
         graph: The graph.
-        preprocess: Whether to pre-process the data in advance during graph
-            materialization.
-            This is a runtime trade-off between graph materialization and model
-            processing speed.
-            It can be benefical to preprocess your data once and then run many
-            queries on top to achieve maximum model speed.
-            However, if activiated, graph materialization can take potentially
-            much longer, especially on graphs with many large text columns.
-            Best to tune this option manually.
         verbose: Whether to print verbose output.
     """
     def __init__(
         self,
-        graph: LocalGraph,
-        preprocess: bool = False,
+        graph: Graph,
         verbose: Union[bool, ProgressLogger] = True,
     ) -> None:
         graph = graph.validate()
         self._graph_def = graph._to_api_graph_definition()
-        self._graph_store = LocalGraphStore(graph, preprocess, verbose)
-        self._graph_sampler = LocalGraphSampler(self._graph_store)
+        if graph.backend == DataBackend.LOCAL:
+            from kumoai.experimental.rfm.backend.local import LocalSampler
+            self._sampler: Sampler = LocalSampler(graph, verbose)
+        elif graph.backend == DataBackend.SQLITE:
+            from kumoai.experimental.rfm.backend.sqlite import SQLiteSampler
+            self._sampler = SQLiteSampler(graph, verbose)
+        elif graph.backend == DataBackend.SNOWFLAKE:
+            from kumoai.experimental.rfm.backend.snow import SnowSampler
+            self._sampler = SnowSampler(graph, verbose)
+        else:
+            raise NotImplementedError
+        self._client: Optional[RFMAPI] = None
         self._batch_size: Optional[int | Literal['max']] = None
         self.num_retries: int = 0
+    @property
+    def _api_client(self) -> RFMAPI:
+        if self._client is not None:
+            return self._client
         from kumoai.experimental.rfm import global_state
-        self._api_client = RFMAPI(global_state.client)
+        self._client = RFMAPI(global_state.client)
+        return self._client
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}()'
@@ -225,7 +237,7 @@ class KumoRFM:
         run_mode: Union[RunMode, str] = RunMode.FAST,
         num_neighbors: Optional[List[int]] = None,
         num_hops: int = 2,
-        max_pq_iterations: int = 20,
+        max_pq_iterations: int = 10,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
         use_prediction_time: bool = False,
@@ -244,7 +256,7 @@ class KumoRFM:
         run_mode: Union[RunMode, str] = RunMode.FAST,
         num_neighbors: Optional[List[int]] = None,
         num_hops: int = 2,
-        max_pq_iterations: int = 20,
+        max_pq_iterations: int = 10,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
         use_prediction_time: bool = False,
@@ -262,7 +274,7 @@ class KumoRFM:
         run_mode: Union[RunMode, str] = RunMode.FAST,
         num_neighbors: Optional[List[int]] = None,
         num_hops: int = 2,
-        max_pq_iterations: int = 20,
+        max_pq_iterations: int = 10,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
         use_prediction_time: bool = False,
@@ -358,9 +370,9 @@ class KumoRFM:
             batch_size: Optional[int] = None
             if self._batch_size == 'max':
-                task_type = LocalPQueryDriver.get_task_type(
-                    query_def,
-                    edge_types=self._graph_store.edge_types,
+                task_type = self._get_task_type(
+                    query=query_def,
+                    edge_types=self._sampler.edge_types,
                 )
                 batch_size = _MAX_PRED_SIZE[task_type]
             else:
@@ -434,10 +446,10 @@ class KumoRFM:
                         # Cast 'ENTITY' to correct data type:
                         if 'ENTITY' in df:
-                            entity = query_def.entity_table
-                            pkey_map = self._graph_store.pkey_map_dict[entity]
-                            df['ENTITY'] = df['ENTITY'].astype(
-                                type(pkey_map.index[0]))
+                            table_dict = context.subgraph.table_dict
+                            table = table_dict[query_def.entity_table]
+                            ser = table.df[table.primary_key]
+                            df['ENTITY'] = df['ENTITY'].astype(ser.dtype)
                         # Cast 'ANCHOR_TIMESTAMP' to correct data type:
                         if 'ANCHOR_TIMESTAMP' in df:
@@ -520,23 +532,18 @@ class KumoRFM:
             raise ValueError("At least one entity is required")
         if anchor_time is None:
-            anchor_time = self._graph_store.max_time
+            anchor_time = self._get_default_anchor_time(query_def)
         if isinstance(anchor_time, pd.Timestamp):
             self._validate_time(query_def, anchor_time, None, False)
         else:
             assert anchor_time == 'entity'
-            if (query_def.entity_table not in self._graph_store.time_dict):
+            if query_def.entity_table not in self._sampler.time_column_dict:
                 raise ValueError(f"Anchor time 'entity' requires the entity "
                                  f"table '{query_def.entity_table}' "
                                  f"to have a time column.")
-        node = self._graph_store.get_node_id(
-            table_name=query_def.entity_table,
-            pkey=pd.Series(indices),
-        )
-        query_driver = LocalPQueryDriver(self._graph_store, query_def)
-        return query_driver.is_valid(node, anchor_time)
+        raise NotImplementedError
     def evaluate(
         self,
@@ -548,7 +555,7 @@ class KumoRFM:
         run_mode: Union[RunMode, str] = RunMode.FAST,
         num_neighbors: Optional[List[int]] = None,
         num_hops: int = 2,
-        max_pq_iterations: int = 20,
+        max_pq_iterations: int = 10,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
         use_prediction_time: bool = False,
@@ -659,7 +666,7 @@ class KumoRFM:
         *,
         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
         random_seed: Optional[int] = _RANDOM_SEED,
-        max_iterations: int = 20,
+        max_iterations: int = 10,
     ) -> pd.DataFrame:
         """Returns the labels of a predictive query for a specified anchor
         time.
@@ -679,40 +686,37 @@ class KumoRFM:
         query_def = self._parse_query(query)
         if anchor_time is None:
-            anchor_time = self._graph_store.max_time
+            anchor_time = self._get_default_anchor_time(query_def)
             if query_def.target_ast.date_offset_range is not None:
-                anchor_time = anchor_time - (
-                    query_def.target_ast.date_offset_range.end_date_offset *
-                    query_def.num_forecasts)
+                offset = query_def.target_ast.date_offset_range.end_date_offset
+                offset *= query_def.num_forecasts
+                anchor_time -= offset
         assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
             self._validate_time(query_def, anchor_time, None, evaluate=True)
         else:
             assert anchor_time == 'entity'
-            if (query_def.entity_table not in self._graph_store.time_dict):
+            if query_def.entity_table not in self._sampler.time_column_dict:
                 raise ValueError(f"Anchor time 'entity' requires the entity "
                                  f"table '{query_def.entity_table}' "
                                  f"to have a time column")
-        query_driver = LocalPQueryDriver(self._graph_store, query_def,
-                                         random_seed)
-        node, time, y = query_driver.collect_test(
-            size=size,
-            anchor_time=anchor_time,
-            batch_size=min(10_000, size),
-            max_iterations=max_iterations,
-            guarantee_train_examples=False,
+        train, test = self._sampler.sample_target(
+            query=query,
+            num_train_examples=0,
+            train_anchor_time=anchor_time,
+            num_train_trials=0,
+            num_test_examples=size,
+            test_anchor_time=anchor_time,
+            num_test_trials=max_iterations * size,
+            random_seed=random_seed,
         )
-        entity = self._graph_store.pkey_map_dict[
-            query_def.entity_table].index[node]
         return pd.DataFrame({
-            'ENTITY': entity,
-            'ANCHOR_TIMESTAMP': time,
-            'TARGET': y,
+            'ENTITY': test.entity_pkey,
+            'ANCHOR_TIMESTAMP': test.anchor_time,
+            'TARGET': test.target,
         })
     # Helpers #################################################################
@@ -735,8 +739,6 @@ class KumoRFM:
             resp = self._api_client.parse_query(request)
-            # TODO Expose validation warnings.
             if len(resp.validation_response.warnings) > 0:
                 msg = '\n'.join([
                     f'{i+1}. {warning.title}: {warning.message}' for i, warning
@@ -754,6 +756,60 @@ class KumoRFM:
             raise ValueError(f"Failed to parse query '{query}'. "
                              f"{msg}") from None
+    @staticmethod
+    def _get_task_type(
+        query: ValidatedPredictiveQuery,
+        edge_types: List[Tuple[str, str, str]],
+    ) -> TaskType:
+        if isinstance(query.target_ast, (Condition, LogicalOperation)):
+            return TaskType.BINARY_CLASSIFICATION
+        target = query.target_ast
+        if isinstance(target, Join):
+            target = target.rhs_target
+        if isinstance(target, Aggregation):
+            if target.aggr == AggregationType.LIST_DISTINCT:
+                table_name, col_name = target._get_target_column_name().split(
+                    '.')
+                target_edge_types = [
+                    edge_type for edge_type in edge_types
+                    if edge_type[0] == table_name and edge_type[1] == col_name
+                ]
+                if len(target_edge_types) != 1:
+                    raise NotImplementedError(
+                        f"Multilabel-classification queries based on "
+                        f"'LIST_DISTINCT' are not supported yet. If you "
+                        f"planned to write a link prediction query instead, "
+                        f"make sure to register '{col_name}' as a "
+                        f"foreign key.")
+                return TaskType.TEMPORAL_LINK_PREDICTION
+            return TaskType.REGRESSION
+        assert isinstance(target, Column)
+        if target.stype in {Stype.ID, Stype.categorical}:
+            return TaskType.MULTICLASS_CLASSIFICATION
+        if target.stype in {Stype.numerical}:
+            return TaskType.REGRESSION
+        raise NotImplementedError("Task type not yet supported")
+    def _get_default_anchor_time(
+        self,
+        query: ValidatedPredictiveQuery,
+    ) -> pd.Timestamp:
+        if query.query_type == QueryType.TEMPORAL:
+            aggr_table_names = [
+                aggr._get_target_column_name().split('.')[0]
+                for aggr in query.get_all_target_aggregations()
+            ]
+            return self._sampler.get_max_time(aggr_table_names)
+        assert query.query_type == QueryType.STATIC
+        return self._sampler.get_max_time()
     def _validate_time(
         self,
         query: ValidatedPredictiveQuery,
@@ -762,28 +818,30 @@ class KumoRFM:
         evaluate: bool,
     ) -> None:
-        if self._graph_store.min_time == pd.Timestamp.max:
+        if len(self._sampler.time_column_dict) == 0:
             return  # Graph without timestamps
-        if anchor_time < self._graph_store.min_time:
+        min_time = self._sampler.get_min_time()
+        max_time = self._sampler.get_max_time()
+        if anchor_time < min_time:
             raise ValueError(f"Anchor timestamp '{anchor_time}' is before "
-                             f"the earliest timestamp "
-                             f"'{self._graph_store.min_time}' in the data.")
+                             f"the earliest timestamp '{min_time}' in the "
+                             f"data.")
-        if (context_anchor_time is not None
-                and context_anchor_time < self._graph_store.min_time):
+        if context_anchor_time is not None and context_anchor_time < min_time:
             raise ValueError(f"Context anchor timestamp is too early or "
                              f"aggregation time range is too large. To make "
                              f"this prediction, we would need data back to "
                              f"'{context_anchor_time}', however, your data "
-                             f"only contains data back to "
-                             f"'{self._graph_store.min_time}'.")
+                             f"only contains data back to '{min_time}'.")
         if query.target_ast.date_offset_range is not None:
             end_offset = query.target_ast.date_offset_range.end_date_offset
         else:
             end_offset = pd.DateOffset(0)
-        forecast_end_offset = end_offset * query.num_forecasts
+        end_offset = end_offset * query.num_forecasts
         if (context_anchor_time is not None
                 and context_anchor_time > anchor_time):
             warnings.warn(f"Context anchor timestamp "
@@ -793,7 +851,7 @@ class KumoRFM:
                           f"intended.")
         elif (query.query_type == QueryType.TEMPORAL
               and context_anchor_time is not None
-              and context_anchor_time + forecast_end_offset > anchor_time):
+              and context_anchor_time + end_offset > anchor_time):
             warnings.warn(f"Aggregation for context examples at timestamp "
                           f"'{context_anchor_time}' will leak information "
                           f"from the prediction anchor timestamp "
@@ -801,26 +859,23 @@ class KumoRFM:
                           f"intended.")
         elif (context_anchor_time is not None
-              and context_anchor_time - forecast_end_offset
-              < self._graph_store.min_time):
-            _time = context_anchor_time - forecast_end_offset
+              and context_anchor_time - end_offset < min_time):
+            _time = context_anchor_time - end_offset
             warnings.warn(f"Context anchor timestamp is too early or "
                           f"aggregation time range is too large. To form "
                           f"proper input data, we would need data back to "
                           f"'{_time}', however, your data only contains "
-                          f"data back to '{self._graph_store.min_time}'.")
+                          f"data back to '{min_time}'.")
-        if (not evaluate and anchor_time
-                > self._graph_store.max_time + pd.DateOffset(days=1)):
+        if not evaluate and anchor_time > max_time + pd.DateOffset(days=1):
             warnings.warn(f"Anchor timestamp '{anchor_time}' is after the "
-                          f"latest timestamp '{self._graph_store.max_time}' "
-                          f"in the data. Please make sure this is intended.")
+                          f"latest timestamp '{max_time}' in the data. Please "
+                          f"make sure this is intended.")
-        max_eval_time = self._graph_store.max_time - forecast_end_offset
-        if evaluate and anchor_time > max_eval_time:
+        if evaluate and anchor_time > max_time - end_offset:
             raise ValueError(
                 f"Anchor timestamp for evaluation is after the latest "
-                f"supported timestamp '{max_eval_time}'.")
+                f"supported timestamp '{max_time - end_offset}'.")
     def _get_context(
         self,
@@ -851,10 +906,9 @@ class KumoRFM:
                              f"'https://github.com/kumo-ai/kumo-rfm' if you "
                              f"must go beyond this for your use-case.")
-        query_driver = LocalPQueryDriver(self._graph_store, query, random_seed)
-        task_type = LocalPQueryDriver.get_task_type(
-            query,
-            edge_types=self._graph_store.edge_types,
+        task_type = self._get_task_type(
+            query=query,
+            edge_types=self._sampler.edge_types,
         )
         if logger is not None:
@@ -886,14 +940,17 @@ class KumoRFM:
                 num_neighbors = [64, 64, 8, 8, 4, 4][:num_hops]
         if query.target_ast.date_offset_range is None:
-            end_offset = pd.DateOffset(0)
+            step_offset = pd.DateOffset(0)
         else:
-            end_offset = query.target_ast.date_offset_range.end_date_offset
-        forecast_end_offset = end_offset * query.num_forecasts
+            step_offset = query.target_ast.date_offset_range.end_date_offset
+        end_offset = step_offset * query.num_forecasts
         if anchor_time is None:
-            anchor_time = self._graph_store.max_time
+            anchor_time = self._get_default_anchor_time(query)
             if evaluate:
-                anchor_time = anchor_time - forecast_end_offset
+                anchor_time = anchor_time - end_offset
             if logger is not None:
                 assert isinstance(anchor_time, pd.Timestamp)
                 if anchor_time == pd.Timestamp.min:
@@ -907,57 +964,71 @@ class KumoRFM:
         assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
+            if context_anchor_time == 'entity':
+                raise ValueError("Anchor time 'entity' needs to be shared "
+                                 "for context and prediction examples")
             if context_anchor_time is None:
-                context_anchor_time = anchor_time - forecast_end_offset
+                context_anchor_time = anchor_time - end_offset
             self._validate_time(query, anchor_time, context_anchor_time,
                                 evaluate)
         else:
             assert anchor_time == 'entity'
-            if query.entity_table not in self._graph_store.time_dict:
+            if query.query_type != QueryType.STATIC:
+                raise ValueError("Anchor time 'entity' is only valid for "
+                                 "static predictive queries")
+            if query.entity_table not in self._sampler.time_column_dict:
                 raise ValueError(f"Anchor time 'entity' requires the entity "
                                  f"table '{query.entity_table}' to "
                                  f"have a time column")
-            if context_anchor_time is not None:
-                warnings.warn("Ignoring option 'context_anchor_time' for "
-                              "`anchor_time='entity'`")
-            context_anchor_time = None
+            if isinstance(context_anchor_time, pd.Timestamp):
+                raise ValueError("Anchor time 'entity' needs to be shared "
+                                 "for context and prediction examples")
+            context_anchor_time = 'entity'
-        y_test: Optional[pd.Series] = None
+        num_train_examples = _MAX_CONTEXT_SIZE[run_mode]
         if evaluate:
-            max_test_size = _MAX_TEST_SIZE[run_mode]
+            num_test_examples = _MAX_TEST_SIZE[run_mode]
             if task_type.is_link_pred:
-                max_test_size = max_test_size // 5
+                num_test_examples = num_test_examples // 5
+        else:
+            num_test_examples = 0
+        train, test = self._sampler.sample_target(
+            query=query,
+            num_train_examples=num_train_examples,
+            train_anchor_time=context_anchor_time,
+            num_train_trials=max_pq_iterations * num_train_examples,
+            num_test_examples=num_test_examples,
+            test_anchor_time=anchor_time,
+            num_test_trials=max_pq_iterations * num_test_examples,
+            random_seed=random_seed,
+        )
+        train_pkey, train_time, y_train = train
+        test_pkey, test_time, y_test = test
-            test_node, test_time, y_test = query_driver.collect_test(
-                size=max_test_size,
-                anchor_time=anchor_time,
-                max_iterations=max_pq_iterations,
-                guarantee_train_examples=True,
-            )
-            if logger is not None:
-                if task_type == TaskType.BINARY_CLASSIFICATION:
-                    pos = 100 * int((y_test > 0).sum()) / len(y_test)
-                    msg = (f"Collected {len(y_test):,} test examples with "
-                           f"{pos:.2f}% positive cases")
-                elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
-                    msg = (f"Collected {len(y_test):,} test examples "
-                           f"holding {y_test.nunique()} classes")
-                elif task_type == TaskType.REGRESSION:
-                    _min, _max = float(y_test.min()), float(y_test.max())
-                    msg = (f"Collected {len(y_test):,} test examples with "
-                           f"targets between {format_value(_min)} and "
-                           f"{format_value(_max)}")
-                elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
-                    num_rhs = y_test.explode().nunique()
-                    msg = (f"Collected {len(y_test):,} test examples with "
-                           f"{num_rhs:,} unique items")
-                else:
-                    raise NotImplementedError
-                logger.log(msg)
+        if evaluate and logger is not None:
+            if task_type == TaskType.BINARY_CLASSIFICATION:
+                pos = 100 * int((y_test > 0).sum()) / len(y_test)
+                msg = (f"Collected {len(y_test):,} test examples with "
+                       f"{pos:.2f}% positive cases")
+            elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
+                msg = (f"Collected {len(y_test):,} test examples holding "
+                       f"{y_test.nunique()} classes")
+            elif task_type == TaskType.REGRESSION:
+                _min, _max = float(y_test.min()), float(y_test.max())
+                msg = (f"Collected {len(y_test):,} test examples with targets "
+                       f"between {format_value(_min)} and "
+                       f"{format_value(_max)}")
+            elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
+                num_rhs = y_test.explode().nunique()
+                msg = (f"Collected {len(y_test):,} test examples with "
+                       f"{num_rhs:,} unique items")
+            else:
+                raise NotImplementedError
+            logger.log(msg)
-        else:
+        if not evaluate:
             assert indices is not None
             if len(indices) > _MAX_PRED_SIZE[task_type]:
                 raise ValueError(f"Cannot predict for more than "
                                  f"{_MAX_PRED_SIZE[task_type]:,} entities at "
@@ -965,26 +1036,12 @@ class KumoRFM:
                                  f"`KumoRFM.batch_mode` to process entities "
                                  f"in batches")
-            test_node = self._graph_store.get_node_id(
-                table_name=query.entity_table,
-                pkey=pd.Series(indices),
-            )
+            test_pkey = pd.Series(indices, dtype=train_pkey.dtype)
             if isinstance(anchor_time, pd.Timestamp):
-                test_time = pd.Series(anchor_time).repeat(
-                    len(test_node)).reset_index(drop=True)
+                test_time = pd.Series([anchor_time]).repeat(
+                    len(indices)).reset_index(drop=True)
             else:
-                time = self._graph_store.time_dict[query.entity_table]
-                time = time[test_node] * 1000**3
-                test_time = pd.Series(time, dtype='datetime64[ns]')
-        train_node, train_time, y_train = query_driver.collect_train(
-            size=_MAX_CONTEXT_SIZE[run_mode],
-            anchor_time=context_anchor_time or 'entity',
-            exclude_node=test_node if (query.query_type == QueryType.STATIC
-                                       or anchor_time == 'entity') else None,
-            max_iterations=max_pq_iterations,
-        )
+                train_time = test_time = 'entity'
         if logger is not None:
             if task_type == TaskType.BINARY_CLASSIFICATION:
@@ -1012,7 +1069,7 @@ class KumoRFM:
             final_aggr = query.get_final_target_aggregation()
             assert final_aggr is not None
             edge_fkey = final_aggr._get_target_column_name()
-            for edge_type in self._graph_store.edge_types:
+            for edge_type in self._sampler.edge_types:
                 if edge_fkey == f'{edge_type[0]}.{edge_type[1]}':
                     entity_table_names = (
                         query.entity_table,
@@ -1024,20 +1081,24 @@ class KumoRFM:
         # Exclude the entity anchor time from the feature set to prevent
         # running out-of-distribution between in-context and test examples:
         exclude_cols_dict = query.get_exclude_cols_dict()
-        if anchor_time == 'entity':
+        if entity_table_names[0] in self._sampler.time_column_dict:
             if entity_table_names[0] not in exclude_cols_dict:
                 exclude_cols_dict[entity_table_names[0]] = []
-            time_column_dict = self._graph_store.time_column_dict
-            time_column = time_column_dict[entity_table_names[0]]
+            time_column = self._sampler.time_column_dict[entity_table_names[0]]
             exclude_cols_dict[entity_table_names[0]].append(time_column)
-        subgraph = self._graph_sampler(
+        subgraph = self._sampler.sample_subgraph(
             entity_table_names=entity_table_names,
-            node=np.concatenate([train_node, test_node]),
-            time=np.concatenate([
-                train_time.astype('datetime64[ns]').astype(int).to_numpy(),
-                test_time.astype('datetime64[ns]').astype(int).to_numpy(),
-            ]),
+            entity_pkey=pd.concat(
+                [train_pkey, test_pkey],
+                axis=0,
+                ignore_index=True,
+            ),
+            anchor_time=pd.concat(
+                [train_time, test_time],
+                axis=0,
+                ignore_index=True,
+            ) if isinstance(train_time, pd.Series) else 'entity',
             num_neighbors=num_neighbors,
             exclude_cols_dict=exclude_cols_dict,
         )
@@ -1049,18 +1110,14 @@ class KumoRFM:
                              f"'https://github.com/kumo-ai/kumo-rfm' if you "
                              f"must go beyond this for your use-case.")
-        step_size: Optional[int] = None
-        if query.query_type == QueryType.TEMPORAL:
-            step_size = date_offset_to_seconds(end_offset)
         return Context(
             task_type=task_type,
             entity_table_names=entity_table_names,
             subgraph=subgraph,
             y_train=y_train,
-            y_test=y_test,
+            y_test=y_test if evaluate else None,
             top_k=query.top_k,
-            step_size=step_size,
+            step_size=None,
         )
     @staticmethod

kumoai 2.13.0.dev202511211730__py3-none-any.whl → 2.14.0.dev202512141732__py3-none-any.whl

kumoai 2.13.0.dev202511211730__py3-none-any.whl → 2.14.0.dev202512141732__py3-none-any.whl