PyPI - kumoai - Versions diffs - 2.10.0.dev202510021830__py3-none-any.whl → 2.12.1__py3-none-any.whl - Mend

kumoai 2.10.0.dev202510021830__py3-none-any.whl → 2.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

kumoai/__init__.py +4 -2
kumoai/_version.py +1 -1
kumoai/client/client.py +10 -5
kumoai/client/endpoints.py +1 -0
kumoai/client/rfm.py +35 -7
kumoai/experimental/rfm/__init__.py +5 -3
kumoai/experimental/rfm/infer/timestamp.py +5 -4
kumoai/experimental/rfm/local_graph.py +90 -74
kumoai/experimental/rfm/local_graph_sampler.py +16 -8
kumoai/experimental/rfm/local_graph_store.py +13 -1
kumoai/experimental/rfm/local_pquery_driver.py +323 -38
kumoai/experimental/rfm/local_table.py +100 -22
kumoai/experimental/rfm/pquery/__init__.py +4 -4
kumoai/experimental/rfm/pquery/{backend.py → executor.py} +24 -58
kumoai/experimental/rfm/pquery/{pandas_backend.py → pandas_executor.py} +277 -223
kumoai/experimental/rfm/rfm.py +220 -79
kumoai/jobs.py +1 -0
kumoai/pquery/predictive_query.py +10 -6
kumoai/trainer/trainer.py +9 -10
kumoai/utils/progress_logger.py +13 -0
{kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/METADATA +4 -5
{kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/RECORD +25 -25
{kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/WHEEL +0 -0
{kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/licenses/LICENSE +0 -0
{kumoai-2.10.0.dev202510021830.dist-info → kumoai-2.12.1.dist-info}/top_level.txt +0 -0

kumoai/experimental/rfm/rfm.py CHANGED Viewed

@@ -5,21 +5,28 @@ from collections import defaultdict
 from collections.abc import Generator
 from contextlib import contextmanager
 from dataclasses import dataclass, replace
-from typing import Iterator, List, Literal, Optional, Union, overload
+from typing import (
+    Any,
+    Dict,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
 import numpy as np
 import pandas as pd
 from kumoapi.model_plan import RunMode
-from kumoapi.pquery import QueryType
+from kumoapi.pquery import QueryType, ValidatedPredictiveQuery
 from kumoapi.rfm import Context
 from kumoapi.rfm import Explanation as ExplanationConfig
 from kumoapi.rfm import (
-    PQueryDefinition,
     RFMEvaluateRequest,
-    RFMExplanationResponse,
+    RFMParseQueryRequest,
     RFMPredictRequest,
-    RFMPredictResponse,
-    RFMValidateQueryRequest,
 )
 from kumoapi.task import TaskType
@@ -32,6 +39,7 @@ from kumoai.experimental.rfm.local_pquery_driver import (
     LocalPQueryDriver,
     date_offset_to_seconds,
 )
+from kumoai.mixin import CastMixin
 from kumoai.utils import InteractiveProgressLogger, ProgressLogger
 _RANDOM_SEED = 42
@@ -62,6 +70,17 @@ _SIZE_LIMIT_MSG = ("Context size exceeds the 30MB limit. {stats}\nPlease "
                    "beyond this for your use-case.")
+@dataclass(repr=False)
+class ExplainConfig(CastMixin):
+    """Configuration for explainability.
+    Args:
+        skip_summary: Whether to skip generating a human-readable summary of
+            the explanation.
+    """
+    skip_summary: bool = False
 @dataclass(repr=False)
 class Explanation:
     prediction: pd.DataFrame
@@ -89,6 +108,12 @@ class Explanation:
     def __repr__(self) -> str:
         return str((self.prediction, self.summary))
+    def _ipython_display_(self) -> None:
+        from IPython.display import Markdown, display
+        display(self.prediction)
+        display(Markdown(self.summary))
 class KumoRFM:
     r"""The Kumo Relational Foundation model (RFM) from the `KumoRFM: A
@@ -201,6 +226,7 @@ class KumoRFM:
         max_pq_iterations: int = 20,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
     ) -> pd.DataFrame:
         pass
@@ -210,7 +236,7 @@ class KumoRFM:
         query: str,
         indices: Union[List[str], List[float], List[int], None] = None,
         *,
-        explain: Literal[True],
+        explain: Union[Literal[True], ExplainConfig, Dict[str, Any]],
         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
         context_anchor_time: Union[pd.Timestamp, None] = None,
         run_mode: Union[RunMode, str] = RunMode.FAST,
@@ -219,6 +245,7 @@ class KumoRFM:
         max_pq_iterations: int = 20,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
     ) -> Explanation:
         pass
@@ -227,7 +254,7 @@ class KumoRFM:
         query: str,
         indices: Union[List[str], List[float], List[int], None] = None,
         *,
-        explain: bool = False,
+        explain: Union[bool, ExplainConfig, Dict[str, Any]] = False,
         anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
         context_anchor_time: Union[pd.Timestamp, None] = None,
         run_mode: Union[RunMode, str] = RunMode.FAST,
@@ -236,16 +263,23 @@ class KumoRFM:
         max_pq_iterations: int = 20,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
     ) -> Union[pd.DataFrame, Explanation]:
         """Returns predictions for a predictive query.
         Args:
             query: The predictive query.
             indices: The entity primary keys to predict for. Will override the
-                indices given as part of the predictive query.
-            explain: If set to ``True``, will additionally explain the
-                prediction. Explainability is currently only supported for
-                single entity predictions with ``run_mode="FAST"``.
+                indices given as part of the predictive query. Predictions will
+                be generated for all indices, independent of whether they
+                fulfill entity filter constraints. To pre-filter entities, use
+                :meth:`~KumoRFM.is_valid_entity`.
+            explain: Configuration for explainability.
+                If set to ``True``, will additionally explain the prediction.
+                Passing in an :class:`ExplainConfig` instance provides control
+                over which parts of explanation are generated.
+                Explainability is currently only supported for single entity
+                predictions with ``run_mode="FAST"``.
             anchor_time: The anchor timestamp for the prediction. If set to
                 ``None``, will use the maximum timestamp in the data.
                 If set to ``"entity"``, will use the timestamp of the entity.
@@ -263,46 +297,54 @@ class KumoRFM:
                 entities to find valid labels.
             random_seed: A manual seed for generating pseudo-random numbers.
             verbose: Whether to print verbose output.
+            use_prediction_time: Whether to use the anchor timestamp as an
+                additional feature during prediction. This is typically
+                beneficial for time series forecasting tasks.
         Returns:
             The predictions as a :class:`pandas.DataFrame`.
-            If ``explain=True``, additionally returns a textual summary that
-            explains the prediction.
+            If ``explain`` is provided, returns an :class:`Explanation` object
+            containing the prediction, summary, and details.
         """
+        explain_config: Optional[ExplainConfig] = None
+        if explain is True:
+            explain_config = ExplainConfig()
+        elif explain is not False:
+            explain_config = ExplainConfig._cast(explain)
         query_def = self._parse_query(query)
+        query_str = query_def.to_string()
         if num_hops != 2 and num_neighbors is not None:
             warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
                           f"custom 'num_hops={num_hops}' option")
-        if explain and run_mode in {RunMode.NORMAL, RunMode.BEST}:
+        if explain_config is not None and run_mode in {
+                RunMode.NORMAL, RunMode.BEST
+        }:
             warnings.warn(f"Explainability is currently only supported for "
                           f"run mode 'FAST' (got '{run_mode}'). Provided run "
                           f"mode has been reset. Please lower the run mode to "
                           f"suppress this warning.")
         if indices is None:
-            if query_def.entity.ids is None:
+            if query_def.rfm_entity_ids is None:
                 raise ValueError("Cannot find entities to predict for. Please "
                                  "pass them via `predict(query, indices=...)`")
-            indices = query_def.entity.ids.value
+            indices = query_def.get_rfm_entity_id_list()
         else:
-            query_def = replace(
-                query_def,
-                entity=replace(query_def.entity, ids=None),
-            )
+            query_def = replace(query_def, rfm_entity_ids=None)
         if len(indices) == 0:
-            raise ValueError("At least one entity is required for "
-                             "prediction")
+            raise ValueError("At least one entity is required")
-        if explain and len(indices) > 1:
+        if explain_config is not None and len(indices) > 1:
             raise ValueError(
                 f"Cannot explain predictions for more than a single entity "
                 f"(got {len(indices)})")
         query_repr = query_def.to_string(rich=True, exclude_predict=True)
-        if explain:
+        if explain_config is not None:
             msg = f'[bold]EXPLAIN[/bold] {query_repr}'
         else:
             msg = f'[bold]PREDICT[/bold] {query_repr}'
@@ -314,8 +356,8 @@ class KumoRFM:
             batch_size: Optional[int] = None
             if self._batch_size == 'max':
-                task_type = query_def.get_task_type(
-                    stypes=self._graph_store.stype_dict,
+                task_type = LocalPQueryDriver.get_task_type(
+                    query_def,
                     edge_types=self._graph_store.edge_types,
                 )
                 batch_size = _MAX_PRED_SIZE[task_type]
@@ -332,10 +374,9 @@ class KumoRFM:
                 logger.log(f"Splitting {len(indices):,} entities into "
                            f"{len(batches):,} batches of size {batch_size:,}")
-            resps: Union[
-                List[RFMPredictResponse],
-                List[RFMExplanationResponse],
-            ] = []
+            predictions: List[pd.DataFrame] = []
+            summary: Optional[str] = None
+            details: Optional[Explanation] = None
             for i, batch in enumerate(batches):
                 # TODO Re-use the context for subsequent predictions.
                 context = self._get_context(
@@ -354,6 +395,8 @@ class KumoRFM:
                 request = RFMPredictRequest(
                     context=context,
                     run_mode=RunMode(run_mode),
+                    query=query_str,
+                    use_prediction_time=use_prediction_time,
                 )
                 with warnings.catch_warnings():
                     warnings.filterwarnings('ignore', message='gencode')
@@ -376,11 +419,36 @@ class KumoRFM:
                 for attempt in range(self.num_retries + 1):
                     try:
-                        if explain:
-                            resp = global_state.client.rfm_api.explain(_bytes)
+                        if explain_config is not None:
+                            resp = global_state.client.rfm_api.explain(
+                                request=_bytes,
+                                skip_summary=explain_config.skip_summary,
+                            )
+                            summary = resp.summary
+                            details = resp.details
                         else:
                             resp = global_state.client.rfm_api.predict(_bytes)
-                        resps.append(resp)
+                        df = pd.DataFrame(**resp.prediction)
+                        # Cast 'ENTITY' to correct data type:
+                        if 'ENTITY' in df:
+                            entity = query_def.entity_table
+                            pkey_map = self._graph_store.pkey_map_dict[entity]
+                            df['ENTITY'] = df['ENTITY'].astype(
+                                type(pkey_map.index[0]))
+                        # Cast 'ANCHOR_TIMESTAMP' to correct data type:
+                        if 'ANCHOR_TIMESTAMP' in df:
+                            ser = df['ANCHOR_TIMESTAMP']
+                            if not pd.api.types.is_datetime64_any_dtype(ser):
+                                if isinstance(ser.iloc[0], str):
+                                    unit = None
+                                else:
+                                    unit = 'ms'
+                                df['ANCHOR_TIMESTAMP'] = pd.to_datetime(
+                                    ser, errors='coerce', unit=unit)
+                        predictions.append(df)
                         if (isinstance(verbose, InteractiveProgressLogger)
                                 and len(batches) > 1):
@@ -401,22 +469,73 @@ class KumoRFM:
                         time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...
-        predictions = [pd.DataFrame(**resp.prediction) for resp in resps]
         if len(predictions) == 1:
             prediction = predictions[0]
         else:
             prediction = pd.concat(predictions, ignore_index=True)
-        if explain:
-            assert len(resps) == 1
+        if explain_config is not None:
+            assert len(predictions) == 1
+            assert summary is not None
+            assert details is not None
             return Explanation(
                 prediction=prediction,
-                summary=resps[0].summary,
-                details=resps[0].details,
+                summary=summary,
+                details=details,
             )
         return prediction
+    def is_valid_entity(
+        self,
+        query: str,
+        indices: Union[List[str], List[float], List[int], None] = None,
+        *,
+        anchor_time: Union[pd.Timestamp, Literal['entity'], None] = None,
+    ) -> np.ndarray:
+        r"""Returns a mask that denotes which entities are valid for the
+        given predictive query, *i.e.*, which entities fulfill (temporal)
+        entity filter constraints.
+        Args:
+            query: The predictive query.
+            indices: The entity primary keys to predict for. Will override the
+                indices given as part of the predictive query.
+            anchor_time: The anchor timestamp for the prediction. If set to
+                ``None``, will use the maximum timestamp in the data.
+                If set to ``"entity"``, will use the timestamp of the entity.
+        """
+        query_def = self._parse_query(query)
+        if indices is None:
+            if query_def.rfm_entity_ids is None:
+                raise ValueError("Cannot find entities to predict for. Please "
+                                 "pass them via "
+                                 "`is_valid_entity(query, indices=...)`")
+            indices = query_def.get_rfm_entity_id_list()
+        if len(indices) == 0:
+            raise ValueError("At least one entity is required")
+        if anchor_time is None:
+            anchor_time = self._graph_store.max_time
+        if isinstance(anchor_time, pd.Timestamp):
+            self._validate_time(query_def, anchor_time, None, False)
+        else:
+            assert anchor_time == 'entity'
+            if (query_def.entity_table not in self._graph_store.time_dict):
+                raise ValueError(f"Anchor time 'entity' requires the entity "
+                                 f"table '{query_def.entity_table}' "
+                                 f"to have a time column.")
+        node = self._graph_store.get_node_id(
+            table_name=query_def.entity_table,
+            pkey=pd.Series(indices),
+        )
+        query_driver = LocalPQueryDriver(self._graph_store, query_def)
+        return query_driver.is_valid(node, anchor_time)
     def evaluate(
         self,
         query: str,
@@ -430,6 +549,7 @@ class KumoRFM:
         max_pq_iterations: int = 20,
         random_seed: Optional[int] = _RANDOM_SEED,
         verbose: Union[bool, ProgressLogger] = True,
+        use_prediction_time: bool = False,
     ) -> pd.DataFrame:
         """Evaluates a predictive query.
@@ -453,6 +573,9 @@ class KumoRFM:
                 entities to find valid labels.
             random_seed: A manual seed for generating pseudo-random numbers.
             verbose: Whether to print verbose output.
+            use_prediction_time: Whether to use the anchor timestamp as an
+                additional feature during prediction. This is typically
+                beneficial for time series forecasting tasks.
         Returns:
             The metrics as a :class:`pandas.DataFrame`
@@ -463,10 +586,10 @@ class KumoRFM:
             warnings.warn(f"Received custom 'num_neighbors' option; ignoring "
                           f"custom 'num_hops={num_hops}' option")
-        if query_def.entity.ids is not None:
+        if query_def.rfm_entity_ids is not None:
             query_def = replace(
                 query_def,
-                entity=replace(query_def.entity, ids=None),
+                rfm_entity_ids=None,
             )
         query_repr = query_def.to_string(rich=True, exclude_predict=True)
@@ -496,6 +619,7 @@ class KumoRFM:
                 context=context,
                 run_mode=RunMode(run_mode),
                 metrics=metrics,
+                use_prediction_time=use_prediction_time,
             )
             with warnings.catch_warnings():
                 warnings.filterwarnings('ignore', message='Protobuf gencode')
@@ -506,7 +630,7 @@ class KumoRFM:
             if len(request_bytes) > _MAX_SIZE:
                 stats_msg = Context.get_memory_stats(request_msg.context)
-                raise ValueError(_SIZE_LIMIT_MSG.format(stats_msg=stats_msg))
+                raise ValueError(_SIZE_LIMIT_MSG.format(stats=stats_msg))
             try:
                 resp = global_state.client.rfm_api.evaluate(request_bytes)
@@ -554,18 +678,19 @@ class KumoRFM:
         if anchor_time is None:
             anchor_time = self._graph_store.max_time
-            anchor_time = anchor_time - (query_def.target.end_offset *
-                                         query_def.num_forecasts)
+            if query_def.target_ast.date_offset_range is not None:
+                anchor_time = anchor_time - (
+                    query_def.target_ast.date_offset_range.end_date_offset *
+                    query_def.num_forecasts)
         assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
             self._validate_time(query_def, anchor_time, None, evaluate=True)
         else:
             assert anchor_time == 'entity'
-            if (query_def.entity.pkey.table_name
-                    not in self._graph_store.time_dict):
+            if (query_def.entity_table not in self._graph_store.time_dict):
                 raise ValueError(f"Anchor time 'entity' requires the entity "
-                                 f"table '{query_def.entity.pkey.table_name}' "
+                                 f"table '{query_def.entity_table}' "
                                  f"to have a time column")
         query_driver = LocalPQueryDriver(self._graph_store, query_def,
@@ -580,7 +705,7 @@ class KumoRFM:
         )
         entity = self._graph_store.pkey_map_dict[
-            query_def.entity.pkey.table_name].index[node]
+            query_def.entity_table].index[node]
         return pd.DataFrame({
             'ENTITY': entity,
@@ -590,8 +715,8 @@ class KumoRFM:
     # Helpers #################################################################
-    def _parse_query(self, query: str) -> PQueryDefinition:
-        if isinstance(query, PQueryDefinition):
+    def _parse_query(self, query: str) -> ValidatedPredictiveQuery:
+        if isinstance(query, ValidatedPredictiveQuery):
             return query
         if isinstance(query, str) and query.strip()[:9].lower() == 'evaluate ':
@@ -601,12 +726,12 @@ class KumoRFM:
                              "predictions or evaluations.")
         try:
-            request = RFMValidateQueryRequest(
+            request = RFMParseQueryRequest(
                 query=query,
                 graph_definition=self._graph_def,
             )
-            resp = global_state.client.rfm_api.validate_query(request)
+            resp = global_state.client.rfm_api.parse_query(request)
             # TODO Expose validation warnings.
             if len(resp.validation_response.warnings) > 0:
@@ -617,7 +742,7 @@ class KumoRFM:
                 warnings.warn(f"Encountered the following warnings during "
                               f"parsing:\n{msg}")
-            return resp.query_definition
+            return resp.query
         except HTTPException as e:
             try:
                 msg = json.loads(e.detail)['detail']
@@ -628,7 +753,7 @@ class KumoRFM:
     def _validate_time(
         self,
-        query: PQueryDefinition,
+        query: ValidatedPredictiveQuery,
         anchor_time: pd.Timestamp,
         context_anchor_time: Union[pd.Timestamp, None],
         evaluate: bool,
@@ -651,6 +776,11 @@ class KumoRFM:
                              f"only contains data back to "
                              f"'{self._graph_store.min_time}'.")
+        if query.target_ast.date_offset_range is not None:
+            end_offset = query.target_ast.date_offset_range.end_date_offset
+        else:
+            end_offset = pd.DateOffset(0)
+        forecast_end_offset = end_offset * query.num_forecasts
         if (context_anchor_time is not None
                 and context_anchor_time > anchor_time):
             warnings.warn(f"Context anchor timestamp "
@@ -659,19 +789,18 @@ class KumoRFM:
                           f"(got '{anchor_time}'). Please make sure this is "
                           f"intended.")
         elif (query.query_type == QueryType.TEMPORAL
-              and context_anchor_time is not None and context_anchor_time +
-              query.target.end_offset * query.num_forecasts > anchor_time):
+              and context_anchor_time is not None
+              and context_anchor_time + forecast_end_offset > anchor_time):
             warnings.warn(f"Aggregation for context examples at timestamp "
                           f"'{context_anchor_time}' will leak information "
                           f"from the prediction anchor timestamp "
                           f"'{anchor_time}'. Please make sure this is "
                           f"intended.")
-        elif (context_anchor_time is not None and context_anchor_time -
-              query.target.end_offset * query.num_forecasts
+        elif (context_anchor_time is not None
+              and context_anchor_time - forecast_end_offset
               < self._graph_store.min_time):
-            _time = context_anchor_time - (query.target.end_offset *
-                                           query.num_forecasts)
+            _time = context_anchor_time - forecast_end_offset
             warnings.warn(f"Context anchor timestamp is too early or "
                           f"aggregation time range is too large. To form "
                           f"proper input data, we would need data back to "
@@ -684,8 +813,7 @@ class KumoRFM:
                           f"latest timestamp '{self._graph_store.max_time}' "
                           f"in the data. Please make sure this is intended.")
-        max_eval_time = (self._graph_store.max_time -
-                         query.target.end_offset * query.num_forecasts)
+        max_eval_time = self._graph_store.max_time - forecast_end_offset
         if evaluate and anchor_time > max_eval_time:
             raise ValueError(
                 f"Anchor timestamp for evaluation is after the latest "
@@ -693,7 +821,7 @@ class KumoRFM:
     def _get_context(
         self,
-        query: PQueryDefinition,
+        query: ValidatedPredictiveQuery,
         indices: Union[List[str], List[float], List[int], None],
         anchor_time: Union[pd.Timestamp, Literal['entity'], None],
         context_anchor_time: Union[pd.Timestamp, None],
@@ -721,8 +849,8 @@ class KumoRFM:
                              f"must go beyond this for your use-case.")
         query_driver = LocalPQueryDriver(self._graph_store, query, random_seed)
-        task_type = query.get_task_type(
-            stypes=self._graph_store.stype_dict,
+        task_type = LocalPQueryDriver.get_task_type(
+            query,
             edge_types=self._graph_store.edge_types,
         )
@@ -754,11 +882,15 @@ class KumoRFM:
             else:
                 num_neighbors = [64, 64, 8, 8, 4, 4][:num_hops]
+        if query.target_ast.date_offset_range is None:
+            end_offset = pd.DateOffset(0)
+        else:
+            end_offset = query.target_ast.date_offset_range.end_date_offset
+        forecast_end_offset = end_offset * query.num_forecasts
         if anchor_time is None:
             anchor_time = self._graph_store.max_time
             if evaluate:
-                anchor_time = anchor_time - (query.target.end_offset *
-                                             query.num_forecasts)
+                anchor_time = anchor_time - forecast_end_offset
             if logger is not None:
                 assert isinstance(anchor_time, pd.Timestamp)
                 if anchor_time == pd.Timestamp.min:
@@ -773,15 +905,14 @@ class KumoRFM:
         assert anchor_time is not None
         if isinstance(anchor_time, pd.Timestamp):
             if context_anchor_time is None:
-                context_anchor_time = anchor_time - (query.target.end_offset *
-                                                     query.num_forecasts)
+                context_anchor_time = anchor_time - forecast_end_offset
             self._validate_time(query, anchor_time, context_anchor_time,
                                 evaluate)
         else:
             assert anchor_time == 'entity'
-            if query.entity.pkey.table_name not in self._graph_store.time_dict:
+            if query.entity_table not in self._graph_store.time_dict:
                 raise ValueError(f"Anchor time 'entity' requires the entity "
-                                 f"table '{query.entity.pkey.table_name}' to "
+                                 f"table '{query.entity_table}' to "
                                  f"have a time column")
             if context_anchor_time is not None:
                 warnings.warn("Ignoring option 'context_anchor_time' for "
@@ -832,7 +963,7 @@ class KumoRFM:
                                  f"in batches")
             test_node = self._graph_store.get_node_id(
-                table_name=query.entity.pkey.table_name,
+                table_name=query.entity_table,
                 pkey=pd.Series(indices),
             )
@@ -840,8 +971,7 @@ class KumoRFM:
                 test_time = pd.Series(anchor_time).repeat(
                     len(test_node)).reset_index(drop=True)
             else:
-                time = self._graph_store.time_dict[
-                    query.entity.pkey.table_name]
+                time = self._graph_store.time_dict[query.entity_table]
                 time = time[test_node] * 1000**3
                 test_time = pd.Series(time, dtype='datetime64[ns]')
@@ -874,12 +1004,23 @@ class KumoRFM:
                 raise NotImplementedError
             logger.log(msg)
-        entity_table_names = query.get_entity_table_names(
-            self._graph_store.edge_types)
+        entity_table_names: Tuple[str, ...]
+        if task_type.is_link_pred:
+            final_aggr = query.get_final_target_aggregation()
+            assert final_aggr is not None
+            edge_fkey = final_aggr._get_target_column_name()
+            for edge_type in self._graph_store.edge_types:
+                if edge_fkey == f'{edge_type[0]}.{edge_type[1]}':
+                    entity_table_names = (
+                        query.entity_table,
+                        edge_type[2],
+                    )
+        else:
+            entity_table_names = (query.entity_table, )
         # Exclude the entity anchor time from the feature set to prevent
         # running out-of-distribution between in-context and test examples:
-        exclude_cols_dict = query.exclude_cols_dict
+        exclude_cols_dict = query.get_exclude_cols_dict()
         if anchor_time == 'entity':
             if entity_table_names[0] not in exclude_cols_dict:
                 exclude_cols_dict[entity_table_names[0]] = []
@@ -908,7 +1049,7 @@ class KumoRFM:
         step_size: Optional[int] = None
         if query.query_type == QueryType.TEMPORAL:
-            step_size = date_offset_to_seconds(query.target.end_offset)
+            step_size = date_offset_to_seconds(end_offset)
         return Context(
             task_type=task_type,
@@ -933,7 +1074,7 @@ class KumoRFM:
         elif task_type == TaskType.MULTICLASS_CLASSIFICATION:
             supported_metrics = ['acc', 'precision', 'recall', 'f1', 'mrr']
         elif task_type == TaskType.REGRESSION:
-            supported_metrics = ['mae', 'mape', 'mse', 'rmse', 'smape']
+            supported_metrics = ['mae', 'mape', 'mse', 'rmse', 'smape', 'r2']
         elif task_type == TaskType.TEMPORAL_LINK_PREDICTION:
             supported_metrics = [
                 'map@', 'ndcg@', 'mrr@', 'precision@', 'recall@', 'f1@',

kumoai/jobs.py CHANGED Viewed

@@ -26,6 +26,7 @@ class JobInterface(ABC, Generic[IDType, JobRequestType, JobResourceType]):
             limit (int): Max number of jobs to list, default 10.
         Example:
+            >>> # doctest: +SKIP
             >>> tags = {'pquery_name': 'my_pquery_name'}
             >>> jobs = BatchPredictionJob.search_by_tags(tags)
             Search limited to 10 results based on the `limit` parameter.

kumoai/pquery/predictive_query.py CHANGED Viewed

@@ -370,9 +370,11 @@ class PredictiveQuery:
         train_table_job_api = global_state.client.generate_train_table_job_api
         job_id: GenerateTrainTableJobID = train_table_job_api.create(
             GenerateTrainTableRequest(
-                dict(custom_tags), pq_id, plan,
-                graph_snapshot_id=self.graph.snapshot(
-                    non_blocking=non_blocking)))
+                dict(custom_tags),
+                pq_id,
+                plan,
+                None,
+            ))
         self._train_table = TrainingTableJob(job_id=job_id)
         if non_blocking:
@@ -451,9 +453,11 @@ class PredictiveQuery:
         bp_table_api = global_state.client.generate_prediction_table_job_api
         job_id: GeneratePredictionTableJobID = bp_table_api.create(
             GeneratePredictionTableRequest(
-                dict(custom_tags), pq_id, plan,
-                graph_snapshot_id=self.graph.snapshot(
-                    non_blocking=non_blocking)))
+                dict(custom_tags),
+                pq_id,
+                plan,
+                None,
+            ))
         self._prediction_table = PredictionTableJob(job_id=job_id)
         if non_blocking:

kumoai 2.10.0.dev202510021830__py3-none-any.whl → 2.12.1__py3-none-any.whl

kumoai 2.10.0.dev202510021830__py3-none-any.whl → 2.12.1__py3-none-any.whl