kumoai 2.13.0.dev202512081731__cp313-cp313-macosx_11_0_arm64.whl → 2.14.0.dev202512151351__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. kumoai/_version.py +1 -1
  2. kumoai/client/pquery.py +6 -2
  3. kumoai/experimental/rfm/backend/local/graph_store.py +19 -62
  4. kumoai/experimental/rfm/backend/local/sampler.py +213 -14
  5. kumoai/experimental/rfm/backend/local/table.py +12 -2
  6. kumoai/experimental/rfm/backend/snow/__init__.py +2 -0
  7. kumoai/experimental/rfm/backend/snow/sampler.py +264 -0
  8. kumoai/experimental/rfm/backend/snow/table.py +35 -17
  9. kumoai/experimental/rfm/backend/sqlite/__init__.py +2 -0
  10. kumoai/experimental/rfm/backend/sqlite/sampler.py +354 -0
  11. kumoai/experimental/rfm/backend/sqlite/table.py +36 -11
  12. kumoai/experimental/rfm/base/__init__.py +17 -6
  13. kumoai/experimental/rfm/base/sampler.py +438 -38
  14. kumoai/experimental/rfm/base/source.py +1 -0
  15. kumoai/experimental/rfm/base/sql_sampler.py +56 -0
  16. kumoai/experimental/rfm/base/table.py +12 -1
  17. kumoai/experimental/rfm/graph.py +26 -9
  18. kumoai/experimental/rfm/pquery/pandas_executor.py +1 -1
  19. kumoai/experimental/rfm/rfm.py +214 -151
  20. kumoai/pquery/predictive_query.py +10 -6
  21. kumoai/testing/snow.py +50 -0
  22. kumoai/utils/__init__.py +2 -0
  23. kumoai/utils/sql.py +3 -0
  24. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512151351.dist-info}/METADATA +2 -2
  25. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512151351.dist-info}/RECORD +28 -25
  26. kumoai/experimental/rfm/local_graph_sampler.py +0 -223
  27. kumoai/experimental/rfm/local_pquery_driver.py +0 -689
  28. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512151351.dist-info}/WHEEL +0 -0
  29. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512151351.dist-info}/licenses/LICENSE +0 -0
  30. {kumoai-2.13.0.dev202512081731.dist-info → kumoai-2.14.0.dev202512151351.dist-info}/top_level.txt +0 -0
kumoai/_version.py CHANGED
@@ -1 +1 @@
- __version__ = '2.13.0.dev202512081731'
+ __version__ = '2.14.0.dev202512151351'
kumoai/client/pquery.py CHANGED
@@ -176,8 +176,12 @@ def filter_model_plan(
  # Undefined
  pass

- new_opt_fields.append((field.name, _type, default))
- new_opts.append(getattr(section, field.name))
+ # Forward compatibility - Remove any newly introduced arguments not
+ # yet returned by the backend:
+ value = getattr(section, field.name)
+ if value != MissingType.VALUE:
+ new_opt_fields.append((field.name, _type, default))
+ new_opts.append(value)

  Section = dataclass(
  config=dict(validate_assignment=True),
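Review note: the replacement block above drops dataclass fields whose value still carries the `MissingType.VALUE` sentinel, so an older backend that does not yet return a newly added option cannot break client-side reconstruction. A minimal self-contained sketch of the same pattern (`MissingType`, `Section`, and `keep_known_options` here are stand-ins, not kumoai's actual definitions):

```python
import enum
from dataclasses import dataclass, fields

class MissingType(enum.Enum):  # stand-in for kumoai's MissingType sentinel
    VALUE = 'missing'

@dataclass
class Section:  # hypothetical options section returned by the backend
    lr: float = 0.01
    new_flag: object = MissingType.VALUE  # not yet returned by the backend

def keep_known_options(section: Section) -> dict:
    """Drop fields whose value the backend did not return."""
    return {
        f.name: getattr(section, f.name)
        for f in fields(section)
        if getattr(section, f.name) != MissingType.VALUE
    }

assert keep_known_options(Section()) == {'lr': 0.01}
```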
kumoai/experimental/rfm/backend/local/graph_store.py CHANGED
@@ -34,26 +34,21 @@ class LocalGraphStore:

  with verbose as logger:
  self.df_dict, self.mask_dict = self.sanitize(graph)
- self.stype_dict = self.get_stype_dict(graph)
  logger.log("Sanitized input data")

- self.pkey_name_dict, self.pkey_map_dict = self.get_pkey_data(graph)
+ self.pkey_map_dict = self.get_pkey_map_dict(graph)
  num_pkeys = sum(t.has_primary_key() for t in graph.tables.values())
  if num_pkeys > 1:
  logger.log(f"Collected primary keys from {num_pkeys} tables")
  else:
  logger.log(f"Collected primary key from {num_pkeys} table")

- (
- self.time_column_dict,
- self.end_time_column_dict,
- self.time_dict,
- self.min_time,
- self.max_time,
- ) = self.get_time_data(graph)
- if self.max_time != pd.Timestamp.min:
+ self.time_dict, self.min_max_time_dict = self.get_time_data(graph)
+ if len(self.min_max_time_dict) > 0:
+ min_time = min(t for t, _ in self.min_max_time_dict.values())
+ max_time = max(t for _, t in self.min_max_time_dict.values())
  logger.log(f"Identified temporal graph from "
- f"{self.min_time.date()} to {self.max_time.date()}")
+ f"{min_time.date()} to {max_time.date()}")
  else:
  logger.log("Identified static graph without timestamps")

@@ -63,14 +58,6 @@ class LocalGraphStore:
  logger.log(f"Created graph with {num_nodes:,} nodes and "
  f"{num_edges:,} edges")

- @property
- def node_types(self) -> List[str]:
- return list(self.df_dict.keys())
-
- @property
- def edge_types(self) -> List[Tuple[str, str, str]]:
- return list(self.row_dict.keys())
-
  def get_node_id(self, table_name: str, pkey: pd.Series) -> np.ndarray:
  r"""Returns the node ID given primary keys.

@@ -154,34 +141,16 @@ class LocalGraphStore:

  return df_dict, mask_dict

- def get_stype_dict(self, graph: 'Graph') -> Dict[str, Dict[str, Stype]]:
- stype_dict: Dict[str, Dict[str, Stype]] = {}
- foreign_keys = {(edge.src_table, edge.fkey) for edge in graph.edges}
- for table in graph.tables.values():
- stype_dict[table.name] = {}
- for column in table.columns:
- if column == table.primary_key:
- continue
- if (table.name, column.name) in foreign_keys:
- continue
- stype_dict[table.name][column.name] = column.stype
- return stype_dict
-
- def get_pkey_data(
+ def get_pkey_map_dict(
  self,
  graph: 'Graph',
- ) -> Tuple[
- Dict[str, str],
- Dict[str, pd.DataFrame],
- ]:
- pkey_name_dict: Dict[str, str] = {}
+ ) -> Dict[str, pd.DataFrame]:
  pkey_map_dict: Dict[str, pd.DataFrame] = {}

  for table in graph.tables.values():
  if table._primary_key is None:
  continue

- pkey_name_dict[table.name] = table._primary_key
  pkey = self.df_dict[table.name][table._primary_key]
  pkey_map = pd.DataFrame(
  dict(arange=range(len(pkey))),
@@ -203,27 +172,18 @@ class LocalGraphStore:

  pkey_map_dict[table.name] = pkey_map

- return pkey_name_dict, pkey_map_dict
+ return pkey_map_dict

  def get_time_data(
  self,
  graph: 'Graph',
  ) -> Tuple[
- Dict[str, str],
- Dict[str, str],
  Dict[str, np.ndarray],
- pd.Timestamp,
- pd.Timestamp,
+ Dict[str, Tuple[pd.Timestamp, pd.Timestamp]],
  ]:
- time_column_dict: Dict[str, str] = {}
- end_time_column_dict: Dict[str, str] = {}
  time_dict: Dict[str, np.ndarray] = {}
- min_time = pd.Timestamp.max
- max_time = pd.Timestamp.min
+ min_max_time_dict: Dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {}
  for table in graph.tables.values():
- if table._end_time_column is not None:
- end_time_column_dict[table.name] = table._end_time_column
-
  if table._time_column is None:
  continue

@@ -231,21 +191,18 @@ class LocalGraphStore:
  if time.dtype != 'datetime64[ns]':
  time = time.astype('datetime64[ns]')
  time_dict[table.name] = time.astype(int).to_numpy() // 1000**3
- time_column_dict[table.name] = table._time_column

  if table.name in self.mask_dict.keys():
  time = time[self.mask_dict[table.name]]
  if len(time) > 0:
- min_time = min(min_time, time.min())
- max_time = max(max_time, time.max())
-
- return (
- time_column_dict,
- end_time_column_dict,
- time_dict,
- min_time,
- max_time,
- )
+ min_max_time_dict[table.name] = (time.min(), time.max())
+ else:
+ min_max_time_dict[table.name] = (
+ pd.Timestamp.max,
+ pd.Timestamp.min,
+ )
+
+ return time_dict, min_max_time_dict

  def get_csc(
  self,
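Review note: the store now tracks a per-table `(min, max)` timestamp pair instead of two store-wide scalars, and the global range is derived on demand, exactly as in `LocalGraphStore.__init__` above. A small sketch of that aggregation (table names and dates invented for illustration):

```python
import pandas as pd

# Invented per-table (min, max) pairs; empty tables would hold the
# (pd.Timestamp.max, pd.Timestamp.min) sentinel set in get_time_data().
min_max_time_dict: dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {
    'users': (pd.Timestamp('2020-01-05'), pd.Timestamp('2021-03-01')),
    'orders': (pd.Timestamp('2019-11-20'), pd.Timestamp('2021-06-30')),
}

if len(min_max_time_dict) > 0:  # temporal graph
    min_time = min(t for t, _ in min_max_time_dict.values())
    max_time = max(t for _, t in min_max_time_dict.values())
    print(f'temporal graph from {min_time.date()} to {max_time.date()}')
else:  # static graph
    print('static graph without timestamps')
```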
kumoai/experimental/rfm/backend/local/sampler.py CHANGED
@@ -1,10 +1,12 @@
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Literal

  import numpy as np
  import pandas as pd
+ from kumoapi.pquery import ValidatedPredictiveQuery

  from kumoai.experimental.rfm.backend.local import LocalGraphStore
- from kumoai.experimental.rfm.base import BackwardSamplerOutput, Sampler
+ from kumoai.experimental.rfm.base import Sampler, SamplerOutput
+ from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
  from kumoai.utils import ProgressLogger

  if TYPE_CHECKING:
@@ -17,7 +19,7 @@ class LocalSampler(Sampler):
  graph: 'Graph',
  verbose: bool | ProgressLogger = True,
  ) -> None:
- super().__init__(graph=graph)
+ super().__init__(graph=graph, verbose=verbose)

  import kumoai.kumolib as kumolib

@@ -36,19 +38,32 @@ class LocalSampler(Sampler):
  self._graph_store.time_dict,
  )

- def _sample_backward(
+ def _get_min_max_time_dict(
+ self,
+ table_names: list[str],
+ ) -> dict[str, tuple[pd.Timestamp, pd.Timestamp]]:
+ return {
+ key: value
+ for key, value in self._graph_store.min_max_time_dict.items()
+ if key in table_names
+ }
+
+ def _sample_subgraph(
  self,
  entity_table_name: str,
  entity_pkey: pd.Series,
- anchor_time: pd.Series,
+ anchor_time: pd.Series | Literal['entity'],
  columns_dict: dict[str, set[str]],
  num_neighbors: list[int],
- ) -> BackwardSamplerOutput:
+ ) -> SamplerOutput:

- num_neighbors_dict: dict[str, list[int]] = {
- '__'.join(edge_type): num_neighbors
- for edge_type in self.edge_types
- }
+ index = self._graph_store.get_node_id(entity_table_name, entity_pkey)
+
+ if isinstance(anchor_time, pd.Series):
+ time = anchor_time.astype(int).to_numpy() // 1000**3 # to seconds
+ else:
+ assert anchor_time == 'entity'
+ time = self._graph_store.time_dict[entity_table_name][index]

  (
  row_dict,
@@ -58,11 +73,14 @@ class LocalSampler(Sampler):
  num_sampled_nodes_dict,
  num_sampled_edges_dict,
  ) = self._graph_sampler.sample(
- num_neighbors_dict,
+ {
+ '__'.join(edge_type): num_neighbors
+ for edge_type in self.edge_types
+ },
  {},
  entity_table_name,
- self._graph_store.get_node_id(entity_table_name, entity_pkey),
- anchor_time.astype(int).to_numpy() // 1000**3, # to seconds
+ index,
+ time,
  )

  df_dict: dict[str, pd.DataFrame] = {}
@@ -105,7 +123,8 @@ class LocalSampler(Sampler):
  for edge_type in self.edge_types
  }

- return BackwardSamplerOutput(
+ return SamplerOutput(
+ anchor_time=time * 1000**3, # to nanoseconds
  df_dict=df_dict,
  inverse_dict=inverse_dict,
  batch_dict=batch_dict,
@@ -114,3 +133,183 @@ class LocalSampler(Sampler):
  col_dict=col_dict,
  num_sampled_edges_dict=num_sampled_edges_dict,
  )
+
+ def _sample_entity_table(
+ self,
+ table_name: str,
+ columns: set[str],
+ num_rows: int,
+ random_seed: int | None = None,
+ ) -> pd.DataFrame:
+ pkey_map = self._graph_store.pkey_map_dict[table_name]
+ if len(pkey_map) > num_rows:
+ pkey_map = pkey_map.sample(
+ n=num_rows,
+ random_state=random_seed,
+ ignore_index=True,
+ )
+ df = self._graph_store.df_dict[table_name]
+ df = df.iloc[pkey_map['arange']][list(columns)]
+ return df
+
+ def _sample_target(
+ self,
+ query: ValidatedPredictiveQuery,
+ entity_df: pd.DataFrame,
+ train_index: np.ndarray,
+ train_time: pd.Series,
+ num_train_examples: int,
+ test_index: np.ndarray,
+ test_time: pd.Series,
+ num_test_examples: int,
+ columns_dict: dict[str, set[str]],
+ time_offset_dict: dict[
+ tuple[str, str, str],
+ tuple[pd.DateOffset | None, pd.DateOffset],
+ ],
+ ) -> tuple[pd.Series, np.ndarray, pd.Series, np.ndarray]:
+
+ train_y, train_mask = self._sample_target_set(
+ query=query,
+ pkey=entity_df[self.primary_key_dict[query.entity_table]],
+ index=train_index,
+ anchor_time=train_time,
+ num_examples=num_train_examples,
+ columns_dict=columns_dict,
+ time_offset_dict=time_offset_dict,
+ )
+
+ test_y, test_mask = self._sample_target_set(
+ query=query,
+ pkey=entity_df[self.primary_key_dict[query.entity_table]],
+ index=test_index,
+ anchor_time=test_time,
+ num_examples=num_test_examples,
+ columns_dict=columns_dict,
+ time_offset_dict=time_offset_dict,
+ )
+
+ return train_y, train_mask, test_y, test_mask
+
+ # Helper Methods ##########################################################
+
+ def _sample_target_set(
+ self,
+ query: ValidatedPredictiveQuery,
+ pkey: pd.Series,
+ index: np.ndarray,
+ anchor_time: pd.Series,
+ num_examples: int,
+ columns_dict: dict[str, set[str]],
+ time_offset_dict: dict[
+ tuple[str, str, str],
+ tuple[pd.DateOffset | None, pd.DateOffset],
+ ],
+ batch_size: int = 10_000,
+ ) -> tuple[pd.Series, np.ndarray]:
+
+ num_hops = 1 if len(time_offset_dict) > 0 else 0
+ num_neighbors_dict: dict[str, list[int]] = {}
+ unix_time_offset_dict: dict[str, list[list[int | None]]] = {}
+ for edge_type, (start, end) in time_offset_dict.items():
+ unix_time_offset_dict['__'.join(edge_type)] = [[
+ date_offset_to_seconds(start) if start is not None else None,
+ date_offset_to_seconds(end),
+ ]]
+ for edge_type in set(self.edge_types) - set(time_offset_dict.keys()):
+ num_neighbors_dict['__'.join(edge_type)] = [0] * num_hops
+
+ if anchor_time.dtype != 'datetime64[ns]':
+ anchor_time = anchor_time.astype('datetime64')
+
+ count = 0
+ ys: list[pd.Series] = []
+ mask = np.full(len(index), False, dtype=bool)
+ for start in range(0, len(index), batch_size):
+ subset = pkey.iloc[index[start:start + batch_size]]
+ time = anchor_time.iloc[start:start + batch_size]
+
+ _, _, node_dict, batch_dict, _, _ = self._graph_sampler.sample(
+ num_neighbors_dict,
+ unix_time_offset_dict,
+ query.entity_table,
+ self._graph_store.get_node_id(query.entity_table, subset),
+ time.astype(int).to_numpy() // 1000**3, # to seconds
+ )
+
+ feat_dict: dict[str, pd.DataFrame] = {}
+ time_dict: dict[str, pd.Series] = {}
+ for table_name, columns in columns_dict.items():
+ df = self._graph_store.df_dict[table_name]
+ df = df.iloc[node_dict[table_name]].reset_index(drop=True)
+ df = df[list(columns)]
+ feat_dict[table_name] = df
+
+ time_column = self.time_column_dict.get(table_name)
+ if time_column in columns:
+ time_dict[table_name] = df[time_column]
+
+ y, _mask = PQueryPandasExecutor().execute(
+ query=query,
+ feat_dict=feat_dict,
+ time_dict=time_dict,
+ batch_dict=batch_dict,
+ anchor_time=time,
+ num_forecasts=query.num_forecasts,
+ )
+ ys.append(y)
+ mask[start:start + batch_size] = _mask
+
+ count += len(y)
+ if count >= num_examples:
+ break
+
+ if len(ys) == 0:
+ y = pd.Series([], dtype=float)
+ elif len(ys) == 1:
+ y = ys[0]
+ else:
+ y = pd.concat(ys, axis=0, ignore_index=True)
+
+ return y, mask
+
+
+ # Helper Functions ############################################################
+
+
+ def date_offset_to_seconds(offset: pd.DateOffset) -> int:
+ r"""Convert a :class:`pandas.DateOffset` into a number of seconds.
+
+ .. note::
+ We are conservative and take months and years as their maximum value.
+ Additional values are then dropped in label computation where we know
+ the actual dates.
+ """
+ MAX_DAYS_IN_MONTH = 31
+ MAX_DAYS_IN_YEAR = 366
+
+ SECONDS_IN_MINUTE = 60
+ SECONDS_IN_HOUR = 60 * SECONDS_IN_MINUTE
+ SECONDS_IN_DAY = 24 * SECONDS_IN_HOUR
+
+ total_sec = 0
+ multiplier = getattr(offset, 'n', 1) # The multiplier (if present).
+
+ for attr, value in offset.__dict__.items():
+ if value is None or value == 0:
+ continue
+ scaled_value = value * multiplier
+ if attr == 'years':
+ total_sec += scaled_value * MAX_DAYS_IN_YEAR * SECONDS_IN_DAY
+ elif attr == 'months':
+ total_sec += scaled_value * MAX_DAYS_IN_MONTH * SECONDS_IN_DAY
+ elif attr == 'days':
+ total_sec += scaled_value * SECONDS_IN_DAY
+ elif attr == 'hours':
+ total_sec += scaled_value * SECONDS_IN_HOUR
+ elif attr == 'minutes':
+ total_sec += scaled_value * SECONDS_IN_MINUTE
+ elif attr == 'seconds':
+ total_sec += scaled_value
+
+ return total_sec
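Review note: `date_offset_to_seconds` deliberately over-approximates, counting a month as 31 days and a year as 366, so that time-window sampling never excludes events the exact label computation might still need; surplus rows are dropped later once actual dates are known. A standalone sketch of the same upper-bound arithmetic on plain component dicts (`upper_bound_seconds` is a hypothetical helper that avoids pandas internals):

```python
SECONDS_IN_DAY = 24 * 60 * 60

# Conservative per-component factors matching the helper above:
_FACTORS = {
    'years': 366 * SECONDS_IN_DAY,
    'months': 31 * SECONDS_IN_DAY,
    'days': SECONDS_IN_DAY,
    'hours': 60 * 60,
    'minutes': 60,
    'seconds': 1,
}

def upper_bound_seconds(components: dict[str, int]) -> int:
    """Upper-bound a date offset in seconds (month=31d, year=366d)."""
    return sum(_FACTORS[key] * value for key, value in components.items())

# A 2-month window is bounded by 2 * 31 * 86,400 = 5,356,800 seconds:
assert upper_bound_seconds({'months': 2}) == 5_356_800
```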
kumoai/experimental/rfm/backend/local/table.py CHANGED
@@ -1,9 +1,14 @@
  import warnings
- from typing import List, Optional
+ from typing import List, Optional, cast

  import pandas as pd

- from kumoai.experimental.rfm.base import SourceColumn, SourceForeignKey, Table
+ from kumoai.experimental.rfm.base import (
+ DataBackend,
+ SourceColumn,
+ SourceForeignKey,
+ Table,
+ )
  from kumoai.experimental.rfm.infer import infer_dtype

@@ -76,6 +81,10 @@ class LocalTable(Table):
  end_time_column=end_time_column,
  )

+ @property
+ def backend(self) -> DataBackend:
+ return cast(DataBackend, DataBackend.LOCAL)
+
  def _get_source_columns(self) -> List[SourceColumn]:
  source_columns: List[SourceColumn] = []
  for column in self._data.columns:
@@ -94,6 +103,7 @@ class LocalTable(Table):
  dtype=dtype,
  is_primary_key=False,
  is_unique_key=False,
+ is_nullable=True,
  )
  source_columns.append(source_column)

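Review note: the new `backend` property lets callers branch on where a table's data lives. A hedged sketch of such dispatch (`describe` is hypothetical; `DataBackend` members other than `LOCAL` are not confirmed by this diff):

```python
from kumoai.experimental.rfm.base import DataBackend

def describe(table) -> str:
    # `table` is any Table subclass exposing the new `backend` property.
    if table.backend == DataBackend.LOCAL:
        return 'in-memory pandas-backed table'
    return f'table on backend {table.backend}'
```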
kumoai/experimental/rfm/backend/snow/__init__.py CHANGED
@@ -27,9 +27,11 @@ def connect(**kwargs: Any) -> Connection:


  from .table import SnowTable # noqa: E402
+ from .sampler import SnowSampler # noqa: E402

  __all__ = [
  'connect',
  'Connection',
  'SnowTable',
+ 'SnowSampler',
  ]
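Review note: with this re-export, `SnowSampler` becomes importable next to `connect`, `Connection`, and `SnowTable`. A hypothetical usage sketch (connection parameters elided; `SnowSampler` is assumed to mirror `LocalSampler` for Snowflake-backed graphs, per the new backend/snow/sampler.py above):

```python
from kumoai.experimental.rfm.backend.snow import Connection, SnowSampler, connect

conn: Connection = connect()  # real use forwards Snowflake connector kwargs
```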