kumoai 2.14.0.dev202601051732__cp311-cp311-macosx_11_0_arm64.whl → 2.15.0.dev202601141731__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/_version.py +1 -1
- kumoai/client/jobs.py +2 -0
- kumoai/experimental/rfm/backend/snow/sampler.py +138 -28
- kumoai/experimental/rfm/backend/snow/table.py +16 -13
- kumoai/experimental/rfm/backend/sqlite/sampler.py +73 -15
- kumoai/experimental/rfm/base/mapper.py +69 -0
- kumoai/experimental/rfm/base/sampler.py +23 -1
- kumoai/experimental/rfm/base/sql_sampler.py +252 -11
- kumoai/experimental/rfm/base/table.py +15 -29
- kumoai/experimental/rfm/base/utils.py +36 -0
- kumoai/experimental/rfm/graph.py +9 -9
- kumoai/experimental/rfm/infer/dtype.py +3 -1
- kumoai/experimental/rfm/infer/time_col.py +4 -2
- kumoai/experimental/rfm/rfm.py +195 -114
- kumoai/experimental/rfm/task_table.py +2 -0
- kumoai/pquery/training_table.py +16 -2
- kumoai/testing/snow.py +3 -3
- kumoai/utils/display.py +44 -8
- kumoai/utils/progress_logger.py +2 -1
- kumoai/utils/sql.py +2 -2
- {kumoai-2.14.0.dev202601051732.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/METADATA +2 -2
- {kumoai-2.14.0.dev202601051732.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/RECORD +25 -23
- {kumoai-2.14.0.dev202601051732.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/WHEEL +0 -0
- {kumoai-2.14.0.dev202601051732.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.14.0.dev202601051732.dist-info → kumoai-2.15.0.dev202601141731.dist-info}/top_level.txt +0 -0
kumoai/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = '2.14.0.dev202601051732'
+__version__ = '2.15.0.dev202601141731'
kumoai/client/jobs.py
CHANGED
@@ -344,12 +344,14 @@ class GenerateTrainTableJobAPI(CommonJobAPI[GenerateTrainTableRequest,
         id: str,
         source_table_type: SourceTableType,
         train_table_mod: TrainingTableSpec,
+        extensive_validation: bool,
     ) -> ValidationResponse:
         response = self._client._post(
             f'{self._base_endpoint}/{id}/validate_custom_train_table',
             json=to_json_dict({
                 'custom_table': source_table_type,
                 'train_table_mod': train_table_mod,
+                'extensive_validation': extensive_validation,
             }),
         )
         return parse_response(ValidationResponse, response)
kumoai/experimental/rfm/backend/snow/sampler.py
CHANGED
@@ -1,7 +1,8 @@
 import json
+import math
 from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -11,7 +12,7 @@ from kumoapi.pquery import ValidatedPredictiveQuery
 from kumoai.experimental.rfm.backend.snow import Connection, SnowTable
 from kumoai.experimental.rfm.base import SQLSampler, Table
 from kumoai.experimental.rfm.pquery import PQueryPandasExecutor
-from kumoai.utils import ProgressLogger
+from kumoai.utils import ProgressLogger, quote_ident
 
 if TYPE_CHECKING:
     from kumoai.experimental.rfm import Graph
@@ -37,6 +38,15 @@ class SnowSampler(SQLSampler):
         assert isinstance(table, SnowTable)
         self._connection = table._connection
 
+        self._num_rows_dict: dict[str, int] = {
+            table.name: cast(int, table._num_rows)
+            for table in graph.tables.values()
+        }
+
+    @property
+    def num_rows_dict(self) -> dict[str, int]:
+        return self._num_rows_dict
+
     def _get_min_max_time_dict(
         self,
         table_names: list[str],
@@ -45,8 +55,9 @@ class SnowSampler(SQLSampler):
         for table_name in table_names:
             column = self.time_column_dict[table_name]
             column_ref = self.table_column_ref_dict[table_name][column]
+            ident = quote_ident(table_name, char="'")
             select = (f"SELECT\n"
-                      f"
+                      f"  {ident} as table_name,\n"
                       f"  MIN({column_ref}) as min_date,\n"
                       f"  MAX({column_ref}) as max_date\n"
                       f"FROM {self.source_name_dict[table_name]}")
@@ -54,14 +65,13 @@ class SnowSampler(SQLSampler):
         sql = "\nUNION ALL\n".join(selects)
 
         out_dict: dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {}
-        with
-            cursor.execute(sql
-
-
-
-
-
-        )
+        with self._connection.cursor() as cursor:
+            cursor.execute(sql)
+            for table_name, _min, _max in cursor.fetchall():
+                out_dict[table_name] = (
+                    pd.Timestamp.max if _min is None else pd.Timestamp(_min),
+                    pd.Timestamp.min if _max is None else pd.Timestamp(_max),
+                )
 
         return out_dict
 
@@ -144,11 +154,11 @@ class SnowSampler(SQLSampler):
             query.entity_table: np.arange(len(entity_df)),
         }
         for edge_type, (min_offset, max_offset) in time_offset_dict.items():
-            table_name,
+            table_name, foreign_key, _ = edge_type
             feat_dict[table_name], batch_dict[table_name] = self._by_time(
                 table_name=table_name,
-
-
+                foreign_key=foreign_key,
+                index=entity_df[self.primary_key_dict[query.entity_table]],
                 anchor_time=time,
                 min_offset=min_offset,
                 max_offset=max_offset,
@@ -179,7 +189,7 @@ class SnowSampler(SQLSampler):
     def _by_pkey(
         self,
         table_name: str,
-
+        index: pd.Series,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
         key = self.primary_key_dict[table_name]
@@ -189,7 +199,7 @@ class SnowSampler(SQLSampler):
             for column in columns
         ]
 
-        payload = json.dumps(list(
+        payload = json.dumps(list(index))
 
         sql = ("WITH TMP as (\n"
               " SELECT\n"
@@ -206,7 +216,7 @@ class SnowSampler(SQLSampler):
               f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
              f"{', '.join(projections)}\n"
               f"FROM TMP\n"
-              f"JOIN {self.source_name_dict[table_name]}
+              f"JOIN {self.source_name_dict[table_name]}\n"
               f" ON {key_ref} = TMP.__KUMO_ID__")
 
         with paramstyle(self._connection), self._connection.cursor() as cursor:
@@ -228,13 +238,108 @@ class SnowSampler(SQLSampler):
             stype_dict=self.table_stype_dict[table_name],
         ), batch
 
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict.get(table_name)
+
+        end_time: pd.Series | None = None
+        start_time: pd.Series | None = None
+        if time_column is not None and anchor_time is not None:
+            # In order to avoid a full table scan, we limit foreign key
+            # sampling to a certain time range, approximated by the number of
+            # rows, timestamp ranges and `num_neighbors` value.
+            # Downstream, this helps Snowflake to apply partition pruning:
+            dst_table_name = [
+                dst_table
+                for key, dst_table in self.foreign_key_dict[table_name]
+                if key == foreign_key
+            ][0]
+            num_facts = self.num_rows_dict[table_name]
+            num_entities = self.num_rows_dict[dst_table_name]
+            min_time = self.get_min_time([table_name])
+            max_time = self.get_max_time([table_name])
+            freq = num_facts / num_entities
+            freq = freq / max((max_time - min_time).total_seconds(), 1)
+            offset = pd.Timedelta(seconds=math.ceil(5 * num_neighbors / freq))
+
+            end_time = anchor_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+            start_time = anchor_time - offset
+            start_time = start_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+            payload = json.dumps(list(zip(index, end_time, start_time)))
+        else:
+            payload = json.dumps(list(zip(index)))
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+
+        sql = ("WITH TMP as (\n"
+               " SELECT\n"
+               " f.index as __KUMO_BATCH__,\n")
+        if self.table_dtype_dict[table_name][foreign_key].is_int():
+            sql += " f.value[0]::NUMBER as __KUMO_ID__"
+        elif self.table_dtype_dict[table_name][foreign_key].is_float():
+            sql += " f.value[0]::FLOAT as __KUMO_ID__"
+        else:
+            sql += " f.value[0]::VARCHAR as __KUMO_ID__"
+        if end_time is not None and start_time is not None:
+            sql += (",\n"
+                    " f.value[1]::TIMESTAMP_NTZ as __KUMO_END_TIME__,\n"
+                    " f.value[2]::TIMESTAMP_NTZ as __KUMO_START_TIME__")
+        sql += (f"\n"
+                f" FROM TABLE(FLATTEN(INPUT => PARSE_JSON(?))) f\n"
+                f")\n"
+                f"SELECT "
+                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
+                f"{', '.join(projections)}\n"
+                f"FROM TMP\n"
+                f"JOIN {self.source_name_dict[table_name]}\n"
+                f" ON {key_ref} = TMP.__KUMO_ID__\n")
+        if end_time is not None and start_time is not None:
+            assert time_column is not None
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += (f" AND {time_ref} <= TMP.__KUMO_END_TIME__\n"
+                    f" AND {time_ref} > TMP.__KUMO_START_TIME__\n"
+                    f"WHERE {time_ref} <= '{end_time.max()}'\n"
+                    f" AND {time_ref} > '{start_time.min()}'\n")
+        sql += ("QUALIFY ROW_NUMBER() OVER (\n"
+                " PARTITION BY TMP.__KUMO_BATCH__\n")
+        if time_column is not None:
+            sql += f" ORDER BY {time_ref} DESC\n"
+        else:
+            sql += f" ORDER BY {key_ref}\n"
+        sql += f") <= {num_neighbors}"
+
+        with paramstyle(self._connection), self._connection.cursor() as cursor:
+            cursor.execute(sql, (payload, ))
+            table = cursor.fetch_arrow_all()
+
+        batch = table['__KUMO_BATCH__'].cast(pa.int64()).to_numpy()
+        batch_index = table.schema.get_field_index('__KUMO_BATCH__')
+        table = table.remove_column(batch_index)
+
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
+
     # Helper Methods ##########################################################
 
     def _by_time(
         self,
         table_name: str,
-
-
+        foreign_key: str,
+        index: pd.Series,
         anchor_time: pd.Series,
         min_offset: pd.DateOffset | None,
         max_offset: pd.DateOffset,
@@ -244,14 +349,15 @@ class SnowSampler(SQLSampler):
 
         end_time = anchor_time + max_offset
         end_time = end_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+        start_time: pd.Series | None = None
         if min_offset is not None:
             start_time = anchor_time + min_offset
             start_time = start_time.dt.strftime("%Y-%m-%d %H:%M:%S")
-            payload = json.dumps(list(zip(
+            payload = json.dumps(list(zip(index, end_time, start_time)))
         else:
-            payload = json.dumps(list(zip(
+            payload = json.dumps(list(zip(index, end_time)))
 
-        key_ref = self.table_column_ref_dict[table_name][
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
         time_ref = self.table_column_ref_dict[table_name][time_column]
         projections = [
             self.table_column_proj_dict[table_name][column]
@@ -260,9 +366,9 @@ class SnowSampler(SQLSampler):
         sql = ("WITH TMP as (\n"
                " SELECT\n"
                " f.index as __KUMO_BATCH__,\n")
-        if self.table_dtype_dict[table_name][
+        if self.table_dtype_dict[table_name][foreign_key].is_int():
             sql += " f.value[0]::NUMBER as __KUMO_ID__,\n"
-        elif self.table_dtype_dict[table_name][
+        elif self.table_dtype_dict[table_name][foreign_key].is_float():
             sql += " f.value[0]::FLOAT as __KUMO_ID__,\n"
         else:
             sql += " f.value[0]::VARCHAR as __KUMO_ID__,\n"
@@ -276,11 +382,15 @@ class SnowSampler(SQLSampler):
                f"TMP.__KUMO_BATCH__ as __KUMO_BATCH__, "
                f"{', '.join(projections)}\n"
                f"FROM TMP\n"
-               f"JOIN {self.source_name_dict[table_name]}
+               f"JOIN {self.source_name_dict[table_name]}\n"
                f" ON {key_ref} = TMP.__KUMO_ID__\n"
-               f" AND {time_ref} <= TMP.__KUMO_END_TIME__")
-        if
-            sql += f"
+               f" AND {time_ref} <= TMP.__KUMO_END_TIME__\n")
+        if start_time is not None:
+            sql += f"AND {time_ref} > TMP.__KUMO_START_TIME__\n"
+        # Add global time bounds to enable partition pruning:
+        sql += f"WHERE {time_ref} <= '{end_time.max()}'"
+        if start_time is not None:
+            sql += f"\nAND {time_ref} > '{start_time.min()}'"
 
         with paramstyle(self._connection), self._connection.cursor() as cursor:
             cursor.execute(sql, (payload, ))
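The new Snowflake `_by_fkey` sampler bounds the scanned time range with a heuristic: it estimates how many facts each entity produces per second of covered history and looks back just far enough to expect roughly `5 * num_neighbors` rows per entity, which it then repeats as global `WHERE` bounds so Snowflake can prune partitions. A minimal, self-contained sketch of that arithmetic (not part of the package; all numbers below are hypothetical):

import math

import pandas as pd

num_facts = 10_000_000    # rows in the fact table (hypothetical)
num_entities = 50_000     # rows in the referenced entity table (hypothetical)
min_time = pd.Timestamp('2020-01-01')
max_time = pd.Timestamp('2021-01-01')
num_neighbors = 16

# Average number of facts per entity, per second of covered history:
freq = num_facts / num_entities
freq = freq / max((max_time - min_time).total_seconds(), 1)

# Look back far enough to expect roughly 5x `num_neighbors` facts per entity:
offset = pd.Timedelta(seconds=math.ceil(5 * num_neighbors / freq))
print(offset)  # ~146 days for the numbers above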
kumoai/experimental/rfm/backend/snow/table.py
CHANGED
@@ -76,21 +76,13 @@ class SnowTable(Table):
 
     @property
     def source_name(self) -> str:
-        names
-
-        names.append(self._database)
-        if self._schema is not None:
-            names.append(self._schema)
-        return '.'.join(names + [self._source_name])
+        names = [self._database, self._schema, self._source_name]
+        return '.'.join(names)
 
     @property
     def _quoted_source_name(self) -> str:
-        names
-
-        names.append(quote_ident(self._database))
-        if self._schema is not None:
-            names.append(quote_ident(self._schema))
-        return '.'.join(names + [quote_ident(self._source_name)])
+        names = [self._database, self._schema, self._source_name]
+        return '.'.join([quote_ident(name) for name in names])
 
     @property
     def backend(self) -> DataBackend:
@@ -159,7 +151,18 @@ class SnowTable(Table):
         )
 
     def _get_num_rows(self) -> int | None:
-
+        with self._connection.cursor() as cursor:
+            quoted_source_name = quote_ident(self._source_name, char="'")
+            sql = (f"SHOW TABLES LIKE {quoted_source_name} "
+                   f"IN SCHEMA {quote_ident(self._database)}."
+                   f"{quote_ident(self._schema)}")
+            cursor.execute(sql)
+            num_rows = cursor.fetchone()[7]
+
+        if num_rows == 0:
+            raise RuntimeError("Table '{self.source_name}' is empty")
+
+        return num_rows
 
     def _get_expr_sample_df(
         self,
kumoai/experimental/rfm/backend/sqlite/sampler.py
CHANGED
@@ -121,8 +121,9 @@ class SQLiteSampler(SQLSampler):
         for table_name in table_names:
             column = self.time_column_dict[table_name]
             column_ref = self.table_column_ref_dict[table_name][column]
+            ident = quote_ident(table_name, char="'")
             select = (f"SELECT\n"
-                      f"
+                      f"  {ident} as table_name,\n"
                       f"  MIN({column_ref}) as min_date,\n"
                       f"  MAX({column_ref}) as max_date\n"
                       f"FROM {self.source_name_dict[table_name]}")
@@ -131,12 +132,13 @@ class SQLiteSampler(SQLSampler):
 
         out_dict: dict[str, tuple[pd.Timestamp, pd.Timestamp]] = {}
         with self._connection.cursor() as cursor:
-            cursor.execute(sql
+            cursor.execute(sql)
             for table_name, _min, _max in cursor.fetchall():
                 out_dict[table_name] = (
                     pd.Timestamp.max if _min is None else pd.Timestamp(_min),
                     pd.Timestamp.min if _max is None else pd.Timestamp(_max),
                 )
+
         return out_dict
 
     def _sample_entity_table(
@@ -226,7 +228,7 @@ class SQLiteSampler(SQLSampler):
     def _by_pkey(
        self,
         table_name: str,
-
+        index: pd.Series,
         columns: set[str],
     ) -> tuple[pd.DataFrame, np.ndarray]:
         source_table = self.source_table_dict[table_name]
@@ -237,7 +239,7 @@ class SQLiteSampler(SQLSampler):
             for column in columns
         ]
 
-        tmp = pa.table([pa.array(
+        tmp = pa.table([pa.array(index)], names=['__kumo_id__'])
         tmp_name = f'tmp_{table_name}_{key}_{id(tmp)}'
 
         sql = (f"SELECT "
@@ -245,7 +247,6 @@ class SQLiteSampler(SQLSampler):
                f"{', '.join(projections)}\n"
                f"FROM {quote_ident(tmp_name)} tmp\n"
                f"JOIN {self.source_name_dict[table_name]} ent\n")
-
         if key in source_table and source_table[key].is_unique_key:
             sql += (f" ON {key_ref} = tmp.__kumo_id__")
         else:
@@ -271,13 +272,70 @@ class SQLiteSampler(SQLSampler):
             stype_dict=self.table_stype_dict[table_name],
         ), batch
 
+    def _by_fkey(
+        self,
+        table_name: str,
+        foreign_key: str,
+        index: pd.Series,
+        num_neighbors: int,
+        anchor_time: pd.Series | None,
+        columns: set[str],
+    ) -> tuple[pd.DataFrame, np.ndarray]:
+        time_column = self.time_column_dict.get(table_name)
+
+        # NOTE SQLite does not have a native datetime format. Currently, we
+        # assume timestamps are given as `TEXT` in `ISO-8601 UTC`:
+        tmp = pa.table([pa.array(index)], names=['__kumo_id__'])
+        if time_column is not None and anchor_time is not None:
+            anchor_time = anchor_time.dt.strftime("%Y-%m-%d %H:%M:%S")
+            tmp = tmp.append_column('__kumo_time__', pa.array(anchor_time))
+        tmp_name = f'tmp_{table_name}_{foreign_key}_{id(tmp)}'
+
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
+        projections = [
+            self.table_column_proj_dict[table_name][column]
+            for column in columns
+        ]
+        sql = (f"SELECT "
+               f"tmp.rowid - 1 as __kumo_batch__, "
+               f"{', '.join(projections)}\n"
+               f"FROM {quote_ident(tmp_name)} tmp\n"
+               f"JOIN {self.source_name_dict[table_name]} fact\n"
+               f"ON fact.rowid IN (\n"
+               f"  SELECT rowid\n"
+               f"  FROM {self.source_name_dict[table_name]}\n"
+               f"  WHERE {key_ref} = tmp.__kumo_id__\n")
+        if time_column is not None and anchor_time is not None:
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += f"  AND {time_ref} <= tmp.__kumo_time__\n"
+        if time_column is not None:
+            time_ref = self.table_column_ref_dict[table_name][time_column]
+            sql += f"  ORDER BY {time_ref} DESC\n"
+        sql += (f"  LIMIT {num_neighbors}\n"
+                f")")
+
+        with self._connection.cursor() as cursor:
+            cursor.adbc_ingest(tmp_name, tmp, mode='replace')
+            cursor.execute(sql)
+            table = cursor.fetch_arrow_table()
+
+        batch = table['__kumo_batch__'].to_numpy()
+        batch_index = table.schema.get_field_index('__kumo_batch__')
+        table = table.remove_column(batch_index)
+
+        return Table._sanitize(
+            df=table.to_pandas(),
+            dtype_dict=self.table_dtype_dict[table_name],
+            stype_dict=self.table_stype_dict[table_name],
+        ), batch
+
     # Helper Methods ##########################################################
 
     def _by_time(
         self,
         table_name: str,
-
-
+        foreign_key: str,
+        index: pd.Series,
         anchor_time: pd.Series,
         min_offset: pd.DateOffset | None,
         max_offset: pd.DateOffset,
@@ -287,7 +345,7 @@ class SQLiteSampler(SQLSampler):
 
         # NOTE SQLite does not have a native datetime format. Currently, we
         # assume timestamps are given as `TEXT` in `ISO-8601 UTC`:
-        tmp = pa.table([pa.array(
+        tmp = pa.table([pa.array(index)], names=['__kumo_id__'])
         end_time = anchor_time + max_offset
         end_time = end_time.dt.strftime("%Y-%m-%d %H:%M:%S")
         tmp = tmp.append_column('__kumo_end__', pa.array(end_time))
@@ -295,9 +353,9 @@ class SQLiteSampler(SQLSampler):
             start_time = anchor_time + min_offset
             start_time = start_time.dt.strftime("%Y-%m-%d %H:%M:%S")
             tmp = tmp.append_column('__kumo_start__', pa.array(start_time))
-        tmp_name = f'tmp_{table_name}_{
+        tmp_name = f'tmp_{table_name}_{foreign_key}_{id(tmp)}'
 
-        key_ref = self.table_column_ref_dict[table_name][
+        key_ref = self.table_column_ref_dict[table_name][foreign_key]
         time_ref = self.table_column_ref_dict[table_name][time_column]
         projections = [
             self.table_column_proj_dict[table_name][column]
@@ -307,7 +365,7 @@ class SQLiteSampler(SQLSampler):
                f"tmp.rowid - 1 as __kumo_batch__, "
                f"{', '.join(projections)}\n"
                f"FROM {quote_ident(tmp_name)} tmp\n"
-               f"JOIN {self.source_name_dict[table_name]}
+               f"JOIN {self.source_name_dict[table_name]}\n"
               f" ON {key_ref} = tmp.__kumo_id__\n"
               f" AND {time_ref} <= tmp.__kumo_end__")
         if min_offset is not None:
@@ -359,11 +417,11 @@ class SQLiteSampler(SQLSampler):
             query.entity_table: np.arange(len(df)),
         }
         for edge_type, (_min, _max) in time_offset_dict.items():
-            table_name,
+            table_name, foreign_key, _ = edge_type
             feat_dict[table_name], batch_dict[table_name] = self._by_time(
                 table_name=table_name,
-
-
+                foreign_key=foreign_key,
+                index=df[self.primary_key_dict[query.entity_table]],
                 anchor_time=time,
                 min_offset=_min,
                 max_offset=_max,
@@ -378,7 +436,7 @@ class SQLiteSampler(SQLSampler):
             feat_dict=feat_dict,
             time_dict=time_dict,
             batch_dict=batch_dict,
-            anchor_time=
+            anchor_time=time,
             num_forecasts=query.num_forecasts,
         )
         ys.append(y)
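In the SQLite backend, `_by_fkey` caps the number of joined rows per entity with a correlated `rowid IN (... ORDER BY ... LIMIT num_neighbors)` subquery instead of a window function. A minimal sqlite3 sketch of that pattern (not part of the package; table and column names are hypothetical):

import sqlite3

con = sqlite3.connect(':memory:')
con.execute("CREATE TABLE fact (user_id INTEGER, ts TEXT)")
con.executemany("INSERT INTO fact VALUES (?, ?)", [
    (1, '2024-01-01 00:00:00'),
    (1, '2024-01-02 00:00:00'),
    (1, '2024-01-03 00:00:00'),
    (2, '2024-01-01 12:00:00'),
])
con.execute("CREATE TABLE tmp (__kumo_id__ INTEGER)")
con.executemany("INSERT INTO tmp VALUES (?)", [(1, ), (2, )])

num_neighbors = 2
rows = con.execute(
    "SELECT tmp.rowid - 1 AS batch, fact.user_id, fact.ts "
    "FROM tmp "
    "JOIN fact ON fact.rowid IN ("
    "  SELECT rowid FROM fact"
    "  WHERE fact.user_id = tmp.__kumo_id__"
    "  ORDER BY fact.ts DESC"
    f"  LIMIT {num_neighbors}"
    ")").fetchall()
# batch 0 joins the two latest facts of user 1; batch 1 joins the single fact of user 2:
print(rows)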
kumoai/experimental/rfm/base/mapper.py
ADDED
@@ -0,0 +1,69 @@
+import numpy as np
+import pandas as pd
+
+
+class Mapper:
+    r"""A mapper to map ``(pkey, batch)`` pairs to contiguous node IDs.
+
+    Args:
+        num_examples: The maximum number of examples to add/retrieve.
+    """
+    def __init__(self, num_examples: int):
+        self._pkey_dtype: pd.CategoricalDtype | None = None
+        self._indices: list[np.ndarray] = []
+        self._index_dtype: pd.CategoricalDtype | None = None
+        self._num_examples = num_examples
+
+    def add(self, pkey: pd.Series, batch: np.ndarray) -> None:
+        r"""Adds a set of ``(pkey, batch)`` pairs to the mapper.
+
+        Args:
+            pkey: The primary keys.
+            batch: The batch vector.
+        """
+        if self._pkey_dtype is not None:
+            category = np.concatenate([
+                self._pkey_dtype.categories.values,
+                pkey,
+            ], axis=0)
+            category = pd.unique(category)
+            self._pkey_dtype = pd.CategoricalDtype(category)
+        elif pd.api.types.is_string_dtype(pkey):
+            category = pd.unique(pkey)
+            self._pkey_dtype = pd.CategoricalDtype(category)
+
+        if self._pkey_dtype is not None:
+            index = pd.Categorical(pkey, dtype=self._pkey_dtype).codes
+            index = index.astype('int64')
+        else:
+            index = pkey.to_numpy()
+        index = self._num_examples * index + batch
+        self._indices.append(index)
+        self._index_dtype = None
+
+    def get(self, pkey: pd.Series, batch: np.ndarray) -> np.ndarray:
+        r"""Retrieves the node IDs for a set of ``(pkey, batch)`` pairs.
+
+        Returns ``-1`` for any pair not registered in the mapping.
+
+        Args:
+            pkey: The primary keys.
+            batch: The batch vector.
+        """
+        if len(self._indices) == 0:
+            return np.full(len(pkey), -1, dtype=np.int64)
+
+        if self._index_dtype is None:  # Lazy build index:
+            category = pd.unique(np.concatenate(self._indices))
+            self._index_dtype = pd.CategoricalDtype(category)
+
+        if self._pkey_dtype is not None:
+            index = pd.Categorical(pkey, dtype=self._pkey_dtype).codes
+            index = index.astype('int64')
+        else:
+            index = pkey.to_numpy()
+        index = self._num_examples * index + batch
+
+        out = pd.Categorical(index, dtype=self._index_dtype).codes
+        out = out.astype('int64')
+        return out
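The new `Mapper` encodes each ``(pkey, batch)`` pair as ``num_examples * pkey_code + batch`` and resolves lookups through pandas categoricals. A small usage sketch (not part of the package; the import path is assumed from the new file location and the inputs are hypothetical):

import numpy as np
import pandas as pd

from kumoai.experimental.rfm.base.mapper import Mapper  # assumed import path

mapper = Mapper(num_examples=4)
mapper.add(pkey=pd.Series(['a', 'b', 'c']), batch=np.array([0, 0, 1]))

# Previously added pairs resolve to contiguous IDs; unseen pairs resolve to -1:
out = mapper.get(pkey=pd.Series(['a', 'c', 'c']), batch=np.array([0, 1, 3]))
print(out)  # [ 0  2 -1]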
kumoai/experimental/rfm/base/sampler.py
CHANGED
@@ -59,6 +59,17 @@ class Sampler(ABC):
             self._edge_types.append(edge_type)
             self._edge_types.append(Subgraph.rev_edge_type(edge_type))
 
+        # Source Table -> [(Foreign Key, Destination Table)]
+        self._foreign_key_dict: dict[str, list[tuple[str, str]]] = {}
+        # Destination Table -> [(Source Table, Foreign Key)]
+        self._rev_foreign_key_dict: dict[str, list[tuple[str, str]]] = {}
+        for table in graph.tables.values():
+            self._foreign_key_dict[table.name] = []
+            self._rev_foreign_key_dict[table.name] = []
+        for src_table, fkey, dst_table in graph.edges:
+            self._foreign_key_dict[src_table].append((fkey, dst_table))
+            self._rev_foreign_key_dict[dst_table].append((src_table, fkey))
+
         self._primary_key_dict: dict[str, str] = {
             table.name: table._primary_key
             for table in graph.tables.values()
@@ -98,6 +109,16 @@ class Sampler(ABC):
         r"""All available edge types in the graph."""
         return self._edge_types
 
+    @property
+    def foreign_key_dict(self) -> dict[str, list[tuple[str, str]]]:
+        r"""The foreign keys for all tables in the graph."""
+        return self._foreign_key_dict
+
+    @property
+    def rev_foreign_key_dict(self) -> dict[str, list[tuple[str, str]]]:
+        r"""The foreign key back references for all tables in the graph."""
+        return self._rev_foreign_key_dict
+
     @property
     def primary_key_dict(self) -> dict[str, str]:
         r"""All available primary keys in the graph."""
@@ -274,7 +295,8 @@ class Sampler(ABC):
 
         # Store in compressed representation if more efficient:
         num_cols = subgraph.table_dict[edge_type[2]].num_rows
-        if col is not None and len(col) > num_cols + 1
+        if (col is not None and len(col) > num_cols + 1
+                and ((col[1:] - col[:-1]) >= 0).all()):
             layout = EdgeLayout.CSC
             colcount = np.bincount(col, minlength=num_cols)
             col = np.empty(num_cols + 1, dtype=col.dtype)