kumoai 2.14.0.dev202512181731__cp312-cp312-macosx_11_0_arm64.whl → 2.14.0.dev202601041732__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kumoai/__init__.py +23 -26
  2. kumoai/_version.py +1 -1
  3. kumoai/client/client.py +6 -0
  4. kumoai/client/jobs.py +24 -0
  5. kumoai/connector/utils.py +21 -7
  6. kumoai/experimental/rfm/__init__.py +24 -22
  7. kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
  8. kumoai/experimental/rfm/backend/local/sampler.py +0 -3
  9. kumoai/experimental/rfm/backend/local/table.py +25 -24
  10. kumoai/experimental/rfm/backend/snow/sampler.py +106 -61
  11. kumoai/experimental/rfm/backend/snow/table.py +146 -51
  12. kumoai/experimental/rfm/backend/sqlite/sampler.py +127 -78
  13. kumoai/experimental/rfm/backend/sqlite/table.py +94 -47
  14. kumoai/experimental/rfm/base/__init__.py +6 -7
  15. kumoai/experimental/rfm/base/column.py +97 -5
  16. kumoai/experimental/rfm/base/expression.py +44 -0
  17. kumoai/experimental/rfm/base/sampler.py +5 -17
  18. kumoai/experimental/rfm/base/source.py +1 -1
  19. kumoai/experimental/rfm/base/sql_sampler.py +68 -9
  20. kumoai/experimental/rfm/base/table.py +291 -126
  21. kumoai/experimental/rfm/graph.py +139 -86
  22. kumoai/experimental/rfm/infer/__init__.py +6 -4
  23. kumoai/experimental/rfm/infer/dtype.py +6 -1
  24. kumoai/experimental/rfm/infer/multicategorical.py +1 -1
  25. kumoai/experimental/rfm/infer/stype.py +35 -0
  26. kumoai/experimental/rfm/relbench.py +76 -0
  27. kumoai/experimental/rfm/rfm.py +30 -42
  28. kumoai/experimental/rfm/task_table.py +247 -0
  29. kumoai/trainer/distilled_trainer.py +175 -0
  30. kumoai/utils/display.py +51 -0
  31. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202601041732.dist-info}/METADATA +1 -1
  32. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202601041732.dist-info}/RECORD +35 -31
  33. kumoai/experimental/rfm/base/column_expression.py +0 -16
  34. kumoai/experimental/rfm/base/sql_table.py +0 -113
  35. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202601041732.dist-info}/WHEEL +0 -0
  36. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202601041732.dist-info}/licenses/LICENSE +0 -0
  37. {kumoai-2.14.0.dev202512181731.dist-info → kumoai-2.14.0.dev202601041732.dist-info}/top_level.txt +0 -0
kumoai/experimental/rfm/graph.py
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  import contextlib
  import copy
  import io
@@ -16,9 +18,10 @@ from kumoapi.typing import Stype
  from typing_extensions import Self

  from kumoai import in_notebook, in_snowflake_notebook
- from kumoai.experimental.rfm.base import DataBackend, SQLTable, Table
+ from kumoai.experimental.rfm.base import ColumnSpec, DataBackend, Table
  from kumoai.graph import Edge
  from kumoai.mixin import CastMixin
+ from kumoai.utils import display

  if TYPE_CHECKING:
      import graphviz
@@ -98,24 +101,25 @@ class Graph:
          for table in tables:
              self.add_table(table)

-         for table in tables:
-             if not isinstance(table, SQLTable):
+         for table in tables:  # Use links from source metadata:
+             if not any(column.is_source for column in table.columns):
                  continue
              for fkey in table._source_foreign_key_dict.values():
                  if fkey.name not in table:
                      continue
-                 # TODO Skip for non-physical table[fkey.name].
+                 if not table[fkey.name].is_source:
+                     continue
                  dst_table_names = [
                      table.name for table in self.tables.values()
-                     if isinstance(table, SQLTable)
-                     and table._source_name == fkey.dst_table
+                     if table.source_name == fkey.dst_table
                  ]
                  if len(dst_table_names) != 1:
                      continue
                  dst_table = self[dst_table_names[0]]
                  if dst_table._primary_key != fkey.primary_key:
                      continue
-                 # TODO Skip for non-physical dst_table.primary_key.
+                 if not dst_table[fkey.primary_key].is_source:
+                     continue
                  self.link(table.name, fkey.name, dst_table.name)

          for edge in (edges or []):
@@ -418,6 +422,7 @@ class Graph:
          graph = cls(tables=[])

          msgs = []
+         table_names = {table_cfg['name'] for table_cfg in cfg['tables']}
          for table_cfg in cfg['tables']:
              table_name = table_cfg['name']
              source_table_name = table_cfg['base_table']['table']
@@ -434,14 +439,47 @@ class Graph:
                              f"'{table_name}' since composite primary keys "
                              f"are not yet supported")

-             columns: list[str] = []
+             columns: list[ColumnSpec] = []
+             unsupported_columns: list[str] = []
              for column_cfg in chain(
                      table_cfg.get('dimensions', []),
                      table_cfg.get('time_dimensions', []),
                      table_cfg.get('facts', []),
              ):
-                 # TODO Add support for derived columns.
-                 columns.append(column_cfg['name'])
+                 column_name = column_cfg['name']
+                 column_expr = column_cfg.get('expr', None)
+                 column_data_type = column_cfg.get('data_type', None)
+
+                 if column_expr is None:
+                     columns.append(ColumnSpec(name=column_name))
+                     continue
+
+                 column_expr = column_expr.replace(f'{table_name}.', '')
+
+                 if column_expr == column_name:
+                     columns.append(ColumnSpec(name=column_name))
+                     continue
+
+                 # Drop expressions that reference other tables (for now):
+                 if any(f'{name}.' in column_expr for name in table_names):
+                     unsupported_columns.append(column_name)
+                     continue
+
+                 column = ColumnSpec(
+                     name=column_name,
+                     expr=column_expr,
+                     dtype=SnowTable._to_dtype(column_data_type),
+                 )
+                 columns.append(column)
+
+             if len(unsupported_columns) == 1:
+                 msgs.append(f"Failed to add column '{unsupported_columns[0]}' "
+                             f"of table '{table_name}' since its expression "
+                             f"references other tables")
+             elif len(unsupported_columns) > 1:
+                 msgs.append(f"Failed to add columns '{unsupported_columns}' "
+                             f"of table '{table_name}' since their expressions "
+                             f"reference other tables")

              table = SnowTable(
                  connection,
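
Note: to make the new derived-column branch concrete, here is a hypothetical semantic-view column entry and the spec it produces. Table and column names are invented for illustration; the ColumnSpec keyword signature is inferred from this hunk only:

    # A derived dimension on a hypothetical 'ORDERS' table:
    column_cfg = {
        'name': 'NET_AMOUNT',
        'expr': 'ORDERS.GROSS_AMOUNT - ORDERS.TAX_AMOUNT',
        'data_type': 'NUMBER',
    }
    # Stripping the own-table prefix leaves 'GROSS_AMOUNT - TAX_AMOUNT',
    # which references no other table, so the column is kept as:
    #     ColumnSpec(name='NET_AMOUNT', expr='GROSS_AMOUNT - TAX_AMOUNT',
    #                dtype=SnowTable._to_dtype('NUMBER'))
    # An expr such as 'CUSTOMERS.SEGMENT' would instead land in
    # unsupported_columns and be reported via msgs.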
@@ -501,6 +539,35 @@ class Graph:

          return graph

+     @classmethod
+     def from_relbench(
+         cls,
+         dataset: str,
+         verbose: bool = True,
+     ) -> Graph:
+         r"""Loads a `RelBench <https://relbench.stanford.edu>`_ dataset into a
+         :class:`Graph` instance.
+
+         .. code-block:: python
+
+             >>> # doctest: +SKIP
+             >>> import kumoai.experimental.rfm as rfm
+
+             >>> graph = rfm.Graph.from_relbench("f1")
+
+         Args:
+             dataset: The RelBench dataset name.
+             verbose: Whether to print verbose output.
+         """
+         from kumoai.experimental.rfm.relbench import from_relbench
+         graph = from_relbench(dataset, verbose=verbose)
+
+         if verbose:
+             graph.print_metadata()
+             graph.print_links()
+
+         return graph
+
      # Backend #################################################################

      @property
@@ -612,24 +679,8 @@ class Graph:

      def print_metadata(self) -> None:
          r"""Prints the :meth:`~Graph.metadata` of the graph."""
-         if in_snowflake_notebook():
-             import streamlit as st
-             st.markdown("### 🗂️ Graph Metadata")
-             st.dataframe(self.metadata, hide_index=True)
-         elif in_notebook():
-             from IPython.display import Markdown, display
-             display(Markdown("### 🗂️ Graph Metadata"))
-             df = self.metadata
-             try:
-                 if hasattr(df.style, 'hide'):
-                     display(df.style.hide(axis='index'))  # pandas=2
-                 else:
-                     display(df.style.hide_index())  # pandas<1.3
-             except ImportError:
-                 print(df.to_string(index=False))  # missing jinja2
-         else:
-             print("🗂️ Graph Metadata:")
-             print(self.metadata.to_string(index=False))
+         display.title("🗂️ Graph Metadata")
+         display.dataframe(self.metadata)

      def infer_metadata(self, verbose: bool = True) -> Self:
          r"""Infers metadata for all tables in the graph.
@@ -658,40 +709,21 @@ class Graph:

      def print_links(self) -> None:
          r"""Prints the :meth:`~Graph.edges` of the graph."""
-         edges = [(edge.dst_table, self[edge.dst_table]._primary_key,
-                   edge.src_table, edge.fkey) for edge in self.edges]
-         edges = sorted(edges)
-
-         if in_snowflake_notebook():
-             import streamlit as st
-             st.markdown("### 🕸️ Graph Links (FK ↔️ PK)")
-             if len(edges) > 0:
-                 st.markdown('\n'.join([
-                     f"- {edge[2]}.{edge[3]} ↔️ {edge[0]}.{edge[1]}"
-                     for edge in edges
-                 ]))
-             else:
-                 st.markdown("*No links registered*")
-         elif in_notebook():
-             from IPython.display import Markdown, display
-             display(Markdown("### 🕸️ Graph Links (FK ↔️ PK)"))
-             if len(edges) > 0:
-                 display(
-                     Markdown('\n'.join([
-                         f"- `{edge[2]}.{edge[3]}` ↔️ `{edge[0]}.{edge[1]}`"
-                         for edge in edges
-                     ])))
-             else:
-                 display(Markdown("*No links registered*"))
+         edges = sorted([(
+             edge.dst_table,
+             self[edge.dst_table]._primary_key,
+             edge.src_table,
+             edge.fkey,
+         ) for edge in self.edges])
+
+         display.title("🕸️ Graph Links (FK ↔️ PK)")
+         if len(edges) > 0:
+             display.unordered_list(items=[
+                 f"`{edge[2]}.{edge[3]}` ↔️ `{edge[0]}.{edge[1]}`"
+                 for edge in edges
+             ])
          else:
-             print("🕸️ Graph Links (FK ↔️ PK):")
-             if len(edges) > 0:
-                 print('\n'.join([
-                     f"• {edge[2]}.{edge[3]} ↔️ {edge[0]}.{edge[1]}"
-                     for edge in edges
-                 ]))
-             else:
-                 print("No links registered")
+             display.italic("No links registered")

      def link(
          self,
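
Note: the new kumoai/utils/display.py module (+51 lines) is not expanded in this diff. Below is a minimal sketch of what its helpers could look like, reconstructed from the call sites above and from the Streamlit/notebook/stdout branches they replace. The function names come from the call sites; the bodies are assumptions, not the actual implementation:

    import pandas as pd

    from kumoai import in_notebook, in_snowflake_notebook


    def title(text: str) -> None:
        if in_snowflake_notebook():
            import streamlit as st
            st.markdown(f"### {text}")
        elif in_notebook():
            from IPython.display import Markdown, display
            display(Markdown(f"### {text}"))
        else:
            print(f"{text}:")


    def dataframe(df: pd.DataFrame) -> None:
        if in_snowflake_notebook():
            import streamlit as st
            st.dataframe(df, hide_index=True)
        elif in_notebook():
            from IPython.display import display
            try:
                display(df.style.hide(axis='index'))
            except ImportError:  # missing jinja2
                print(df.to_string(index=False))
        else:
            print(df.to_string(index=False))

unordered_list, italic, and message (used in rfm.py below) would dispatch on the environment in the same way.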
@@ -798,6 +830,30 @@ class Graph:
          """
          known_edges = {(edge.src_table, edge.fkey) for edge in self.edges}

+         for table in self.tables.values():  # Use links from source metadata:
+             if not any(column.is_source for column in table.columns):
+                 continue
+             for fkey in table._source_foreign_key_dict.values():
+                 if fkey.name not in table:
+                     continue
+                 if not table[fkey.name].is_source:
+                     continue
+                 if (table.name, fkey.name) in known_edges:
+                     continue
+                 dst_table_names = [
+                     table.name for table in self.tables.values()
+                     if table.source_name == fkey.dst_table
+                 ]
+                 if len(dst_table_names) != 1:
+                     continue
+                 dst_table = self[dst_table_names[0]]
+                 if dst_table._primary_key != fkey.primary_key:
+                     continue
+                 if not dst_table[fkey.primary_key].is_source:
+                     continue
+                 self.link(table.name, fkey.name, dst_table.name)
+                 known_edges.add((table.name, fkey.name))
+
          # A list of primary key candidates (+score) for every column:
          candidate_dict: dict[
              tuple[str, str],
@@ -897,13 +953,8 @@ class Graph:
              if score < 5.0:
                  continue

-             candidate_dict[(
-                 src_table.name,
-                 src_key.name,
-             )].append((
-                 dst_table.name,
-                 score,
-             ))
+             candidate_dict[(src_table.name, src_key.name)].append(
+                 (dst_table.name, score))

          for (src_table_name, src_key_name), scores in candidate_dict.items():
              scores.sort(key=lambda x: x[-1], reverse=True)
@@ -962,24 +1013,26 @@ class Graph:
                               f"either the primary key or the link before "
                               f"before proceeding.")

-         # Check that fkey/pkey have valid and consistent data types:
-         assert src_key.dtype is not None
-         src_number = src_key.dtype.is_int() or src_key.dtype.is_float()
-         src_string = src_key.dtype.is_string()
-         assert dst_key.dtype is not None
-         dst_number = dst_key.dtype.is_int() or dst_key.dtype.is_float()
-         dst_string = dst_key.dtype.is_string()
-
-         if not src_number and not src_string:
-             raise ValueError(f"{edge} is invalid as foreign key must be a "
-                              f"number or string (got '{src_key.dtype}'")
-
-         if src_number != dst_number or src_string != dst_string:
-             raise ValueError(f"{edge} is invalid as foreign key "
-                              f"'{fkey}' and primary key '{dst_key.name}' "
-                              f"have incompatible data types (got "
-                              f"fkey.dtype '{src_key.dtype}' and "
-                              f"pkey.dtype '{dst_key.dtype}')")
+         if self.backend == DataBackend.LOCAL:
+             # Check that fkey/pkey have valid and consistent data types:
+             assert src_key.dtype is not None
+             src_number = src_key.dtype.is_int() or src_key.dtype.is_float()
+             src_string = src_key.dtype.is_string()
+             assert dst_key.dtype is not None
+             dst_number = dst_key.dtype.is_int() or dst_key.dtype.is_float()
+             dst_string = dst_key.dtype.is_string()
+
+             if not src_number and not src_string:
+                 raise ValueError(
+                     f"{edge} is invalid as foreign key must be a number "
+                     f"or string (got '{src_key.dtype}'")
+
+             if src_number != dst_number or src_string != dst_string:
+                 raise ValueError(
+                     f"{edge} is invalid as foreign key '{fkey}' and "
+                     f"primary key '{dst_key.name}' have incompatible data "
+                     f"types (got foreign key data type '{src_key.dtype}' "
+                     f"and primary key data type '{dst_key.dtype}')")

      return self

kumoai/experimental/rfm/infer/__init__.py
@@ -1,17 +1,19 @@
  from .dtype import infer_dtype
- from .pkey import infer_primary_key
- from .time_col import infer_time_column
  from .id import contains_id
  from .timestamp import contains_timestamp
  from .categorical import contains_categorical
  from .multicategorical import contains_multicategorical
+ from .stype import infer_stype
+ from .pkey import infer_primary_key
+ from .time_col import infer_time_column

  __all__ = [
      'infer_dtype',
-     'infer_primary_key',
-     'infer_time_column',
      'contains_id',
      'contains_timestamp',
      'contains_categorical',
      'contains_multicategorical',
+     'infer_stype',
+     'infer_primary_key',
+     'infer_time_column',
  ]
kumoai/experimental/rfm/infer/dtype.py
@@ -10,6 +10,8 @@ PANDAS_TO_DTYPE: dict[str, Dtype] = {
      'int16': Dtype.int,
      'int32': Dtype.int,
      'int64': Dtype.int,
+     'float': Dtype.float,
+     'double': Dtype.float,
      'float16': Dtype.float,
      'float32': Dtype.float,
      'float64': Dtype.float,
@@ -18,6 +20,8 @@ PANDAS_TO_DTYPE: dict[str, Dtype] = {
      'string[python]': Dtype.string,
      'string[pyarrow]': Dtype.string,
      'binary': Dtype.binary,
+     'binary[python]': Dtype.binary,
+     'binary[pyarrow]': Dtype.binary,
  }


@@ -48,7 +52,8 @@ def infer_dtype(ser: pd.Series) -> Dtype:
          ser = pd.Series(arr, dtype=pd.ArrowDtype(arr.type))

      if isinstance(ser.dtype, pd.ArrowDtype):
-         if pa.types.is_list(ser.dtype.pyarrow_dtype):
+         if (pa.types.is_list(ser.dtype.pyarrow_dtype)
+                 or pa.types.is_fixed_size_list(ser.dtype.pyarrow_dtype)):
              elem_dtype = ser.dtype.pyarrow_dtype.value_type
              if pa.types.is_integer(elem_dtype):
                  return Dtype.intlist
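
Note: the is_fixed_size_list addition matters for columns such as fixed-width embeddings, which previously fell through this branch (pa.types.is_list matches only variable-length lists). A small illustration with invented values:

    import pandas as pd
    import pyarrow as pa

    # A fixed_size_list<float>[3] column, e.g. a 3-dimensional embedding:
    arr = pa.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
                   type=pa.list_(pa.float32(), 3))
    ser = pd.Series(arr, dtype=pd.ArrowDtype(arr.type))

    pa.types.is_list(ser.dtype.pyarrow_dtype)             # False
    pa.types.is_fixed_size_list(ser.dtype.pyarrow_dtype)  # True

With the change, such a series takes the list path (e.g. Dtype.intlist for integer elements) instead of falling through to the scalar handling.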
kumoai/experimental/rfm/infer/multicategorical.py
@@ -40,7 +40,7 @@ def contains_multicategorical(
          sep = max(candidates, key=candidates.get)  # type: ignore
          ser = ser.str.split(sep)

-     num_unique_multi = ser.explode().nunique()
+     num_unique_multi = ser.astype('object').explode().nunique()

      if dtype.is_list():
          return num_unique_multi <= MAX_CAT
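
Note: a plausible reading of the astype('object') change (the motivation is inferred, not stated in the diff) is that ser may be Arrow-backed at this point via the dtype.is_list() path, and casting to object first makes explode()/nunique() behave like the plain-pandas string path:

    import pandas as pd
    import pyarrow as pa

    ser = pd.Series([['a', 'b'], ['b', 'c']],
                    dtype=pd.ArrowDtype(pa.list_(pa.string())))
    ser.astype('object').explode().nunique()  # 3 -> {'a', 'b', 'c'}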
kumoai/experimental/rfm/infer/stype.py (new file)
@@ -0,0 +1,35 @@
+ import pandas as pd
+ from kumoapi.typing import Dtype, Stype
+
+ from kumoai.experimental.rfm.infer import (
+     contains_categorical,
+     contains_id,
+     contains_multicategorical,
+     contains_timestamp,
+ )
+
+
+ def infer_stype(ser: pd.Series, column_name: str, dtype: Dtype) -> Stype:
+     """Infers the :class:`Stype` from a :class:`pandas.Series`.
+
+     Args:
+         ser: A :class:`pandas.Series` to analyze.
+         column_name: The column name.
+         dtype: The data type.
+
+     Returns:
+         The semantic type.
+     """
+     if contains_id(ser, column_name, dtype):
+         return Stype.ID
+
+     if contains_timestamp(ser, column_name, dtype):
+         return Stype.timestamp
+
+     if contains_multicategorical(ser, column_name, dtype):
+         return Stype.multicategorical
+
+     if contains_categorical(ser, column_name, dtype):
+         return Stype.categorical
+
+     return dtype.default_stype
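
Note: a minimal usage sketch for the new helper (the example values are hypothetical; which Stype wins depends on the contains_* heuristics):

    import pandas as pd
    from kumoapi.typing import Dtype

    from kumoai.experimental.rfm.infer import infer_stype

    ser = pd.Series(['red', 'green', 'red', 'blue', 'green'])
    stype = infer_stype(ser, column_name='color', dtype=Dtype.string)
    print(stype)  # likely Stype.categorical for a low-cardinality string column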
kumoai/experimental/rfm/relbench.py (new file)
@@ -0,0 +1,76 @@
+ import difflib
+ import json
+ from functools import lru_cache
+ from urllib.request import urlopen
+
+ import pooch
+ import pyarrow as pa
+
+ from kumoai.experimental.rfm import Graph
+ from kumoai.experimental.rfm.backend.local import LocalTable
+
+ PREFIX = 'rel-'
+ CACHE_DIR = pooch.os_cache('relbench')
+ HASH_URL = ('https://raw.githubusercontent.com/snap-stanford/relbench/main/'
+             'relbench/datasets/hashes.json')
+
+
+ @lru_cache
+ def get_registry() -> pooch.Pooch:
+     with urlopen(HASH_URL) as r:
+         hashes = json.load(r)
+
+     return pooch.create(
+         path=CACHE_DIR,
+         base_url='https://relbench.stanford.edu/download/',
+         registry=hashes,
+     )
+
+
+ def from_relbench(dataset: str, verbose: bool = True) -> Graph:
+     dataset = dataset.lower()
+     if dataset.startswith(PREFIX):
+         dataset = dataset[len(PREFIX):]
+
+     registry = get_registry()
+
+     datasets = [key.split('/')[0][len(PREFIX):] for key in registry.registry]
+     if dataset not in datasets:
+         matches = difflib.get_close_matches(dataset, datasets, n=1)
+         hint = f" Did you mean '{matches[0]}'?" if len(matches) > 0 else ''
+         raise ValueError(f"Unknown RelBench dataset '{dataset}'.{hint} Valid "
+                          f"datasets are {str(datasets)[1:-1]}.")
+
+     registry.fetch(
+         f'{PREFIX}{dataset}/db.zip',
+         processor=pooch.Unzip(extract_dir='.'),
+         progressbar=verbose,
+     )
+
+     graph = Graph(tables=[])
+     edges: list[tuple[str, str, str]] = []
+     for path in (CACHE_DIR / f'{PREFIX}{dataset}' / 'db').glob('*.parquet'):
+         data = pa.parquet.read_table(path)
+         metadata = {
+             key.decode('utf-8'): json.loads(value.decode('utf-8'))
+             for key, value in data.schema.metadata.items()
+             if key in [b"fkey_col_to_pkey_table", b"pkey_col", b"time_col"]
+         }
+
+         table = LocalTable(
+             df=data.to_pandas(),
+             name=path.stem,
+             primary_key=metadata['pkey_col'],
+             time_column=metadata['time_col'],
+         )
+         graph.add_table(table)
+
+         edges.extend([
+             (path.stem, fkey, dst_table)
+             for fkey, dst_table in metadata['fkey_col_to_pkey_table'].items()
+         ])
+
+     for edge in edges:
+         graph.link(*edge)
+
+     return graph
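
Note: a quick end-to-end check of the loader (requires network on first run; subsequent runs hit the pooch cache under pooch.os_cache('relbench')):

    from kumoai.experimental.rfm.relbench import from_relbench

    graph = from_relbench('f1', verbose=False)
    print(sorted(graph.tables))  # one LocalTable per parquet file in rel-f1/db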
kumoai/experimental/rfm/rfm.py
@@ -28,13 +28,12 @@ from kumoapi.rfm import (
  from kumoapi.task import TaskType
  from kumoapi.typing import AggregationType, Stype

- from kumoai import in_notebook, in_snowflake_notebook
  from kumoai.client.rfm import RFMAPI
  from kumoai.exceptions import HTTPException
  from kumoai.experimental.rfm import Graph
  from kumoai.experimental.rfm.base import DataBackend, Sampler
  from kumoai.mixin import CastMixin
- from kumoai.utils import ProgressLogger
+ from kumoai.utils import ProgressLogger, display

  _RANDOM_SEED = 42

@@ -104,23 +103,8 @@ class Explanation:

      def print(self) -> None:
          r"""Prints the explanation."""
-         if in_snowflake_notebook():
-             import streamlit as st
-             st.dataframe(self.prediction, hide_index=True)
-             st.markdown(self.summary)
-         elif in_notebook():
-             from IPython.display import Markdown, display
-             try:
-                 if hasattr(self.prediction.style, 'hide'):
-                     display(self.prediction.hide(axis='index'))  # pandas=2
-                 else:
-                     display(self.prediction.hide_index())  # pandas<1.3
-             except ImportError:
-                 print(self.prediction.to_string(index=False))  # missing jinja2
-             display(Markdown(self.summary))
-         else:
-             print(self.prediction.to_string(index=False))
-             print(self.summary)
+         display.dataframe(self.prediction)
+         display.message(self.summary)

      def _ipython_display_(self) -> None:
          self.print()
@@ -714,7 +698,7 @@ class KumoRFM:
                              f"to have a time column")

          train, test = self._sampler.sample_target(
-             query=query,
+             query=query_def,
              num_train_examples=0,
              train_anchor_time=anchor_time,
              num_train_trials=0,
@@ -742,30 +726,34 @@ class KumoRFM:
                           "`predict()` or `evaluate()` methods to perform "
                           "predictions or evaluations.")

-         try:
-             request = RFMParseQueryRequest(
-                 query=query,
-                 graph_definition=self._graph_def,
-             )
-
-             resp = self._api_client.parse_query(request)
-
-             if len(resp.validation_response.warnings) > 0:
-                 msg = '\n'.join([
-                     f'{i+1}. {warning.title}: {warning.message}' for i, warning
-                     in enumerate(resp.validation_response.warnings)
-                 ])
-                 warnings.warn(f"Encountered the following warnings during "
-                               f"parsing:\n{msg}")
+         request = RFMParseQueryRequest(
+             query=query,
+             graph_definition=self._graph_def,
+         )

-             return resp.query
-         except HTTPException as e:
+         for attempt in range(self.num_retries + 1):
              try:
-                 msg = json.loads(e.detail)['detail']
-             except Exception:
-                 msg = e.detail
-             raise ValueError(f"Failed to parse query '{query}'. "
-                              f"{msg}") from None
+                 resp = self._api_client.parse_query(request)
+                 break
+             except HTTPException as e:
+                 if attempt == self.num_retries:
+                     try:
+                         msg = json.loads(e.detail)['detail']
+                     except Exception:
+                         msg = e.detail
+                     raise ValueError(f"Failed to parse query '{query}'. {msg}")
+
+                 time.sleep(2**attempt)  # 1s, 2s, 4s, 8s, ...
+
+         if len(resp.validation_response.warnings) > 0:
+             msg = '\n'.join([
+                 f'{i+1}. {warning.title}: {warning.message}'
+                 for i, warning in enumerate(resp.validation_response.warnings)
+             ])
+             warnings.warn(f"Encountered the following warnings during "
+                           f"parsing:\n{msg}")
+
+         return resp.query

      @staticmethod
      def _get_task_type(
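
Note: the resulting retry schedule of the parse loop above, assuming num_retries = 3 (the attribute's default is not visible in this diff):

    # attempt 0: parse_query fails -> sleep 2**0 = 1s
    # attempt 1: parse_query fails -> sleep 2**1 = 2s
    # attempt 2: parse_query fails -> sleep 2**2 = 4s
    # attempt 3: parse_query fails -> ValueError raised (no further sleep)

A successful parse_query breaks out of the loop immediately, after which any validation warnings are surfaced once and the parsed query is returned.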