PyPI - kumoai - Versions diffs - 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl → 2.15.0.dev202601151732__cp313-cp313-win_amd64.whl - Mend

kumoai 2.14.0.dev202512211732__cp313-cp313-win_amd64.whl → 2.15.0.dev202601151732__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

kumoai/__init__.py +23 -26
kumoai/_version.py +1 -1
kumoai/client/client.py +6 -0
kumoai/client/jobs.py +26 -0
kumoai/connector/utils.py +21 -7
kumoai/experimental/rfm/__init__.py +24 -22
kumoai/experimental/rfm/backend/local/graph_store.py +12 -21
kumoai/experimental/rfm/backend/local/sampler.py +0 -3
kumoai/experimental/rfm/backend/local/table.py +24 -25
kumoai/experimental/rfm/backend/snow/sampler.py +235 -80
kumoai/experimental/rfm/backend/snow/table.py +146 -70
kumoai/experimental/rfm/backend/sqlite/sampler.py +196 -89
kumoai/experimental/rfm/backend/sqlite/table.py +85 -55
kumoai/experimental/rfm/base/__init__.py +6 -9
kumoai/experimental/rfm/base/column.py +95 -11
kumoai/experimental/rfm/base/expression.py +44 -0
kumoai/experimental/rfm/base/mapper.py +69 -0
kumoai/experimental/rfm/base/sampler.py +28 -18
kumoai/experimental/rfm/base/source.py +1 -1
kumoai/experimental/rfm/base/sql_sampler.py +320 -19
kumoai/experimental/rfm/base/table.py +256 -109
kumoai/experimental/rfm/base/utils.py +36 -0
kumoai/experimental/rfm/graph.py +130 -110
kumoai/experimental/rfm/infer/dtype.py +7 -2
kumoai/experimental/rfm/infer/multicategorical.py +1 -1
kumoai/experimental/rfm/infer/time_col.py +4 -2
kumoai/experimental/rfm/relbench.py +76 -0
kumoai/experimental/rfm/rfm.py +540 -306
kumoai/experimental/rfm/task_table.py +292 -0
kumoai/kumolib.cp313-win_amd64.pyd +0 -0
kumoai/pquery/training_table.py +16 -2
kumoai/testing/snow.py +3 -3
kumoai/trainer/distilled_trainer.py +175 -0
kumoai/utils/display.py +87 -0
kumoai/utils/progress_logger.py +15 -2
kumoai/utils/sql.py +2 -2
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601151732.dist-info}/METADATA +2 -2
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601151732.dist-info}/RECORD +41 -36
kumoai/experimental/rfm/base/column_expression.py +0 -50
kumoai/experimental/rfm/base/sql_table.py +0 -229
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601151732.dist-info}/WHEEL +0 -0
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601151732.dist-info}/licenses/LICENSE +0 -0
{kumoai-2.14.0.dev202512211732.dist-info → kumoai-2.15.0.dev202601151732.dist-info}/top_level.txt +0 -0

kumoai/experimental/rfm/graph.py CHANGED Viewed

@@ -1,3 +1,5 @@
+from __future__ import annotations
 import contextlib
 import copy
 import io
@@ -16,14 +18,11 @@ from kumoapi.typing import Stype
 from typing_extensions import Self
 from kumoai import in_notebook, in_snowflake_notebook
-from kumoai.experimental.rfm.base import (
-    ColumnExpressionSpec,
-    DataBackend,
-    SQLTable,
-    Table,
-)
+from kumoai.experimental.rfm.base import ColumnSpec, DataBackend, Table
+from kumoai.experimental.rfm.infer import infer_time_column
 from kumoai.graph import Edge
 from kumoai.mixin import CastMixin
+from kumoai.utils import display
 if TYPE_CHECKING:
     import graphviz
@@ -103,27 +102,24 @@ class Graph:
         for table in tables:
             self.add_table(table)
-        for table in tables:
-            if not isinstance(table, SQLTable):
-                continue
-            if '_source_column_dict' not in table.__dict__:
+        for table in tables:  # Use links from source metadata:
+            if not any(column.is_source for column in table.columns):
                 continue
             for fkey in table._source_foreign_key_dict.values():
                 if fkey.name not in table:
                     continue
-                if not table[fkey.name].is_physical:
+                if not table[fkey.name].is_source:
                     continue
                 dst_table_names = [
                     table.name for table in self.tables.values()
-                    if isinstance(table, SQLTable)
-                    and table._source_name == fkey.dst_table
+                    if table.source_name == fkey.dst_table
                 ]
                 if len(dst_table_names) != 1:
                     continue
                 dst_table = self[dst_table_names[0]]
                 if dst_table._primary_key != fkey.primary_key:
                     continue
-                if not dst_table[fkey.primary_key].is_physical:
+                if not dst_table[fkey.primary_key].is_source:
                     continue
                 self.link(table.name, fkey.name, dst_table.name)
@@ -420,8 +416,9 @@ class Graph:
         assert isinstance(connection, Connection)
         with connection.cursor() as cursor:
-            cursor.execute(f"SELECT SYSTEM$READ_YAML_FROM_SEMANTIC_VIEW("
-                           f"'{semantic_view_name}')")
+            sql = (f"SELECT SYSTEM$READ_YAML_FROM_SEMANTIC_VIEW("
+                   f"'{semantic_view_name}')")
+            cursor.execute(sql)
             cfg = yaml.safe_load(cursor.fetchone()[0])
         graph = cls(tables=[])
@@ -444,9 +441,8 @@ class Graph:
                                 f"'{table_name}' since composite primary keys "
                                 f"are not yet supported")
-            columns: list[str] = []
+            columns: list[ColumnSpec] = []
             unsupported_columns: list[str] = []
-            column_expression_specs: list[ColumnExpressionSpec] = []
             for column_cfg in chain(
                     table_cfg.get('dimensions', []),
                     table_cfg.get('time_dimensions', []),
@@ -457,13 +453,13 @@ class Graph:
                 column_data_type = column_cfg.get('data_type', None)
                 if column_expr is None:
-                    columns.append(column_name)
+                    columns.append(ColumnSpec(name=column_name))
                     continue
                 column_expr = column_expr.replace(f'{table_name}.', '')
                 if column_expr == column_name:
-                    columns.append(column_name)
+                    columns.append(ColumnSpec(name=column_name))
                     continue
                 # Drop expressions that reference other tables (for now):
@@ -471,12 +467,12 @@ class Graph:
                     unsupported_columns.append(column_name)
                     continue
-                spec = ColumnExpressionSpec(
+                column = ColumnSpec(
                     name=column_name,
                     expr=column_expr,
-                    dtype=SnowTable.to_dtype(column_data_type),
+                    dtype=SnowTable._to_dtype(column_data_type),
                 )
-                column_expression_specs.append(spec)
+                columns.append(column)
             if len(unsupported_columns) == 1:
                 msgs.append(f"Failed to add column '{unsupported_columns[0]}' "
@@ -494,12 +490,21 @@ class Graph:
                 database=database,
                 schema=schema,
                 columns=columns,
-                column_expressions=column_expression_specs,
                 primary_key=primary_key,
             )
             # TODO Add a way to register time columns without heuristic usage.
-            table.infer_time_column(verbose=False)
+            time_candidates = [
+                column_cfg['name']
+                for column_cfg in table_cfg.get('time_dimensions', [])
+                if table.has_column(column_cfg['name'])
+                and table[column_cfg['name']].stype == Stype.timestamp
+            ]
+            if time_column := infer_time_column(
+                    df=table._get_sample_df(),
+                    candidates=time_candidates,
+            ):
+                table.time_column = time_column
             graph.add_table(table)
@@ -546,6 +551,35 @@ class Graph:
         return graph
+    @classmethod
+    def from_relbench(
+        cls,
+        dataset: str,
+        verbose: bool = True,
+    ) -> Graph:
+        r"""Loads a `RelBench <https://relbench.stanford.edu>`_ dataset into a
+        :class:`Graph` instance.
+        .. code-block:: python
+            >>> # doctest: +SKIP
+            >>> import kumoai.experimental.rfm as rfm
+            >>> graph = rfm.Graph.from_relbench("f1")
+        Args:
+            dataset: The RelBench dataset name.
+            verbose: Whether to print verbose output.
+        """
+        from kumoai.experimental.rfm.relbench import from_relbench
+        graph = from_relbench(dataset, verbose=verbose)
+        if verbose:
+            graph.print_metadata()
+            graph.print_links()
+        return graph
     # Backend #################################################################
     @property
@@ -627,28 +661,28 @@ class Graph:
         r"""Returns a :class:`pandas.DataFrame` object containing metadata
         information about the tables in this graph.
-        The returned dataframe has columns ``name``, ``primary_key``,
-        ``time_column``, and ``end_time_column``, which provide an aggregate
-        view of the properties of the tables of this graph.
+        The returned dataframe has columns ``"Name"``, ``"Primary Key"``,
+        ``"Time Column"``, and ``"End Time Column"``, which provide an
+        aggregated view of the properties of the tables of this graph.
         Example:
             >>> # doctest: +SKIP
             >>> import kumoai.experimental.rfm as rfm
             >>> graph = rfm.Graph(tables=...).infer_metadata()
             >>> graph.metadata  # doctest: +SKIP
-                name   primary_key  time_column  end_time_column
-            0   users      user_id            -                -
+                Name   Primary Key  Time Column  End Time Column
+            0   users  user_id      -            -
         """
         tables = list(self.tables.values())
         return pd.DataFrame({
-            'name':
+            'Name':
             pd.Series(dtype=str, data=[t.name for t in tables]),
-            'primary_key':
+            'Primary Key':
             pd.Series(dtype=str, data=[t._primary_key or '-' for t in tables]),
-            'time_column':
+            'Time Column':
             pd.Series(dtype=str, data=[t._time_column or '-' for t in tables]),
-            'end_time_column':
+            'End Time Column':
             pd.Series(
                 dtype=str,
                 data=[t._end_time_column or '-' for t in tables],
@@ -657,24 +691,8 @@ class Graph:
     def print_metadata(self) -> None:
         r"""Prints the :meth:`~Graph.metadata` of the graph."""
-        if in_snowflake_notebook():
-            import streamlit as st
-            st.markdown("### 🗂️ Graph Metadata")
-            st.dataframe(self.metadata, hide_index=True)
-        elif in_notebook():
-            from IPython.display import Markdown, display
-            display(Markdown("### 🗂️ Graph Metadata"))
-            df = self.metadata
-            try:
-                if hasattr(df.style, 'hide'):
-                    display(df.style.hide(axis='index'))  # pandas=2
-                else:
-                    display(df.style.hide_index())  # pandas<1.3
-            except ImportError:
-                print(df.to_string(index=False))  # missing jinja2
-        else:
-            print("🗂️ Graph Metadata:")
-            print(self.metadata.to_string(index=False))
+        display.title("🗂️ Graph Metadata")
+        display.dataframe(self.metadata)
     def infer_metadata(self, verbose: bool = True) -> Self:
         r"""Infers metadata for all tables in the graph.
@@ -703,40 +721,21 @@ class Graph:
     def print_links(self) -> None:
         r"""Prints the :meth:`~Graph.edges` of the graph."""
-        edges = [(edge.dst_table, self[edge.dst_table]._primary_key,
-                  edge.src_table, edge.fkey) for edge in self.edges]
-        edges = sorted(edges)
-        if in_snowflake_notebook():
-            import streamlit as st
-            st.markdown("### 🕸️ Graph Links (FK ↔️ PK)")
-            if len(edges) > 0:
-                st.markdown('\n'.join([
-                    f"- {edge[2]}.{edge[3]} ↔️ {edge[0]}.{edge[1]}"
-                    for edge in edges
-                ]))
-            else:
-                st.markdown("*No links registered*")
-        elif in_notebook():
-            from IPython.display import Markdown, display
-            display(Markdown("### 🕸️ Graph Links (FK ↔️ PK)"))
-            if len(edges) > 0:
-                display(
-                    Markdown('\n'.join([
-                        f"- `{edge[2]}.{edge[3]}` ↔️ `{edge[0]}.{edge[1]}`"
-                        for edge in edges
-                    ])))
-            else:
-                display(Markdown("*No links registered*"))
+        edges = sorted([(
+            edge.dst_table,
+            self[edge.dst_table]._primary_key,
+            edge.src_table,
+            edge.fkey,
+        ) for edge in self.edges])
+        display.title("🕸️ Graph Links (FK ↔️ PK)")
+        if len(edges) > 0:
+            display.unordered_list(items=[
+                f"`{edge[2]}.{edge[3]}` ↔️ `{edge[0]}.{edge[1]}`"
+                for edge in edges
+            ])
         else:
-            print("🕸️ Graph Links (FK ↔️ PK):")
-            if len(edges) > 0:
-                print('\n'.join([
-                    f"• {edge[2]}.{edge[3]} ↔️ {edge[0]}.{edge[1]}"
-                    for edge in edges
-                ]))
-            else:
-                print("No links registered")
+            display.italic("No links registered")
     def link(
         self,
@@ -843,6 +842,30 @@ class Graph:
         """
         known_edges = {(edge.src_table, edge.fkey) for edge in self.edges}
+        for table in self.tables.values():  # Use links from source metadata:
+            if not any(column.is_source for column in table.columns):
+                continue
+            for fkey in table._source_foreign_key_dict.values():
+                if fkey.name not in table:
+                    continue
+                if not table[fkey.name].is_source:
+                    continue
+                if (table.name, fkey.name) in known_edges:
+                    continue
+                dst_table_names = [
+                    table.name for table in self.tables.values()
+                    if table.source_name == fkey.dst_table
+                ]
+                if len(dst_table_names) != 1:
+                    continue
+                dst_table = self[dst_table_names[0]]
+                if dst_table._primary_key != fkey.primary_key:
+                    continue
+                if not dst_table[fkey.primary_key].is_source:
+                    continue
+                self.link(table.name, fkey.name, dst_table.name)
+                known_edges.add((table.name, fkey.name))
         # A list of primary key candidates (+score) for every column:
         candidate_dict: dict[
             tuple[str, str],
@@ -942,13 +965,8 @@ class Graph:
                     if score < 5.0:
                         continue
-                    candidate_dict[(
-                        src_table.name,
-                        src_key.name,
-                    )].append((
-                        dst_table.name,
-                        score,
-                    ))
+                    candidate_dict[(src_table.name, src_key.name)].append(
+                        (dst_table.name, score))
         for (src_table_name, src_key_name), scores in candidate_dict.items():
             scores.sort(key=lambda x: x[-1], reverse=True)
@@ -1007,24 +1025,26 @@ class Graph:
                                  f"either the primary key or the link before "
                                  f"before proceeding.")
-            # Check that fkey/pkey have valid and consistent data types:
-            assert src_key.dtype is not None
-            src_number = src_key.dtype.is_int() or src_key.dtype.is_float()
-            src_string = src_key.dtype.is_string()
-            assert dst_key.dtype is not None
-            dst_number = dst_key.dtype.is_int() or dst_key.dtype.is_float()
-            dst_string = dst_key.dtype.is_string()
-            if not src_number and not src_string:
-                raise ValueError(f"{edge} is invalid as foreign key must be a "
-                                 f"number or string (got '{src_key.dtype}'")
-            if src_number != dst_number or src_string != dst_string:
-                raise ValueError(f"{edge} is invalid as foreign key "
-                                 f"'{fkey}' and primary key '{dst_key.name}' "
-                                 f"have incompatible data types (got "
-                                 f"fkey.dtype '{src_key.dtype}' and "
-                                 f"pkey.dtype '{dst_key.dtype}')")
+            if self.backend == DataBackend.LOCAL:
+                # Check that fkey/pkey have valid and consistent data types:
+                assert src_key.dtype is not None
+                src_number = src_key.dtype.is_int() or src_key.dtype.is_float()
+                src_string = src_key.dtype.is_string()
+                assert dst_key.dtype is not None
+                dst_number = dst_key.dtype.is_int() or dst_key.dtype.is_float()
+                dst_string = dst_key.dtype.is_string()
+                if not src_number and not src_string:
+                    raise ValueError(
+                        f"{edge} is invalid as foreign key must be a number "
+                        f"or string (got '{src_key.dtype}'")
+                if src_number != dst_number or src_string != dst_string:
+                    raise ValueError(
+                        f"{edge} is invalid as foreign key '{fkey}' and "
+                        f"primary key '{dst_key.name}' have incompatible data "
+                        f"types (got foreign key data type '{src_key.dtype}' "
+                        f"and primary key data type '{dst_key.dtype}')")
         return self

kumoai/experimental/rfm/infer/dtype.py CHANGED Viewed

@@ -3,6 +3,8 @@ import pandas as pd
 import pyarrow as pa
 from kumoapi.typing import Dtype
+from kumoai.experimental.rfm.base.utils import is_datetime
 PANDAS_TO_DTYPE: dict[str, Dtype] = {
     'bool': Dtype.bool,
     'boolean': Dtype.bool,
@@ -20,6 +22,8 @@ PANDAS_TO_DTYPE: dict[str, Dtype] = {
     'string[python]': Dtype.string,
     'string[pyarrow]': Dtype.string,
     'binary': Dtype.binary,
+    'binary[python]': Dtype.binary,
+    'binary[pyarrow]': Dtype.binary,
 }
@@ -32,7 +36,7 @@ def infer_dtype(ser: pd.Series) -> Dtype:
     Returns:
         The data type.
     """
-    if pd.api.types.is_datetime64_any_dtype(ser.dtype):
+    if is_datetime(ser):
         return Dtype.date
     if pd.api.types.is_timedelta64_dtype(ser.dtype):
         return Dtype.timedelta
@@ -50,7 +54,8 @@ def infer_dtype(ser: pd.Series) -> Dtype:
             ser = pd.Series(arr, dtype=pd.ArrowDtype(arr.type))
     if isinstance(ser.dtype, pd.ArrowDtype):
-        if pa.types.is_list(ser.dtype.pyarrow_dtype):
+        if (pa.types.is_list(ser.dtype.pyarrow_dtype)
+                or pa.types.is_fixed_size_list(ser.dtype.pyarrow_dtype)):
             elem_dtype = ser.dtype.pyarrow_dtype.value_type
             if pa.types.is_integer(elem_dtype):
                 return Dtype.intlist

kumoai/experimental/rfm/infer/multicategorical.py CHANGED Viewed

@@ -40,7 +40,7 @@ def contains_multicategorical(
         sep = max(candidates, key=candidates.get)  # type: ignore
         ser = ser.str.split(sep)
-    num_unique_multi = ser.explode().nunique()
+    num_unique_multi = ser.astype('object').explode().nunique()
     if dtype.is_list():
         return num_unique_multi <= MAX_CAT

kumoai/experimental/rfm/infer/time_col.py CHANGED Viewed

@@ -3,6 +3,8 @@ import warnings
 import pandas as pd
+from kumoai.experimental.rfm.base.utils import to_datetime
 def infer_time_column(
     df: pd.DataFrame,
@@ -43,11 +45,11 @@ def infer_time_column(
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', message='Could not infer format')
         min_timestamp_dict = {
-            key: pd.to_datetime(df[key].iloc[:10_000], 'coerce')
+            key: to_datetime(df[key].iloc[:10_000])
             for key in candidates
         }
     min_timestamp_dict = {
-        key: value.min().tz_localize(None)
+        key: value.min()
         for key, value in min_timestamp_dict.items()
     }
     min_timestamp_dict = {

kumoai/experimental/rfm/relbench.py ADDED Viewed

@@ -0,0 +1,76 @@
+import difflib
+import json
+from functools import lru_cache
+from urllib.request import urlopen
+import pooch
+import pyarrow as pa
+from kumoai.experimental.rfm import Graph
+from kumoai.experimental.rfm.backend.local import LocalTable
+PREFIX = 'rel-'
+CACHE_DIR = pooch.os_cache('relbench')
+HASH_URL = ('https://raw.githubusercontent.com/snap-stanford/relbench/main/'
+            'relbench/datasets/hashes.json')
+@lru_cache
+def get_registry() -> pooch.Pooch:
+    with urlopen(HASH_URL) as r:
+        hashes = json.load(r)
+    return pooch.create(
+        path=CACHE_DIR,
+        base_url='https://relbench.stanford.edu/download/',
+        registry=hashes,
+    )
+def from_relbench(dataset: str, verbose: bool = True) -> Graph:
+    dataset = dataset.lower()
+    if dataset.startswith(PREFIX):
+        dataset = dataset[len(PREFIX):]
+    registry = get_registry()
+    datasets = [key.split('/')[0][len(PREFIX):] for key in registry.registry]
+    if dataset not in datasets:
+        matches = difflib.get_close_matches(dataset, datasets, n=1)
+        hint = f" Did you mean '{matches[0]}'?" if len(matches) > 0 else ''
+        raise ValueError(f"Unknown RelBench dataset '{dataset}'.{hint} Valid "
+                         f"datasets are {str(datasets)[1:-1]}.")
+    registry.fetch(
+        f'{PREFIX}{dataset}/db.zip',
+        processor=pooch.Unzip(extract_dir='.'),
+        progressbar=verbose,
+    )
+    graph = Graph(tables=[])
+    edges: list[tuple[str, str, str]] = []
+    for path in (CACHE_DIR / f'{PREFIX}{dataset}' / 'db').glob('*.parquet'):
+        data = pa.parquet.read_table(path)
+        metadata = {
+            key.decode('utf-8'): json.loads(value.decode('utf-8'))
+            for key, value in data.schema.metadata.items()
+            if key in [b"fkey_col_to_pkey_table", b"pkey_col", b"time_col"]
+        }
+        table = LocalTable(
+            df=data.to_pandas(),
+            name=path.stem,
+            primary_key=metadata['pkey_col'],
+            time_column=metadata['time_col'],
+        )
+        graph.add_table(table)
+        edges.extend([
+            (path.stem, fkey, dst_table)
+            for fkey, dst_table in metadata['fkey_col_to_pkey_table'].items()
+        ])
+    for edge in edges:
+        graph.link(*edge)
+    return graph