PyPI - kumoai - Versions diffs - 2.12.0.dev202510231830__cp311-cp311-win_amd64.whl → 2.14.0.dev202512311733__cp311-cp311-win_amd64.whl - Mend

kumoai 2.12.0.dev202510231830__cp311-cp311-win_amd64.whl → 2.14.0.dev202512311733__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

kumoai/__init__.py +41 -35
kumoai/_version.py +1 -1
kumoai/client/client.py +15 -13
kumoai/client/endpoints.py +1 -0
kumoai/client/jobs.py +24 -0
kumoai/client/pquery.py +6 -2
kumoai/client/rfm.py +35 -7
kumoai/connector/utils.py +23 -2
kumoai/experimental/rfm/__init__.py +191 -48
kumoai/experimental/rfm/authenticate.py +3 -4
kumoai/experimental/rfm/backend/__init__.py +0 -0
kumoai/experimental/rfm/backend/local/__init__.py +42 -0
kumoai/experimental/rfm/{local_graph_store.py → backend/local/graph_store.py} +65 -127
kumoai/experimental/rfm/backend/local/sampler.py +312 -0
kumoai/experimental/rfm/backend/local/table.py +113 -0
kumoai/experimental/rfm/backend/snow/__init__.py +37 -0
kumoai/experimental/rfm/backend/snow/sampler.py +297 -0
kumoai/experimental/rfm/backend/snow/table.py +242 -0
kumoai/experimental/rfm/backend/sqlite/__init__.py +32 -0
kumoai/experimental/rfm/backend/sqlite/sampler.py +398 -0
kumoai/experimental/rfm/backend/sqlite/table.py +184 -0
kumoai/experimental/rfm/base/__init__.py +30 -0
kumoai/experimental/rfm/base/column.py +152 -0
kumoai/experimental/rfm/base/expression.py +44 -0
kumoai/experimental/rfm/base/sampler.py +761 -0
kumoai/experimental/rfm/base/source.py +19 -0
kumoai/experimental/rfm/base/sql_sampler.py +143 -0
kumoai/experimental/rfm/base/table.py +735 -0
kumoai/experimental/rfm/graph.py +1237 -0
kumoai/experimental/rfm/infer/__init__.py +8 -0
kumoai/experimental/rfm/infer/dtype.py +82 -0
kumoai/experimental/rfm/infer/multicategorical.py +1 -1
kumoai/experimental/rfm/infer/pkey.py +128 -0
kumoai/experimental/rfm/infer/stype.py +35 -0
kumoai/experimental/rfm/infer/time_col.py +61 -0
kumoai/experimental/rfm/pquery/__init__.py +0 -4
kumoai/experimental/rfm/pquery/executor.py +27 -27
kumoai/experimental/rfm/pquery/pandas_executor.py +64 -40
kumoai/experimental/rfm/relbench.py +76 -0
kumoai/experimental/rfm/rfm.py +386 -276
kumoai/experimental/rfm/sagemaker.py +138 -0
kumoai/kumolib.cp311-win_amd64.pyd +0 -0
kumoai/pquery/predictive_query.py +10 -6
kumoai/spcs.py +1 -3
kumoai/testing/decorators.py +1 -1
kumoai/testing/snow.py +50 -0
kumoai/trainer/distilled_trainer.py +175 -0
kumoai/trainer/trainer.py +9 -10
kumoai/utils/__init__.py +3 -2
kumoai/utils/display.py +51 -0
kumoai/utils/progress_logger.py +188 -16
kumoai/utils/sql.py +3 -0
{kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/METADATA +13 -2
{kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/RECORD +57 -36
kumoai/experimental/rfm/local_graph.py +0 -810
kumoai/experimental/rfm/local_graph_sampler.py +0 -184
kumoai/experimental/rfm/local_pquery_driver.py +0 -494
kumoai/experimental/rfm/local_table.py +0 -545
kumoai/experimental/rfm/pquery/backend.py +0 -136
kumoai/experimental/rfm/pquery/pandas_backend.py +0 -478
kumoai/experimental/rfm/utils.py +0 -344
{kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/WHEEL +0 -0
{kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/licenses/LICENSE +0 -0
{kumoai-2.12.0.dev202510231830.dist-info → kumoai-2.14.0.dev202512311733.dist-info}/top_level.txt +0 -0

kumoai/experimental/rfm/sagemaker.py ADDED Viewed

@@ -0,0 +1,138 @@
+import base64
+import json
+from typing import Any
+import requests
+from kumoai.client import KumoClient
+from kumoai.client.endpoints import Endpoint, HTTPMethod
+from kumoai.exceptions import HTTPException
+try:
+    # isort: off
+    from mypy_boto3_sagemaker_runtime.client import SageMakerRuntimeClient
+    from mypy_boto3_sagemaker_runtime.type_defs import (
+        InvokeEndpointOutputTypeDef, )
+    # isort: on
+except ImportError:
+    SageMakerRuntimeClient = Any
+    InvokeEndpointOutputTypeDef = Any
+class SageMakerResponseAdapter(requests.Response):
+    def __init__(self, sm_response: InvokeEndpointOutputTypeDef):
+        super().__init__()
+        # Read the body bytes
+        self._content = sm_response['Body'].read()
+        self.status_code = 200
+        self.headers['Content-Type'] = sm_response.get('ContentType',
+                                                       'application/json')
+        # Optionally, you can store original sm_response for debugging
+        self.sm_response = sm_response
+    @property
+    def text(self) -> str:
+        assert isinstance(self._content, bytes)
+        return self._content.decode('utf-8')
+    def json(self, **kwargs) -> dict[str, Any]:  # type: ignore
+        return json.loads(self.text, **kwargs)
+class KumoClient_SageMakerAdapter(KumoClient):
+    def __init__(self, region: str, endpoint_name: str):
+        import boto3
+        self._client: SageMakerRuntimeClient = boto3.client(
+            service_name="sagemaker-runtime", region_name=region)
+        self._endpoint_name = endpoint_name
+        # Recording buffers.
+        self._recording_active = False
+        self._recorded_reqs: list[dict[str, Any]] = []
+        self._recorded_resps: list[dict[str, Any]] = []
+    def authenticate(self) -> None:
+        # TODO(siyang): call /ping to verify?
+        pass
+    def _request(self, endpoint: Endpoint, **kwargs: Any) -> requests.Response:
+        assert endpoint.method == HTTPMethod.POST
+        if 'json' in kwargs:
+            payload = json.dumps(kwargs.pop('json'))
+        elif 'data' in kwargs:
+            raw_payload = kwargs.pop('data')
+            assert isinstance(raw_payload, bytes)
+            payload = base64.b64encode(raw_payload).decode()
+        else:
+            raise HTTPException(400, 'Unable to send data to KumoRFM.')
+        request = {
+            'method': endpoint.get_path().rsplit('/')[-1],
+            'payload': payload,
+        }
+        response: InvokeEndpointOutputTypeDef = self._client.invoke_endpoint(
+            EndpointName=self._endpoint_name,
+            ContentType="application/json",
+            Body=json.dumps(request),
+        )
+        adapted_response = SageMakerResponseAdapter(response)
+        # If validation is active, store input/output
+        if self._recording_active:
+            self._recorded_reqs.append(request)
+            self._recorded_resps.append(adapted_response.json())
+        return adapted_response
+    def start_recording(self) -> None:
+        """Start recording requests/responses to/from sagemaker endpoint."""
+        assert not self._recording_active
+        self._recording_active = True
+        self._recorded_reqs.clear()
+        self._recorded_resps.clear()
+    def end_recording(self) -> list[tuple[dict[str, Any], dict[str, Any]]]:
+        """Stop recording and return recorded requests/responses."""
+        assert self._recording_active
+        self._recording_active = False
+        recorded = list(zip(self._recorded_reqs, self._recorded_resps))
+        self._recorded_reqs.clear()
+        self._recorded_resps.clear()
+        return recorded
+class KumoClient_SageMakerProxy_Local(KumoClient):
+    def __init__(self, url: str):
+        self._client = KumoClient(url, api_key=None)
+        self._client._api_url = self._client._url
+        self._endpoint = Endpoint('/invocations', HTTPMethod.POST)
+    def authenticate(self) -> None:
+        try:
+            self._client._session.get(
+                self._url + '/ping',
+                verify=self._verify_ssl).raise_for_status()
+        except Exception:
+            raise ValueError(
+                "Client authentication failed. Please check if you "
+                "have a valid API key/credentials.")
+    def _request(self, endpoint: Endpoint, **kwargs: Any) -> requests.Response:
+        assert endpoint.method == HTTPMethod.POST
+        if 'json' in kwargs:
+            payload = json.dumps(kwargs.pop('json'))
+        elif 'data' in kwargs:
+            raw_payload = kwargs.pop('data')
+            assert isinstance(raw_payload, bytes)
+            payload = base64.b64encode(raw_payload).decode()
+        else:
+            raise HTTPException(400, 'Unable to send data to KumoRFM.')
+        return self._client._request(
+            self._endpoint,
+            json={
+                'method': endpoint.get_path().rsplit('/')[-1],
+                'payload': payload,
+            },
+            **kwargs,
+        )

kumoai/kumolib.cp311-win_amd64.pyd CHANGED Viewed

Binary file

kumoai/pquery/predictive_query.py CHANGED Viewed

@@ -370,9 +370,11 @@ class PredictiveQuery:
         train_table_job_api = global_state.client.generate_train_table_job_api
         job_id: GenerateTrainTableJobID = train_table_job_api.create(
             GenerateTrainTableRequest(
-                dict(custom_tags), pq_id, plan,
-                graph_snapshot_id=self.graph.snapshot(
-                    non_blocking=non_blocking)))
+                dict(custom_tags),
+                pq_id,
+                plan,
+                None,
+            ))
         self._train_table = TrainingTableJob(job_id=job_id)
         if non_blocking:
@@ -451,9 +453,11 @@ class PredictiveQuery:
         bp_table_api = global_state.client.generate_prediction_table_job_api
         job_id: GeneratePredictionTableJobID = bp_table_api.create(
             GeneratePredictionTableRequest(
-                dict(custom_tags), pq_id, plan,
-                graph_snapshot_id=self.graph.snapshot(
-                    non_blocking=non_blocking)))
+                dict(custom_tags),
+                pq_id,
+                plan,
+                None,
+            ))
         self._prediction_table = PredictionTableJob(job_id=job_id)
         if non_blocking:

kumoai/spcs.py CHANGED Viewed

@@ -54,9 +54,7 @@ def _refresh_spcs_token() -> None:
         api_key=global_state._api_key,
         spcs_token=spcs_token,
     )
-    if not client.authenticate():
-        raise ValueError("Client authentication failed. Please check if you "
-                         "have a valid API key.")
+    client.authenticate()
     # Update state:
     global_state.set_spcs_token(spcs_token)

kumoai/testing/decorators.py CHANGED Viewed

@@ -25,7 +25,7 @@ def onlyFullTest(func: Callable) -> Callable:
 def has_package(package: str) -> bool:
     r"""Returns ``True`` in case ``package`` is installed."""
     req = Requirement(package)
-    if importlib.util.find_spec(req.name) is None:
+    if importlib.util.find_spec(req.name) is None:  # type: ignore
         return False
     try:

kumoai/testing/snow.py ADDED Viewed

@@ -0,0 +1,50 @@
+import json
+import os
+from kumoai.experimental.rfm.backend.snow import Connection
+from kumoai.experimental.rfm.backend.snow import connect as _connect
+def connect(
+    region: str,
+    id: str,
+    account: str,
+    user: str,
+    warehouse: str,
+    database: str | None = None,
+    schema: str | None = None,
+) -> Connection:
+    kwargs = dict(password=os.getenv('SNOWFLAKE_PASSWORD'))
+    if kwargs['password'] is None:
+        import boto3
+        from cryptography.hazmat.primitives import serialization
+        client = boto3.client(
+            service_name='secretsmanager',
+            region_name=region,
+        )
+        secret_id = (f'arn:aws:secretsmanager:{region}:{id}:secret:'
+                     f'{account}.snowflakecomputing.com')
+        response = client.get_secret_value(SecretId=secret_id)['SecretString']
+        secret = json.loads(response)
+        private_key = serialization.load_pem_private_key(
+            secret['kumo_user_secretkey'].encode(),
+            password=None,
+        )
+        kwargs['private_key'] = private_key.private_bytes(
+            encoding=serialization.Encoding.DER,
+            format=serialization.PrivateFormat.PKCS8,
+            encryption_algorithm=serialization.NoEncryption(),
+        )
+    return _connect(
+        account=account,
+        user=user,
+        warehouse='WH_XS',
+        database='KUMO',
+        schema=schema,
+        session_parameters=dict(CLIENT_TELEMETRY_ENABLED=False),
+        **kwargs,
+    )

kumoai/trainer/distilled_trainer.py ADDED Viewed

@@ -0,0 +1,175 @@
+import logging
+from typing import Literal, Mapping, Optional, Union, overload
+from kumoapi.distilled_model_plan import DistilledModelPlan
+from kumoapi.jobs import DistillationJobRequest, DistillationJobResource
+from kumoai import global_state
+from kumoai.client.jobs import TrainingJobID
+from kumoai.graph import Graph
+from kumoai.pquery.training_table import TrainingTable, TrainingTableJob
+from kumoai.trainer.job import TrainingJob, TrainingJobResult
+logger = logging.getLogger(__name__)
+class DistillationTrainer:
+    r"""A trainer supports creating a Kumo machine learning model
+    for use in an online serving endpoint. The distllation process involes
+    training a shallow model on a :class:`~kumoai.pquery.PredictiveQuery` using
+    the embeddings generated by a base model :args:`base_training_job_id`.
+    Args:
+        model_plan: The distilled model plan to use for the distillation process.
+        base_training_job_id: The ID of the base training job to use for the distillation process.
+    """  # noqa: E501
+    def __init__(
+        self,
+        model_plan: DistilledModelPlan,
+        base_training_job_id: TrainingJobID,
+    ) -> None:
+        self.model_plan: DistilledModelPlan = model_plan
+        self.base_training_job_id: TrainingJobID = base_training_job_id
+        # Cached from backend:
+        self._training_job_id: Optional[TrainingJobID] = None
+    # Metadata ################################################################
+    @property
+    def is_trained(self) -> bool:
+        r"""Returns ``True`` if this trainer instance has successfully been
+        trained (and is therefore ready for prediction); ``False`` otherwise.
+        """
+        raise NotImplementedError(
+            "Checking if a distilled trainer is trained is not "
+            "implemented yet.")
+    @overload
+    def fit(
+        self,
+        graph: Graph,
+        train_table: Union[TrainingTable, TrainingTableJob],
+    ) -> TrainingJobResult:
+        pass
+    @overload
+    def fit(
+        self,
+        graph: Graph,
+        train_table: Union[TrainingTable, TrainingTableJob],
+        *,
+        non_blocking: Literal[False],
+    ) -> TrainingJobResult:
+        pass
+    @overload
+    def fit(
+        self,
+        graph: Graph,
+        train_table: Union[TrainingTable, TrainingTableJob],
+        *,
+        non_blocking: Literal[True],
+    ) -> TrainingJob:
+        pass
+    @overload
+    def fit(
+        self,
+        graph: Graph,
+        train_table: Union[TrainingTable, TrainingTableJob],
+        *,
+        non_blocking: bool,
+    ) -> Union[TrainingJob, TrainingJobResult]:
+        pass
+    def fit(
+        self,
+        graph: Graph,
+        train_table: Union[TrainingTable, TrainingTableJob],
+        *,
+        non_blocking: bool = False,
+        custom_tags: Mapping[str, str] = {},
+    ) -> Union[TrainingJob, TrainingJobResult]:
+        r"""Fits a model to the specified graph and training table, with the
+        strategy defined by :class:`DistilledTrainer`'s :obj:`model_plan`.
+        Args:
+            graph: The :class:`~kumoai.graph.Graph` object that represents the
+                tables and relationships that Kumo will learn from.
+            train_table: The :class:`~kumoai.pquery.TrainingTable`, or
+                in-progress :class:`~kumoai.pquery.TrainingTableJob`, that
+                represents the training data produced by a
+                :class:`~kumoai.pquery.PredictiveQuery` on :obj:`graph`.
+            non_blocking: Whether this operation should return immediately
+                after launching the training job, or await completion of the
+                training job.
+            custom_tags: Additional, customer defined k-v tags to be associated
+                with the job to be launched. Job tags are useful for grouping
+                and searching jobs.
+        Returns:
+            Union[TrainingJobResult, TrainingJob]:
+                If ``non_blocking=False``, returns a training job object. If
+                ``non_blocking=True``, returns a training job future object.
+        """
+        # TODO(manan, siyang): remove soon:
+        job_id = train_table.job_id
+        assert job_id is not None
+        train_table_job_api = global_state.client.generate_train_table_job_api
+        pq_id = train_table_job_api.get(job_id).config.pquery_id
+        assert pq_id is not None
+        custom_table = None
+        if isinstance(train_table, TrainingTable):
+            custom_table = train_table._custom_train_table
+        # NOTE the backend implementation currently handles sequentialization
+        # between a training table future and a training job; that is, if the
+        # training table future is still executing, the backend will wait on
+        # the job ID completion before executing a training job. This preserves
+        # semantics for both futures, ensures that Kumo works as expected if
+        # used only via REST API, and allows us to avoid chaining calllbacks
+        # in an ugly way here:
+        api = global_state.client.distillation_job_api
+        self._training_job_id = api.create(
+            DistillationJobRequest(
+                dict(custom_tags),
+                pquery_id=pq_id,
+                base_training_job_id=self.base_training_job_id,
+                distilled_model_plan=self.model_plan,
+                graph_snapshot_id=graph.snapshot(non_blocking=non_blocking),
+                train_table_job_id=job_id,
+                custom_train_table=custom_table,
+            ))
+        out = TrainingJob(job_id=self._training_job_id)
+        if non_blocking:
+            return out
+        return out.attach()
+    @classmethod
+    def _load_from_job(
+        cls,
+        job: DistillationJobResource,
+    ) -> 'DistillationTrainer':
+        trainer = cls(job.config.distilled_model_plan,
+                      job.config.base_training_job_id)
+        trainer._training_job_id = job.job_id
+        return trainer
+    @classmethod
+    def load(cls, job_id: TrainingJobID) -> 'DistillationTrainer':
+        r"""Creates a :class:`~kumoai.trainer.Trainer` instance from a training
+        job ID.
+        """
+        raise NotImplementedError(
+            "Loading a distilled trainer from a job ID is not implemented yet."
+        )
+    @classmethod
+    def load_from_tags(cls, tags: Mapping[str, str]) -> 'DistillationTrainer':
+        raise NotImplementedError(
+            "Loading a distilled trainer from tags is not implemented yet.")

kumoai/trainer/trainer.py CHANGED Viewed

@@ -20,7 +20,6 @@ from kumoapi.jobs import (
     TrainingJobResource,
 )
 from kumoapi.model_plan import ModelPlan
-from kumoapi.task import TaskType
 from kumoai import global_state
 from kumoai.artifact_export.config import OutputConfig
@@ -405,15 +404,15 @@ class Trainer:
                 pred_table_data_path = prediction_table.table_data_uri
         api = global_state.client.batch_prediction_job_api
-        from kumoai.pquery.predictive_query import PredictiveQuery
-        pquery = PredictiveQuery.load_from_training_job(training_job_id)
-        if pquery.get_task_type() == TaskType.BINARY_CLASSIFICATION:
-            if binary_classification_threshold is None:
-                logger.warning("No binary classification threshold provided. "
-                               "Using default threshold of 0.5.")
-                binary_classification_threshold = 0.5
+        # Remove to resolve https://github.com/kumo-ai/kumo/issues/24250
+        # from kumoai.pquery.predictive_query import PredictiveQuery
+        # pquery = PredictiveQuery.load_from_training_job(training_job_id)
+        # if pquery.get_task_type() == TaskType.BINARY_CLASSIFICATION:
+        #     if binary_classification_threshold is None:
+        #         logger.warning(
+        # "No binary classification threshold provided. "
+        # "Using default threshold of 0.5.")
+        #         binary_classification_threshold = 0.5
         job_id, response = api.maybe_create(
             BatchPredictionRequest(
                 dict(custom_tags),

kumoai/utils/__init__.py CHANGED Viewed

@@ -1,10 +1,11 @@
-from .progress_logger import ProgressLogger, InteractiveProgressLogger
+from .sql import quote_ident
+from .progress_logger import ProgressLogger
 from .forecasting import ForecastVisualizer
 from .datasets import from_relbench
 __all__ = [
+    'quote_ident',
     'ProgressLogger',
-    'InteractiveProgressLogger',
     'ForecastVisualizer',
     'from_relbench',
 ]

kumoai/utils/display.py ADDED Viewed

@@ -0,0 +1,51 @@
+from collections.abc import Sequence
+import pandas as pd
+from kumoai import in_notebook, in_snowflake_notebook
+def message(msg: str) -> None:
+    msg = msg.replace("`", "'") if not in_notebook() else msg
+    if in_snowflake_notebook():
+        import streamlit as st
+        st.markdown(msg)
+    elif in_notebook():
+        from IPython.display import Markdown, display
+        display(Markdown(msg))
+    else:
+        print(msg)
+def title(msg: str) -> None:
+    message(f"### {msg}" if in_notebook() else f"{msg}:")
+def italic(msg: str) -> None:
+    message(f"*{msg}*" if in_notebook() else msg)
+def unordered_list(items: Sequence[str]) -> None:
+    if in_notebook():
+        msg = '\n'.join([f"- {item}" for item in items])
+    else:
+        msg = '\n'.join([f"• {item.replace('`', '')}" for item in items])
+    message(msg)
+def dataframe(df: pd.DataFrame) -> None:
+    if in_snowflake_notebook():
+        import streamlit as st
+        st.dataframe(df, hide_index=True)
+    elif in_notebook():
+        from IPython.display import display
+        try:
+            if hasattr(df.style, 'hide'):
+                display(df.style.hide(axis='index'))  # pandas=2
+            else:
+                display(df.style.hide_index())  # pandas<1.3
+        except ImportError:
+            print(df.to_string(index=False))  # missing jinja2
+    else:
+        print(df.to_string(index=False))