PyPI - outerproduct-http-types - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

outerproduct-http-types 0.2.0.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: outerproduct-http-types
-Version: 0.2.0
+Version: 0.3.0
 Summary: Shared HTTP-facing type definitions for OuterProduct services and SDKs.
 Project-URL: Homepage, https://outerproduct.com
 Author: OuterProduct, Inc.

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "outerproduct-http-types"
-version = "0.2.0"
+version = "0.3.0"
 description = "Shared HTTP-facing type definitions for OuterProduct services and SDKs."
 readme = "README.md"
 requires-python = ">=3.12"

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/__init__.py RENAMED Viewed

@@ -47,8 +47,19 @@ from .inference import (
     ScenarioResponse,
     ScenarioResultItem,
 )
+from .patterns import (
+    FilterPatternSchema,
+    PatternTrackerApplyRequest,
+    PatternTrackerDistributionResponse,
+    PatternTrackerFitRequest,
+    PatternTrackerFitResponse,
+    PatternTrackerPartitionResponse,
+    PatternTrackerResponse,
+    PatternTrackerTransformResponse,
+    PredicateSchema,
+    SchemaInfoSchema,
+)
 from .reasoning import ReasoningFitRequest, ReasoningFitResponse
-from .segment import ClusterPersonaSchema, SegmentRequest, SegmentResultResponse
 from .trainer import (
     HardwareSpec,
     ModalHardwareSpec,
@@ -62,7 +73,6 @@ __version__ = "0.1.0"
 __all__ = [
     "AnswerType",
-    "ClusterPersonaSchema",
     "ConnectorResponse",
     "ConnectorType",
     "CreateConnectorRequest",
@@ -88,6 +98,16 @@ __all__ = [
     "ListTablesRequest",
     "ListTablesResponse",
     "ModalHardwareSpec",
+    "PatternTrackerApplyRequest",
+    "PatternTrackerDistributionResponse",
+    "PatternTrackerFitRequest",
+    "PatternTrackerFitResponse",
+    "PatternTrackerPartitionResponse",
+    "PatternTrackerResponse",
+    "PatternTrackerTransformResponse",
+    "PredicateSchema",
+    "FilterPatternSchema",
+    "SchemaInfoSchema",
     "PredictAndExplainRequest",
     "PredictAndExplainResponse",
     "PredictRequest",
@@ -102,8 +122,6 @@ __all__ = [
     "ScenarioResultItem",
     "Schema",
     "SchemaResultResponse",
-    "SegmentRequest",
-    "SegmentResultResponse",
     "StatusResponse",
     "TabularizeJobResponse",
     "TabularizeRequest",

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/common.py RENAMED Viewed

@@ -31,7 +31,9 @@ class StatusResponse(BaseModel):
     """Returned by GET /models/{model_id}/status."""
     model_id: str
-    job_type: str = Field(description=("One of: trainer_run, reasoning_fit, segment."))
+    job_type: str = Field(
+        description="One of: trainer_run, reasoning_fit, patterns_fit:<tracker_id>."
+    )
     status: JobStatus
     progress: dict[str, Any] | None = Field(
         None, description='Progress info, e.g. {"step": 3, "total_steps": 5}'

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/inference.py RENAMED Viewed

@@ -2,7 +2,49 @@
 from typing import Any, Literal
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+# --- Request size caps ---------------------------------------------------- #
+# Tune in review. Defined as module-level constants so callers and tests can
+# reference them, and so the team can adjust without hunting for the values.
+MAX_PREDICT_ROWS = 100_000
+"""Hard cap on `samples` length for /predict requests."""
+MAX_EXPLAIN_ROWS = 10_000
+"""Hard cap on `samples` length for /explain and /predict_and_explain.
+Lower than /predict because explanation computes per-row feature
+attributions, which are roughly an order of magnitude more expensive
+than prediction.
+Note: real per-call cost also depends on training-time choices made
+when the model was fit (cached sample counts, etc.). This row cap is
+tight only when training stayed near defaults; models trained with
+large internal caches can make explain expensive even within this
+budget."""
+MAX_SCENARIO_QUERIES = 100
+"""Hard cap on `queries` length for /scenario.
+Each query runs `n_walks` random walks of up to `max_steps` steps; total
+work scales as n_queries x n_walks x max_steps."""
+MAX_SCENARIO_WALK_BUDGET = 50_000
+"""Hard cap on `n_walks x max_steps` per request.
+Bounds the per-query inner work regardless of how the user splits the
+budget between width (more walks) and depth (more steps)."""
+# TODO(reviewers): consider adding MAX_SCENARIO_TOTAL_EXPLAINS to bound the
+# aggregate explain work across a scenario request. Each walk step calls
+# explain() on currently-active walks; worst-case totals (100 queries ×
+# 500 walks × ~30 steps) can run for minutes of wall-clock even though
+# each individual explain is small. A cap like
+# MAX_SCENARIO_QUERIES * default n_walks would prevent the worst pile-ups
+# without affecting typical usage. Not enforced today — wait until we see
+# real timeouts before adding it.
 # --- POST /v1/models/{model_id}/predict ---
@@ -19,6 +61,16 @@ class PredictRequest(BaseModel):
         "schema in name and order."
     )
+    @model_validator(mode="after")
+    def _check_sample_count(self):
+        if len(self.samples) > MAX_PREDICT_ROWS:
+            raise ValueError(
+                f"samples has {len(self.samples)} rows; the per-request cap is "
+                f"{MAX_PREDICT_ROWS}. Batch the request or use a connector-backed "
+                "workflow."
+            )
+        return self
 class PredictResponse(BaseModel):
     model_id: str
@@ -40,6 +92,15 @@ class ExplainRequest(BaseModel):
         "schema in name and order."
     )
+    @model_validator(mode="after")
+    def _check_sample_count(self):
+        if len(self.samples) > MAX_EXPLAIN_ROWS:
+            raise ValueError(
+                f"samples has {len(self.samples)} rows; the per-request cap for "
+                f"/explain is {MAX_EXPLAIN_ROWS}."
+            )
+        return self
 class ExplainResponse(BaseModel):
     """Batch-shaped explanation arrays. Dimension 0 is the batch.
@@ -88,6 +149,15 @@ class PredictAndExplainRequest(BaseModel):
         ),
     )
+    @model_validator(mode="after")
+    def _check_sample_count(self):
+        if len(self.samples) > MAX_EXPLAIN_ROWS:
+            raise ValueError(
+                f"samples has {len(self.samples)} rows; the per-request cap for "
+                f"/predict_and_explain is {MAX_EXPLAIN_ROWS}."
+            )
+        return self
 class PredictAndExplainResponse(BaseModel):
     """Batch-shaped predict + explain arrays. Dimension 0 is the batch."""
@@ -155,6 +225,27 @@ class FeatureConstraintSchema(BaseModel):
     value_range: tuple[float | None, float | None] | None = None
     allowed_values: list[Any] | None = None
+    @model_validator(mode="after")
+    def _check_invariants(self):
+        if self.immutable and (
+            self.monotonic is not None
+            or self.value_range is not None
+            or self.allowed_values is not None
+        ):
+            raise ValueError(
+                "immutable=True cannot be combined with monotonic, value_range, "
+                "or allowed_values."
+            )
+        if self.value_range is not None:
+            lo, hi = self.value_range
+            if lo is not None and hi is not None and lo > hi:
+                raise ValueError(
+                    f"value_range lower bound {lo} exceeds upper bound {hi}."
+                )
+        if self.allowed_values is not None and len(self.allowed_values) == 0:
+            raise ValueError("allowed_values must be non-empty when provided.")
+        return self
 class ScenarioRequest(BaseModel):
     """POST /v1/models/{model_id}/scenario -- Counterfactual search with constraints."""
@@ -168,12 +259,27 @@ class ScenarioRequest(BaseModel):
         "schema in name and order."
     )
     desired_class: int = 1
-    n_walks: int = 500
-    max_steps: int = 30
-    epsilon: float = 0.2
+    n_walks: int = Field(500, ge=1)
+    max_steps: int = Field(30, ge=1)
+    epsilon: float = Field(0.2, gt=0.0, le=1.0)
     random_state: int | None = 42
     constraints: dict[str, FeatureConstraintSchema] = Field(default_factory=dict)
+    @model_validator(mode="after")
+    def _check_request_budget(self):
+        if len(self.queries) > MAX_SCENARIO_QUERIES:
+            raise ValueError(
+                f"queries has {len(self.queries)} entries; the per-request cap is "
+                f"{MAX_SCENARIO_QUERIES}."
+            )
+        budget = self.n_walks * self.max_steps
+        if budget > MAX_SCENARIO_WALK_BUDGET:
+            raise ValueError(
+                f"n_walks × max_steps = {budget} exceeds the per-query cap of "
+                f"{MAX_SCENARIO_WALK_BUDGET}. Reduce n_walks or max_steps."
+            )
+        return self
 class ScenarioChange(BaseModel):
     """Single-feature diff between a query and one counterfactual row."""

outerproduct_http_types-0.3.0/src/outerproduct_http_types/patterns.py ADDED Viewed

@@ -0,0 +1,213 @@
+"""Request/response schemas for the /v1/models/{model_id}/patterns/* endpoints.
+PatternTracker aggregates per-sample local-rule explanations into a small set
+of executable, labeled filter patterns. Fit runs server-side; the SDK
+client holds an ``id`` and calls transform/distribution/partition by
+``tracker_id``.
+"""
+from typing import Any, Literal
+from pydantic import BaseModel, Field
+from .common import JobStatus
+class PredicateSchema(BaseModel):
+    """One literal of a conjunctive filter: ``feature <op> value``.
+    Mirrors ``outerproduct_reasoning.internal.local_rules.Predicate``: ``op``
+    is ``<=``/``>=`` for continuous features (numeric ``value``) or ``==`` for
+    categoricals (string/bool/int ``value``).
+    """
+    feature: str
+    op: Literal["<=", ">=", "=="]
+    value: float | int | str | bool
+class FilterPatternSchema(BaseModel):
+    """One executable conjunctive filter with coverage stats and a label."""
+    predicates: list[PredicateSchema]
+    label: str
+    support_rejects: float = Field(
+        description="Share of fit-time rejected rows the pattern matches."
+    )
+    precision: float = Field(
+        description="Share of all matched rows that are rejects."
+    )
+    lift: float = Field(description="precision / base_reject_rate.")
+    n_local_rules_covered: int
+class SchemaInfoSchema(BaseModel):
+    """Column / dtype / categorical-level snapshot captured at fit time."""
+    columns: list[str]
+    dtypes: dict[str, str]
+    # frozensets don't survive JSON; the server emits sorted lists, the SDK
+    # rehydrates into frozensets on its side.
+    categorical_levels: dict[str, list[Any]]
+# --- POST /v1/models/{model_id}/patterns/fit ---
+class PatternTrackerFitRequest(BaseModel):
+    """POST /v1/models/{model_id}/patterns/fit -- Submit a pattern-tracker fit job.
+    Carries the fit hyperparameters plus a dataset reference that mirrors
+    :class:`ReasoningFitRequest`. The dataset is resolved on the API server
+    (inline parquet upload / pre-uploaded passthrough / connector validation)
+    before the heavy compute is dispatched to a Modal CPU function; no inline
+    data is held in memory on the API tier.
+    Exactly one of the three dataset modes must be set:
+    - **Connector-backed** (``data_connector=True``): ``connector_id`` +
+      ``table_name`` reference a registered connector. The Modal worker
+      materializes the table at compute time.
+    - **Pre-uploaded** (``data_uploaded=True``): ``data_model_id`` is the
+      upload-scope pointer returned by ``POST /v1/uploads`` (independent
+      of the route's owning ``model_id``).
+    - **Inline** (default): ``data`` and ``feature_names`` carry a small
+      tabular payload directly in the request body.
+    """
+    target_range: tuple[float | None, float | None] = Field(
+        description=(
+            "(lo, hi) bounds defining the 'rejected' band of predictions. "
+            "Either side may be null for an open bound; at least one must be "
+            "set."
+        )
+    )
+    mode: Literal["cover", "discovery"] = "discovery"
+    max_patterns: int = 25
+    coverage_target: float = 0.95
+    min_pattern_support: float = 0.005
+    min_precision: float = 0.5
+    max_pattern_size: int = 3
+    threshold_n_bins: int = 10
+    ensure_coverage: bool = True
+    min_wracc: float = 0.0
+    diversity_threshold: float = 0.5
+    drop_redundant: bool = True
+    child_overlap_threshold: float = 0.9
+    explained_lift_threshold: float = 1.1
+    rule_kwargs: dict[str, Any] | None = None
+    # --- Dataset routing (mirrors ReasoningFitRequest) ----------------------
+    data_connector: bool = Field(
+        default=False,
+        description="If True, fit over a connector-backed table.",
+    )
+    connector_id: str | None = None
+    table_name: str | None = None
+    data_uploaded: bool = Field(
+        default=False,
+        description=(
+            "If True, fit over a dataset uploaded via POST /v1/uploads. "
+            "``data_model_id`` is the upload-scope pointer."
+        ),
+    )
+    data_model_id: str | None = Field(
+        default=None,
+        description=(
+            "Upload-scope model_id pointer for pre-uploaded datasets; "
+            "named distinctly to avoid colliding with the route's "
+            "owning ``model_id``."
+        ),
+    )
+    data: list[list[float | str | bool | None]] | None = Field(
+        default=None,
+        description=(
+            "Inline 2D feature matrix for small datasets. The API "
+            "server writes this to S3 as parquet before dispatching the "
+            "Modal compute."
+        ),
+    )
+    feature_names: list[str] | None = Field(
+        default=None,
+        description="Column names aligned to ``data``. Required when ``data`` is set.",
+    )
+    label_column: str | None = Field(
+        default=None,
+        description=(
+            "Optional label column name to drop from the dataset before "
+            "fitting. Only meaningful for pre-uploaded and connector-backed "
+            "datasets."
+        ),
+    )
+class PatternTrackerFitResponse(BaseModel):
+    """Returned immediately by POST /patterns/fit; the actual artifact is
+    retrieved via GET /patterns/{tracker_id} once the job completes."""
+    model_id: str
+    tracker_id: str
+    status: JobStatus
+    message: str | None = None
+# --- GET /v1/models/{model_id}/patterns/{tracker_id} ---
+class PatternTrackerResponse(BaseModel):
+    """The fitted pattern tracker. Optional fields are None while the job is
+    pending/running/failed; populated once the job completes successfully.
+    """
+    model_id: str
+    tracker_id: str
+    status: JobStatus
+    patterns: list[FilterPatternSchema] | None = None
+    schema_info: SchemaInfoSchema | None = None
+    target_range: tuple[float | None, float | None] | None = None
+    n_rejected_fit: int | None = None
+    coverage_fit: float | None = None
+    error_message: str | None = None
+# --- POST /v1/models/{model_id}/patterns/{tracker_id}/transform ---
+class PatternTrackerApplyRequest(BaseModel):
+    """Body shared by /transform, /distribution, and /partition."""
+    samples: list[list[float | str | bool | None]] = Field(
+        description=(
+            "2D array, shape (n_samples, n_features). Cells may be numeric, "
+            "string (categorical), or bool."
+        )
+    )
+    feature_names: list[str] = Field(
+        description=(
+            "Column names. Must include every column the tracker's frozen "
+            "schema requires (extras are ignored)."
+        )
+    )
+class PatternTrackerTransformResponse(BaseModel):
+    """Boolean match matrix aligned to ``labels``."""
+    labels: list[str]
+    matrix: list[list[bool]] = Field(
+        description="Shape (n_samples, n_patterns); cell `[i, j]` true iff "
+        "sample i matches pattern labels[j]."
+    )
+class PatternTrackerDistributionResponse(BaseModel):
+    """Per-pattern match rate over the supplied samples."""
+    match_rate: dict[str, float]
+class PatternTrackerPartitionResponse(BaseModel):
+    """Matching row indices (positional, into the request's `samples`) keyed
+    by pattern label."""
+    indices: dict[str, list[int]]

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/reasoning.py RENAMED Viewed

@@ -14,21 +14,35 @@ from pydantic import BaseModel, Field, model_validator
 from .common import JobResponse
 from .trainer import HardwareSpec
+# --- Compute caps --------------------------------------------------------- #
+MAX_REASONING_HYPEROPT_STEPS = 200
+"""Hard cap on `n_hyperopt_steps`.
+reasoning.fit pins the surrogate to a single model type, so the budget
+is just n_hyperopt_steps. Each step is a Modal trial."""
 class ReasoningFitRequest(BaseModel):
     """POST /v1/reasoning/fit -- Fit a ReasoningModel.
-    Dataset delivery mirrors the other training endpoints:
-      * Inline: set `data` and `labels`. `label_column` is ignored.
-      * Pre-uploaded: set `data_uploaded=True`, supply `model_id` and
-        `label_column`. When `teacher_predict_url` is set, labels become
-        evaluation-only (the teacher provides the training target).
+    Dataset delivery has exactly one of three modes:
+      * Inline: set `data` and `labels`. The server creates a transient
+        dataset row and uploads the data to the dataset's canonical S3
+        location before training begins.
+      * Pre-uploaded: set `dataset_id` to the id returned by
+        ``POST /v1/uploads``, plus `label_column`.
+      * Connector: set `data_connector=True`, `connector_id`, and
+        `table_name`.
+    When `teacher_predict_url` is set, labels become evaluation-only (the
+    teacher provides the training target).
     """
     # --- dataset
     data: list[list[float | str | bool | None]] | None = Field(
         None,
-        description="2D feature matrix (n_samples, n_features). Omit when data_uploaded=True.",
+        description="2D feature matrix (n_samples, n_features). Omit when dataset_id is set.",
     )
     labels: list[float] | None = Field(
         None,
@@ -44,24 +58,22 @@ class ReasoningFitRequest(BaseModel):
         None,
         description="Optional per-column schema for inline data.",
     )
-    data_uploaded: bool = Field(
-        False,
-        description="If true, read the dataset already at "
-        "traces/{org_id}/{model_id}/training_data.{pkl|csv|parquet}.",
+    dataset_id: str | None = Field(
+        None,
+        description="Identifier of a previously uploaded dataset (from "
+        "``POST /v1/uploads``). Mutually exclusive with inline data and "
+        "data_connector.",
     )
     label_column: str | None = Field(
         None,
-        description="Target column name in the uploaded table.",
-    )
-    model_id: str | None = Field(
-        None, description="Custom model ID; required when data_uploaded=True."
+        description="Target column name in the dataset.",
     )
     # --- connector-based data source
     data_connector: bool = Field(
         False,
         description="If true, read the dataset from a registered connector. "
-        "Mutually exclusive with inline data and data_uploaded.",
+        "Mutually exclusive with inline data and dataset_id.",
     )
     connector_id: str | None = Field(
         None,
@@ -78,7 +90,7 @@ class ReasoningFitRequest(BaseModel):
         description="Candidate model-family identifiers. reasoning.fit pins the "
         "surrogate via force_model_type, so at most one entry is accepted.",
     )
-    n_hyperopt_steps: int = 5
+    n_hyperopt_steps: int = Field(5, ge=1, le=MAX_REASONING_HYPEROPT_STEPS)
     device: str | None = Field(None, description="'auto' | 'cuda' | 'cpu'.")
     random_state: int = 42
     task_type: Literal["regression", "binclass", "multiclass"] | None = Field(
@@ -113,11 +125,13 @@ class ReasoningFitRequest(BaseModel):
     @model_validator(mode="after")
     def _check_dataset_source(self):
-        # Exactly one of three modes: inline, uploaded, or connector.
-        if self.data_connector and self.data_uploaded:
-            raise ValueError("data_connector and data_uploaded are mutually exclusive")
+        # Exactly one of three modes: inline, dataset_id, or connector.
+        if self.data_connector and self.dataset_id is not None:
+            raise ValueError("data_connector and dataset_id are mutually exclusive")
         if self.data_connector and self.data is not None:
             raise ValueError("data_connector and inline data are mutually exclusive")
+        if self.dataset_id is not None and self.data is not None:
+            raise ValueError("dataset_id and inline data are mutually exclusive")
         if self.data_connector:
             if not self.connector_id:
@@ -129,21 +143,20 @@ class ReasoningFitRequest(BaseModel):
                     "label_column is required when data_connector=True (unless "
                     "teacher_predict_url is set)"
                 )
-        elif self.data_uploaded:
-            if not self.model_id:
-                raise ValueError("model_id is required when data_uploaded=True")
+        elif self.dataset_id is not None:
             if not self.label_column and not self.teacher_predict_url:
                 raise ValueError(
-                    "label_column is required when data_uploaded=True (unless "
+                    "label_column is required when dataset_id is set (unless "
                     "teacher_predict_url is set)"
                 )
         else:
             if self.data is None:
-                raise ValueError("data is required when data_uploaded is False")
+                raise ValueError(
+                    "data is required for inline mode (or supply dataset_id / data_connector=True)"
+                )
             if self.labels is None and self.teacher_predict_url is None:
                 raise ValueError(
-                    "labels is required when data_uploaded is False (unless "
-                    "teacher_predict_url is set)"
+                    "labels is required for inline mode (unless teacher_predict_url is set)"
                 )
         return self

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/trainer.py RENAMED Viewed

@@ -12,6 +12,14 @@ from pydantic import BaseModel, Field, model_validator
 from .common import JobResponse
+# --- Compute caps --------------------------------------------------------- #
+MAX_HYPEROPT_BUDGET = 200
+"""Hard cap on `n_trials × max(1, len(model_types))`.
+Bounds total HPO trials per training request. Each trial spawns a Modal
+container, so this is the primary cost gate."""
 class ModalHardwareSpec(BaseModel):
     """Fan trials out to additional Modal containers.
@@ -36,20 +44,27 @@ HardwareSpec = ModalHardwareSpec
 class TrainerRunRequest(BaseModel):
     """POST /v1/trainer/run -- Configure a Trainer and run HPO across a model matrix.
-    Dataset delivery mirrors the other training endpoints:
-      * Inline: set `data` and `labels`. `label_column` is ignored.
-      * Pre-uploaded: set `data_uploaded=True`, supply `model_id` and
-        `label_column` (the column in the uploaded table that holds the target).
+    Dataset delivery has exactly one of three modes:
+      * Inline: set `data` and `labels`. The server creates a transient
+        dataset row and uploads the data to the dataset's canonical S3
+        location before training begins.
+      * Pre-uploaded: set `dataset_id` to the id returned by
+        ``POST /v1/uploads``, plus `label_column` (the column in the uploaded
+        table that holds the target).
+      * Connector: set `data_connector=True`, `connector_id`, and
+        `table_name`.
+    The produced model id is returned in the response.
     """
     # --- dataset
     data: list[list[float | str | bool | None]] | None = Field(
         None,
-        description="2D feature matrix (n_samples, n_features). Omit when data_uploaded=True.",
+        description="2D feature matrix (n_samples, n_features). Omit when dataset_id is set.",
     )
     labels: list[float] | None = Field(
         None,
-        description="Target values, length n_samples. Omit when data_uploaded=True.",
+        description="Target values, length n_samples. Omit when dataset_id is set.",
     )
     feature_names: list[str] | None = Field(
         None,
@@ -61,24 +76,23 @@ class TrainerRunRequest(BaseModel):
         description="Optional per-column schema for inline data: "
         "{name: {dtype: 'float' | 'int' | 'bool' | 'categorical'}}.",
     )
-    data_uploaded: bool = Field(
-        False,
-        description="If true, read the dataset already at "
-        "traces/{org_id}/{model_id}/training_data.{pkl|csv|parquet}.",
+    dataset_id: str | None = Field(
+        None,
+        description="Identifier of a previously uploaded dataset (from "
+        "``POST /v1/uploads``). Mutually exclusive with inline data and "
+        "data_connector.",
     )
     label_column: str | None = Field(
         None,
-        description="Target column name in the uploaded table. Required when data_uploaded=True.",
-    )
-    model_id: str | None = Field(
-        None, description="Custom model ID; required when data_uploaded=True."
+        description="Target column name in the dataset. Required when dataset_id "
+        "is set (unless teacher_predict_url is also set).",
     )
     # --- connector-based data source
     data_connector: bool = Field(
         False,
         description="If true, read the dataset from a registered connector. "
-        "Mutually exclusive with inline data and data_uploaded.",
+        "Mutually exclusive with inline data and dataset_id.",
     )
     connector_id: str | None = Field(
         None,
@@ -106,7 +120,7 @@ class TrainerRunRequest(BaseModel):
         "random",
         description="HPO strategy: 'random' or 'optuna'. Resolved server-side.",
     )
-    n_trials: int = Field(4, description="Number of HPO trials per matrix row.")
+    n_trials: int = Field(4, ge=1, description="Number of HPO trials per matrix row.")
     n_splits: int | None = Field(
         None,
         description="K-fold cross-validation folds. None means a single holdout split.",
@@ -149,11 +163,13 @@ class TrainerRunRequest(BaseModel):
     @model_validator(mode="after")
     def _check_dataset_source(self):
-        # Exactly one of three modes: inline, uploaded, or connector.
-        if self.data_connector and self.data_uploaded:
-            raise ValueError("data_connector and data_uploaded are mutually exclusive")
+        # Exactly one of three modes: inline, dataset_id, or connector.
+        if self.data_connector and self.dataset_id is not None:
+            raise ValueError("data_connector and dataset_id are mutually exclusive")
         if self.data_connector and self.data is not None:
             raise ValueError("data_connector and inline data are mutually exclusive")
+        if self.dataset_id is not None and self.data is not None:
+            raise ValueError("dataset_id and inline data are mutually exclusive")
         if self.data_connector:
             if not self.connector_id:
@@ -165,22 +181,21 @@ class TrainerRunRequest(BaseModel):
                     "label_column is required when data_connector=True (unless "
                     "teacher_predict_url is set)"
                 )
-        elif self.data_uploaded:
-            if not self.model_id:
-                raise ValueError("model_id is required when data_uploaded=True")
+        elif self.dataset_id is not None:
             if not self.label_column and not self.teacher_predict_url:
                 raise ValueError(
-                    "label_column is required when data_uploaded=True (unless "
+                    "label_column is required when dataset_id is set (unless "
                     "teacher_predict_url is set, in which case the teacher provides "
                     "the training target)"
                 )
         else:
             if self.data is None:
-                raise ValueError("data is required when data_uploaded is False")
+                raise ValueError(
+                    "data is required for inline mode (or supply dataset_id / data_connector=True)"
+                )
             if self.labels is None and self.teacher_predict_url is None:
                 raise ValueError(
-                    "labels is required when data_uploaded is False (unless "
-                    "teacher_predict_url is set)"
+                    "labels is required for inline mode (unless teacher_predict_url is set)"
                 )
         return self
@@ -192,6 +207,22 @@ class TrainerRunRequest(BaseModel):
                 raise ValueError("feature_names is required when data contains non-numeric values")
         return self
+    @model_validator(mode="after")
+    def _check_hyperopt_budget(self):
+        # When model_types is None the server picks a default set, so the
+        # client-visible budget is just n_trials. We cap that as a lower
+        # bound on the real budget; the server should re-check after
+        # resolving the default model list.
+        n_models = max(1, len(self.model_types)) if self.model_types else 1
+        budget = self.n_trials * n_models
+        if budget > MAX_HYPEROPT_BUDGET:
+            raise ValueError(
+                f"n_trials x len(model_types) = {budget} exceeds the per-request "
+                f"cap of {MAX_HYPEROPT_BUDGET}. Reduce n_trials or shrink "
+                "model_types."
+            )
+        return self
 class TrainerRunResponse(JobResponse):
     """POST /v1/trainer/run -- async trainer job submission response."""

outerproduct_http_types-0.3.0/src/outerproduct_http_types/uploads.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""Request/response schemas for the presigned-upload endpoint."""
+from typing import Literal
+from pydantic import BaseModel, Field
+# --- Upload size policy --------------------------------------------------- #
+# Tunable in review. These constants are the single source of truth for the
+# SDK's client-side guards. Server-side enforcement of MAX_UPLOAD_BYTES will
+# be wired when the upload flow switches from presigned PUT to presigned POST
+# (where S3's `content-length-range` condition becomes available).
+MAX_UPLOAD_BYTES = 10_000_000_000
+"""Hard cap on uploaded file size, all formats.
+Picked to comfortably hold 1M x 1k Parquet workloads with headroom for
+~10x growth. Enforced today by the SDK before upload starts. When the
+presigned POST switch lands, S3 will enforce server-side via
+`content-length-range`."""
+MAX_CSV_UPLOAD_BYTES = 3_000_000_000
+"""Hard cap on CSV uploads specifically.
+CSV is ~10-20x larger than the same data as Parquet, so a CSV near
+MAX_UPLOAD_BYTES would almost always be a misformatted workload. Reject
+early with a clear message rather than burn upload bandwidth on a file
+that should have been Parquet."""
+CSV_UPLOAD_WARN_BYTES = 500_000_000
+"""SDK emits a UserWarning above this CSV size, suggesting Parquet.
+Soft signal — at this scale Parquet would be ~50-100 MB for the same
+data, and subsequent training reads are also faster."""
+class CreateUploadRequest(BaseModel):
+    """POST /v1/uploads -- request a presigned URL for direct-to-S3 upload."""
+    file_format: Literal["pkl", "csv", "parquet"] = Field(
+        ...,
+        description=(
+            "Format of the dataset you will PUT to the returned URL. "
+            "'pkl' = a pickled pandas DataFrame, 'csv' = RFC4180 CSV with a "
+            "header row, 'parquet' = Apache Parquet. The label column must be "
+            "present in the uploaded table and its name is supplied on the "
+            "subsequent /v1/trainer/run or /v1/reasoning/fit call as "
+            "`label_column`."
+        ),
+    )
+class CreateUploadResponse(BaseModel):
+    dataset_id: str
+    upload_url: str
+    upload_key: str
+    file_format: Literal["pkl", "csv", "parquet"]
+    content_type: str
+    expires_in: int

outerproduct_http_types-0.2.0/src/outerproduct_http_types/segment.py DELETED Viewed

@@ -1,59 +0,0 @@
-"""Request/response schemas for segmentation endpoints."""
-from typing import Any
-from pydantic import BaseModel, ConfigDict, Field
-class ClusterPersonaSchema(BaseModel):
-    """One cluster's persona description as exposed over HTTP.
-    Mirrors the shape written into ``segments.json`` by the segment Lambda;
-    the API repo aliases that S3 wire type onto this HTTP type at the
-    response boundary.
-    """
-    model_config = ConfigDict(from_attributes=True)
-    cluster_id: int
-    persona_name: str
-    persona_description: str
-    stats: dict[str, Any]
-    differentiating_features: list[dict[str, Any]] | None = None
-class SegmentRequest(BaseModel):
-    """POST /v1/models/{model_id}/segment -- Supervised segmentation (async)."""
-    data: list[list[float | str | bool | None]] | None = Field(
-        None,
-        description="Dataset to segment; uses training data if omitted. "
-        "Cells may be numeric, string (categorical), or bool.",
-    )
-    target_values: list[float] | None = None
-    feature_names: list[str] | None = None
-    min_clusters: int = 4
-    max_clusters: int | None = 10
-    n_search_steps: int = 50
-    use_agent: bool | None = None
-    kpi_field: str | None = None
-    problem_context: str | None = None
-class SegmentResultResponse(BaseModel):
-    """GET /v1/models/{model_id}/segments -- Retrieve segmentation results.
-    Result fields are Optional because pending/running/failed jobs return
-    only model_id + status; populated only once the Lambda has uploaded
-    segments.json and the SegmentsResult is available.
-    """
-    model_id: str
-    status: str
-    n_clusters: int | None = None
-    cluster_ids: list[int] | None = None
-    resolution: float | None = None
-    quality: float | None = None
-    personas: list[ClusterPersonaSchema] | None = None
-    agent_score: float | None = None
-    agent_reasoning: str | None = None

outerproduct_http_types-0.2.0/src/outerproduct_http_types/uploads.py DELETED Viewed

@@ -1,34 +0,0 @@
-"""Request/response schemas for the presigned-upload endpoint."""
-from typing import Literal
-from pydantic import BaseModel, Field
-class CreateUploadRequest(BaseModel):
-    """POST /v1/uploads -- request a presigned URL for direct-to-S3 upload."""
-    model_id: str | None = Field(
-        None,
-        description="Custom model ID; auto-generated if omitted.",
-    )
-    file_format: Literal["pkl", "csv", "parquet"] = Field(
-        ...,
-        description=(
-            "Format of the dataset you will PUT to the returned URL. "
-            "'pkl' = a pickled pandas DataFrame, 'csv' = RFC4180 CSV with a "
-            "header row, 'parquet' = Apache Parquet. The label column must be "
-            "present in the uploaded table and its name is supplied on the "
-            "subsequent /v1/trainer/run or /v1/reasoning/fit call as "
-            "`label_column`."
-        ),
-    )
-class CreateUploadResponse(BaseModel):
-    model_id: str
-    upload_url: str
-    upload_key: str
-    file_format: Literal["pkl", "csv", "parquet"]
-    content_type: str
-    expires_in: int

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/.gitignore RENAMED Viewed

File without changes

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/LICENSE RENAMED Viewed

File without changes

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/README.md RENAMED Viewed

File without changes

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/agentic_documents.py RENAMED Viewed

File without changes

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/connectors.py RENAMED Viewed

File without changes

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/py.typed RENAMED Viewed

File without changes

{outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/workers.py RENAMED Viewed

File without changes

outerproduct-http-types 0.2.0__tar.gz → 0.3.0__tar.gz

outerproduct-http-types 0.2.0tar.gz → 0.3.0tar.gz