outerproduct-http-types 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/PKG-INFO +1 -1
  2. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/pyproject.toml +1 -1
  3. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/__init__.py +22 -4
  4. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/common.py +3 -1
  5. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/inference.py +110 -4
  6. outerproduct_http_types-0.3.0/src/outerproduct_http_types/patterns.py +213 -0
  7. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/reasoning.py +39 -26
  8. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/trainer.py +57 -26
  9. outerproduct_http_types-0.3.0/src/outerproduct_http_types/uploads.py +58 -0
  10. outerproduct_http_types-0.2.0/src/outerproduct_http_types/segment.py +0 -59
  11. outerproduct_http_types-0.2.0/src/outerproduct_http_types/uploads.py +0 -34
  12. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/.gitignore +0 -0
  13. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/LICENSE +0 -0
  14. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/README.md +0 -0
  15. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/agentic_documents.py +0 -0
  16. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/connectors.py +0 -0
  17. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/py.typed +0 -0
  18. {outerproduct_http_types-0.2.0 → outerproduct_http_types-0.3.0}/src/outerproduct_http_types/workers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: outerproduct-http-types
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Shared HTTP-facing type definitions for OuterProduct services and SDKs.
5
5
  Project-URL: Homepage, https://outerproduct.com
6
6
  Author: OuterProduct, Inc.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "outerproduct-http-types"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  description = "Shared HTTP-facing type definitions for OuterProduct services and SDKs."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -47,8 +47,19 @@ from .inference import (
47
47
  ScenarioResponse,
48
48
  ScenarioResultItem,
49
49
  )
50
+ from .patterns import (
51
+ FilterPatternSchema,
52
+ PatternTrackerApplyRequest,
53
+ PatternTrackerDistributionResponse,
54
+ PatternTrackerFitRequest,
55
+ PatternTrackerFitResponse,
56
+ PatternTrackerPartitionResponse,
57
+ PatternTrackerResponse,
58
+ PatternTrackerTransformResponse,
59
+ PredicateSchema,
60
+ SchemaInfoSchema,
61
+ )
50
62
  from .reasoning import ReasoningFitRequest, ReasoningFitResponse
51
- from .segment import ClusterPersonaSchema, SegmentRequest, SegmentResultResponse
52
63
  from .trainer import (
53
64
  HardwareSpec,
54
65
  ModalHardwareSpec,
@@ -62,7 +73,6 @@ __version__ = "0.1.0"
62
73
 
63
74
  __all__ = [
64
75
  "AnswerType",
65
- "ClusterPersonaSchema",
66
76
  "ConnectorResponse",
67
77
  "ConnectorType",
68
78
  "CreateConnectorRequest",
@@ -88,6 +98,16 @@ __all__ = [
88
98
  "ListTablesRequest",
89
99
  "ListTablesResponse",
90
100
  "ModalHardwareSpec",
101
+ "PatternTrackerApplyRequest",
102
+ "PatternTrackerDistributionResponse",
103
+ "PatternTrackerFitRequest",
104
+ "PatternTrackerFitResponse",
105
+ "PatternTrackerPartitionResponse",
106
+ "PatternTrackerResponse",
107
+ "PatternTrackerTransformResponse",
108
+ "PredicateSchema",
109
+ "FilterPatternSchema",
110
+ "SchemaInfoSchema",
91
111
  "PredictAndExplainRequest",
92
112
  "PredictAndExplainResponse",
93
113
  "PredictRequest",
@@ -102,8 +122,6 @@ __all__ = [
102
122
  "ScenarioResultItem",
103
123
  "Schema",
104
124
  "SchemaResultResponse",
105
- "SegmentRequest",
106
- "SegmentResultResponse",
107
125
  "StatusResponse",
108
126
  "TabularizeJobResponse",
109
127
  "TabularizeRequest",
@@ -31,7 +31,9 @@ class StatusResponse(BaseModel):
31
31
  """Returned by GET /models/{model_id}/status."""
32
32
 
33
33
  model_id: str
34
- job_type: str = Field(description=("One of: trainer_run, reasoning_fit, segment."))
34
+ job_type: str = Field(
35
+ description="One of: trainer_run, reasoning_fit, patterns_fit:<tracker_id>."
36
+ )
35
37
  status: JobStatus
36
38
  progress: dict[str, Any] | None = Field(
37
39
  None, description='Progress info, e.g. {"step": 3, "total_steps": 5}'
@@ -2,7 +2,49 @@
2
2
 
3
3
  from typing import Any, Literal
4
4
 
5
- from pydantic import BaseModel, ConfigDict, Field
5
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
6
+
7
+ # --- Request size caps ---------------------------------------------------- #
8
+ # Tune in review. Defined as module-level constants so callers and tests can
9
+ # reference them, and so the team can adjust without hunting for the values.
10
+
11
+ MAX_PREDICT_ROWS = 100_000
12
+ """Hard cap on `samples` length for /predict requests."""
13
+
14
+ MAX_EXPLAIN_ROWS = 10_000
15
+ """Hard cap on `samples` length for /explain and /predict_and_explain.
16
+
17
+ Lower than /predict because explanation computes per-row feature
18
+ attributions, which are roughly an order of magnitude more expensive
19
+ than prediction.
20
+
21
+ Note: real per-call cost also depends on training-time choices made
22
+ when the model was fit (cached sample counts, etc.). This row cap is
23
+ tight only when training stayed near defaults; models trained with
24
+ large internal caches can make explain expensive even within this
25
+ budget."""
26
+
27
+ MAX_SCENARIO_QUERIES = 100
28
+ """Hard cap on `queries` length for /scenario.
29
+
30
+ Each query runs `n_walks` random walks of up to `max_steps` steps; total
31
+ work scales as n_queries x n_walks x max_steps."""
32
+
33
+ MAX_SCENARIO_WALK_BUDGET = 50_000
34
+ """Hard cap on `n_walks x max_steps` per request.
35
+
36
+ Bounds the per-query inner work regardless of how the user splits the
37
+ budget between width (more walks) and depth (more steps)."""
38
+
39
+ # TODO(reviewers): consider adding MAX_SCENARIO_TOTAL_EXPLAINS to bound the
40
+ # aggregate explain work across a scenario request. Each walk step calls
41
+ # explain() on currently-active walks; worst-case totals (100 queries ×
42
+ # 500 walks × ~30 steps) can run for minutes of wall-clock even though
43
+ # each individual explain is small. A cap like
44
+ # MAX_SCENARIO_QUERIES * default n_walks would prevent the worst pile-ups
45
+ # without affecting typical usage. Not enforced today — wait until we see
46
+ # real timeouts before adding it.
47
+
6
48
 
7
49
  # --- POST /v1/models/{model_id}/predict ---
8
50
 
@@ -19,6 +61,16 @@ class PredictRequest(BaseModel):
19
61
  "schema in name and order."
20
62
  )
21
63
 
64
+ @model_validator(mode="after")
65
+ def _check_sample_count(self):
66
+ if len(self.samples) > MAX_PREDICT_ROWS:
67
+ raise ValueError(
68
+ f"samples has {len(self.samples)} rows; the per-request cap is "
69
+ f"{MAX_PREDICT_ROWS}. Batch the request or use a connector-backed "
70
+ "workflow."
71
+ )
72
+ return self
73
+
22
74
 
23
75
  class PredictResponse(BaseModel):
24
76
  model_id: str
@@ -40,6 +92,15 @@ class ExplainRequest(BaseModel):
40
92
  "schema in name and order."
41
93
  )
42
94
 
95
+ @model_validator(mode="after")
96
+ def _check_sample_count(self):
97
+ if len(self.samples) > MAX_EXPLAIN_ROWS:
98
+ raise ValueError(
99
+ f"samples has {len(self.samples)} rows; the per-request cap for "
100
+ f"/explain is {MAX_EXPLAIN_ROWS}."
101
+ )
102
+ return self
103
+
43
104
 
44
105
  class ExplainResponse(BaseModel):
45
106
  """Batch-shaped explanation arrays. Dimension 0 is the batch.
@@ -88,6 +149,15 @@ class PredictAndExplainRequest(BaseModel):
88
149
  ),
89
150
  )
90
151
 
152
+ @model_validator(mode="after")
153
+ def _check_sample_count(self):
154
+ if len(self.samples) > MAX_EXPLAIN_ROWS:
155
+ raise ValueError(
156
+ f"samples has {len(self.samples)} rows; the per-request cap for "
157
+ f"/predict_and_explain is {MAX_EXPLAIN_ROWS}."
158
+ )
159
+ return self
160
+
91
161
 
92
162
  class PredictAndExplainResponse(BaseModel):
93
163
  """Batch-shaped predict + explain arrays. Dimension 0 is the batch."""
@@ -155,6 +225,27 @@ class FeatureConstraintSchema(BaseModel):
155
225
  value_range: tuple[float | None, float | None] | None = None
156
226
  allowed_values: list[Any] | None = None
157
227
 
228
+ @model_validator(mode="after")
229
+ def _check_invariants(self):
230
+ if self.immutable and (
231
+ self.monotonic is not None
232
+ or self.value_range is not None
233
+ or self.allowed_values is not None
234
+ ):
235
+ raise ValueError(
236
+ "immutable=True cannot be combined with monotonic, value_range, "
237
+ "or allowed_values."
238
+ )
239
+ if self.value_range is not None:
240
+ lo, hi = self.value_range
241
+ if lo is not None and hi is not None and lo > hi:
242
+ raise ValueError(
243
+ f"value_range lower bound {lo} exceeds upper bound {hi}."
244
+ )
245
+ if self.allowed_values is not None and len(self.allowed_values) == 0:
246
+ raise ValueError("allowed_values must be non-empty when provided.")
247
+ return self
248
+
158
249
 
159
250
  class ScenarioRequest(BaseModel):
160
251
  """POST /v1/models/{model_id}/scenario -- Counterfactual search with constraints."""
@@ -168,12 +259,27 @@ class ScenarioRequest(BaseModel):
168
259
  "schema in name and order."
169
260
  )
170
261
  desired_class: int = 1
171
- n_walks: int = 500
172
- max_steps: int = 30
173
- epsilon: float = 0.2
262
+ n_walks: int = Field(500, ge=1)
263
+ max_steps: int = Field(30, ge=1)
264
+ epsilon: float = Field(0.2, gt=0.0, le=1.0)
174
265
  random_state: int | None = 42
175
266
  constraints: dict[str, FeatureConstraintSchema] = Field(default_factory=dict)
176
267
 
268
+ @model_validator(mode="after")
269
+ def _check_request_budget(self):
270
+ if len(self.queries) > MAX_SCENARIO_QUERIES:
271
+ raise ValueError(
272
+ f"queries has {len(self.queries)} entries; the per-request cap is "
273
+ f"{MAX_SCENARIO_QUERIES}."
274
+ )
275
+ budget = self.n_walks * self.max_steps
276
+ if budget > MAX_SCENARIO_WALK_BUDGET:
277
+ raise ValueError(
278
+ f"n_walks × max_steps = {budget} exceeds the per-query cap of "
279
+ f"{MAX_SCENARIO_WALK_BUDGET}. Reduce n_walks or max_steps."
280
+ )
281
+ return self
282
+
177
283
 
178
284
  class ScenarioChange(BaseModel):
179
285
  """Single-feature diff between a query and one counterfactual row."""
@@ -0,0 +1,213 @@
1
+ """Request/response schemas for the /v1/models/{model_id}/patterns/* endpoints.
2
+
3
+ PatternTracker aggregates per-sample local-rule explanations into a small set
4
+ of executable, labeled filter patterns. Fit runs server-side; the SDK
5
+ client holds an ``id`` and calls transform/distribution/partition by
6
+ ``tracker_id``.
7
+ """
8
+
9
+ from typing import Any, Literal
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ from .common import JobStatus
14
+
15
+
16
+ class PredicateSchema(BaseModel):
17
+ """One literal of a conjunctive filter: ``feature <op> value``.
18
+
19
+ Mirrors ``outerproduct_reasoning.internal.local_rules.Predicate``: ``op``
20
+ is ``<=``/``>=`` for continuous features (numeric ``value``) or ``==`` for
21
+ categoricals (string/bool/int ``value``).
22
+ """
23
+
24
+ feature: str
25
+ op: Literal["<=", ">=", "=="]
26
+ value: float | int | str | bool
27
+
28
+
29
+ class FilterPatternSchema(BaseModel):
30
+ """One executable conjunctive filter with coverage stats and a label."""
31
+
32
+ predicates: list[PredicateSchema]
33
+ label: str
34
+ support_rejects: float = Field(
35
+ description="Share of fit-time rejected rows the pattern matches."
36
+ )
37
+ precision: float = Field(
38
+ description="Share of all matched rows that are rejects."
39
+ )
40
+ lift: float = Field(description="precision / base_reject_rate.")
41
+ n_local_rules_covered: int
42
+
43
+
44
+ class SchemaInfoSchema(BaseModel):
45
+ """Column / dtype / categorical-level snapshot captured at fit time."""
46
+
47
+ columns: list[str]
48
+ dtypes: dict[str, str]
49
+ # frozensets don't survive JSON; the server emits sorted lists, the SDK
50
+ # rehydrates into frozensets on its side.
51
+ categorical_levels: dict[str, list[Any]]
52
+
53
+
54
+ # --- POST /v1/models/{model_id}/patterns/fit ---
55
+
56
+
57
+ class PatternTrackerFitRequest(BaseModel):
58
+ """POST /v1/models/{model_id}/patterns/fit -- Submit a pattern-tracker fit job.
59
+
60
+ Carries the fit hyperparameters plus a dataset reference that mirrors
61
+ :class:`ReasoningFitRequest`. The dataset is resolved on the API server
62
+ (inline parquet upload / pre-uploaded passthrough / connector validation)
63
+ before the heavy compute is dispatched to a Modal CPU function; no inline
64
+ data is held in memory on the API tier.
65
+
66
+ Exactly one of the three dataset modes must be set:
67
+
68
+ - **Connector-backed** (``data_connector=True``): ``connector_id`` +
69
+ ``table_name`` reference a registered connector. The Modal worker
70
+ materializes the table at compute time.
71
+ - **Pre-uploaded** (``data_uploaded=True``): ``data_model_id`` is the
72
+ upload-scope pointer returned by ``POST /v1/uploads`` (independent
73
+ of the route's owning ``model_id``).
74
+ - **Inline** (default): ``data`` and ``feature_names`` carry a small
75
+ tabular payload directly in the request body.
76
+ """
77
+
78
+ target_range: tuple[float | None, float | None] = Field(
79
+ description=(
80
+ "(lo, hi) bounds defining the 'rejected' band of predictions. "
81
+ "Either side may be null for an open bound; at least one must be "
82
+ "set."
83
+ )
84
+ )
85
+ mode: Literal["cover", "discovery"] = "discovery"
86
+ max_patterns: int = 25
87
+ coverage_target: float = 0.95
88
+ min_pattern_support: float = 0.005
89
+ min_precision: float = 0.5
90
+ max_pattern_size: int = 3
91
+ threshold_n_bins: int = 10
92
+ ensure_coverage: bool = True
93
+ min_wracc: float = 0.0
94
+ diversity_threshold: float = 0.5
95
+ drop_redundant: bool = True
96
+ child_overlap_threshold: float = 0.9
97
+ explained_lift_threshold: float = 1.1
98
+ rule_kwargs: dict[str, Any] | None = None
99
+
100
+ # --- Dataset routing (mirrors ReasoningFitRequest) ----------------------
101
+ data_connector: bool = Field(
102
+ default=False,
103
+ description="If True, fit over a connector-backed table.",
104
+ )
105
+ connector_id: str | None = None
106
+ table_name: str | None = None
107
+ data_uploaded: bool = Field(
108
+ default=False,
109
+ description=(
110
+ "If True, fit over a dataset uploaded via POST /v1/uploads. "
111
+ "``data_model_id`` is the upload-scope pointer."
112
+ ),
113
+ )
114
+ data_model_id: str | None = Field(
115
+ default=None,
116
+ description=(
117
+ "Upload-scope model_id pointer for pre-uploaded datasets; "
118
+ "named distinctly to avoid colliding with the route's "
119
+ "owning ``model_id``."
120
+ ),
121
+ )
122
+ data: list[list[float | str | bool | None]] | None = Field(
123
+ default=None,
124
+ description=(
125
+ "Inline 2D feature matrix for small datasets. The API "
126
+ "server writes this to S3 as parquet before dispatching the "
127
+ "Modal compute."
128
+ ),
129
+ )
130
+ feature_names: list[str] | None = Field(
131
+ default=None,
132
+ description="Column names aligned to ``data``. Required when ``data`` is set.",
133
+ )
134
+ label_column: str | None = Field(
135
+ default=None,
136
+ description=(
137
+ "Optional label column name to drop from the dataset before "
138
+ "fitting. Only meaningful for pre-uploaded and connector-backed "
139
+ "datasets."
140
+ ),
141
+ )
142
+
143
+
144
+ class PatternTrackerFitResponse(BaseModel):
145
+ """Returned immediately by POST /patterns/fit; the actual artifact is
146
+ retrieved via GET /patterns/{tracker_id} once the job completes."""
147
+
148
+ model_id: str
149
+ tracker_id: str
150
+ status: JobStatus
151
+ message: str | None = None
152
+
153
+
154
+ # --- GET /v1/models/{model_id}/patterns/{tracker_id} ---
155
+
156
+
157
+ class PatternTrackerResponse(BaseModel):
158
+ """The fitted pattern tracker. Optional fields are None while the job is
159
+ pending/running/failed; populated once the job completes successfully.
160
+ """
161
+
162
+ model_id: str
163
+ tracker_id: str
164
+ status: JobStatus
165
+ patterns: list[FilterPatternSchema] | None = None
166
+ schema_info: SchemaInfoSchema | None = None
167
+ target_range: tuple[float | None, float | None] | None = None
168
+ n_rejected_fit: int | None = None
169
+ coverage_fit: float | None = None
170
+ error_message: str | None = None
171
+
172
+
173
+ # --- POST /v1/models/{model_id}/patterns/{tracker_id}/transform ---
174
+
175
+
176
+ class PatternTrackerApplyRequest(BaseModel):
177
+ """Body shared by /transform, /distribution, and /partition."""
178
+
179
+ samples: list[list[float | str | bool | None]] = Field(
180
+ description=(
181
+ "2D array, shape (n_samples, n_features). Cells may be numeric, "
182
+ "string (categorical), or bool."
183
+ )
184
+ )
185
+ feature_names: list[str] = Field(
186
+ description=(
187
+ "Column names. Must include every column the tracker's frozen "
188
+ "schema requires (extras are ignored)."
189
+ )
190
+ )
191
+
192
+
193
+ class PatternTrackerTransformResponse(BaseModel):
194
+ """Boolean match matrix aligned to ``labels``."""
195
+
196
+ labels: list[str]
197
+ matrix: list[list[bool]] = Field(
198
+ description="Shape (n_samples, n_patterns); cell `[i, j]` true iff "
199
+ "sample i matches pattern labels[j]."
200
+ )
201
+
202
+
203
+ class PatternTrackerDistributionResponse(BaseModel):
204
+ """Per-pattern match rate over the supplied samples."""
205
+
206
+ match_rate: dict[str, float]
207
+
208
+
209
+ class PatternTrackerPartitionResponse(BaseModel):
210
+ """Matching row indices (positional, into the request's `samples`) keyed
211
+ by pattern label."""
212
+
213
+ indices: dict[str, list[int]]
@@ -14,21 +14,35 @@ from pydantic import BaseModel, Field, model_validator
14
14
  from .common import JobResponse
15
15
  from .trainer import HardwareSpec
16
16
 
17
+ # --- Compute caps --------------------------------------------------------- #
18
+
19
+ MAX_REASONING_HYPEROPT_STEPS = 200
20
+ """Hard cap on `n_hyperopt_steps`.
21
+
22
+ reasoning.fit pins the surrogate to a single model type, so the budget
23
+ is just n_hyperopt_steps. Each step is a Modal trial."""
24
+
17
25
 
18
26
  class ReasoningFitRequest(BaseModel):
19
27
  """POST /v1/reasoning/fit -- Fit a ReasoningModel.
20
28
 
21
- Dataset delivery mirrors the other training endpoints:
22
- * Inline: set `data` and `labels`. `label_column` is ignored.
23
- * Pre-uploaded: set `data_uploaded=True`, supply `model_id` and
24
- `label_column`. When `teacher_predict_url` is set, labels become
25
- evaluation-only (the teacher provides the training target).
29
+ Dataset delivery has exactly one of three modes:
30
+ * Inline: set `data` and `labels`. The server creates a transient
31
+ dataset row and uploads the data to the dataset's canonical S3
32
+ location before training begins.
33
+ * Pre-uploaded: set `dataset_id` to the id returned by
34
+ ``POST /v1/uploads``, plus `label_column`.
35
+ * Connector: set `data_connector=True`, `connector_id`, and
36
+ `table_name`.
37
+
38
+ When `teacher_predict_url` is set, labels become evaluation-only (the
39
+ teacher provides the training target).
26
40
  """
27
41
 
28
42
  # --- dataset
29
43
  data: list[list[float | str | bool | None]] | None = Field(
30
44
  None,
31
- description="2D feature matrix (n_samples, n_features). Omit when data_uploaded=True.",
45
+ description="2D feature matrix (n_samples, n_features). Omit when dataset_id is set.",
32
46
  )
33
47
  labels: list[float] | None = Field(
34
48
  None,
@@ -44,24 +58,22 @@ class ReasoningFitRequest(BaseModel):
44
58
  None,
45
59
  description="Optional per-column schema for inline data.",
46
60
  )
47
- data_uploaded: bool = Field(
48
- False,
49
- description="If true, read the dataset already at "
50
- "traces/{org_id}/{model_id}/training_data.{pkl|csv|parquet}.",
61
+ dataset_id: str | None = Field(
62
+ None,
63
+ description="Identifier of a previously uploaded dataset (from "
64
+ "``POST /v1/uploads``). Mutually exclusive with inline data and "
65
+ "data_connector.",
51
66
  )
52
67
  label_column: str | None = Field(
53
68
  None,
54
- description="Target column name in the uploaded table.",
55
- )
56
- model_id: str | None = Field(
57
- None, description="Custom model ID; required when data_uploaded=True."
69
+ description="Target column name in the dataset.",
58
70
  )
59
71
 
60
72
  # --- connector-based data source
61
73
  data_connector: bool = Field(
62
74
  False,
63
75
  description="If true, read the dataset from a registered connector. "
64
- "Mutually exclusive with inline data and data_uploaded.",
76
+ "Mutually exclusive with inline data and dataset_id.",
65
77
  )
66
78
  connector_id: str | None = Field(
67
79
  None,
@@ -78,7 +90,7 @@ class ReasoningFitRequest(BaseModel):
78
90
  description="Candidate model-family identifiers. reasoning.fit pins the "
79
91
  "surrogate via force_model_type, so at most one entry is accepted.",
80
92
  )
81
- n_hyperopt_steps: int = 5
93
+ n_hyperopt_steps: int = Field(5, ge=1, le=MAX_REASONING_HYPEROPT_STEPS)
82
94
  device: str | None = Field(None, description="'auto' | 'cuda' | 'cpu'.")
83
95
  random_state: int = 42
84
96
  task_type: Literal["regression", "binclass", "multiclass"] | None = Field(
@@ -113,11 +125,13 @@ class ReasoningFitRequest(BaseModel):
113
125
 
114
126
  @model_validator(mode="after")
115
127
  def _check_dataset_source(self):
116
- # Exactly one of three modes: inline, uploaded, or connector.
117
- if self.data_connector and self.data_uploaded:
118
- raise ValueError("data_connector and data_uploaded are mutually exclusive")
128
+ # Exactly one of three modes: inline, dataset_id, or connector.
129
+ if self.data_connector and self.dataset_id is not None:
130
+ raise ValueError("data_connector and dataset_id are mutually exclusive")
119
131
  if self.data_connector and self.data is not None:
120
132
  raise ValueError("data_connector and inline data are mutually exclusive")
133
+ if self.dataset_id is not None and self.data is not None:
134
+ raise ValueError("dataset_id and inline data are mutually exclusive")
121
135
 
122
136
  if self.data_connector:
123
137
  if not self.connector_id:
@@ -129,21 +143,20 @@ class ReasoningFitRequest(BaseModel):
129
143
  "label_column is required when data_connector=True (unless "
130
144
  "teacher_predict_url is set)"
131
145
  )
132
- elif self.data_uploaded:
133
- if not self.model_id:
134
- raise ValueError("model_id is required when data_uploaded=True")
146
+ elif self.dataset_id is not None:
135
147
  if not self.label_column and not self.teacher_predict_url:
136
148
  raise ValueError(
137
- "label_column is required when data_uploaded=True (unless "
149
+ "label_column is required when dataset_id is set (unless "
138
150
  "teacher_predict_url is set)"
139
151
  )
140
152
  else:
141
153
  if self.data is None:
142
- raise ValueError("data is required when data_uploaded is False")
154
+ raise ValueError(
155
+ "data is required for inline mode (or supply dataset_id / data_connector=True)"
156
+ )
143
157
  if self.labels is None and self.teacher_predict_url is None:
144
158
  raise ValueError(
145
- "labels is required when data_uploaded is False (unless "
146
- "teacher_predict_url is set)"
159
+ "labels is required for inline mode (unless teacher_predict_url is set)"
147
160
  )
148
161
  return self
149
162
 
@@ -12,6 +12,14 @@ from pydantic import BaseModel, Field, model_validator
12
12
 
13
13
  from .common import JobResponse
14
14
 
15
+ # --- Compute caps --------------------------------------------------------- #
16
+
17
+ MAX_HYPEROPT_BUDGET = 200
18
+ """Hard cap on `n_trials × max(1, len(model_types))`.
19
+
20
+ Bounds total HPO trials per training request. Each trial spawns a Modal
21
+ container, so this is the primary cost gate."""
22
+
15
23
 
16
24
  class ModalHardwareSpec(BaseModel):
17
25
  """Fan trials out to additional Modal containers.
@@ -36,20 +44,27 @@ HardwareSpec = ModalHardwareSpec
36
44
  class TrainerRunRequest(BaseModel):
37
45
  """POST /v1/trainer/run -- Configure a Trainer and run HPO across a model matrix.
38
46
 
39
- Dataset delivery mirrors the other training endpoints:
40
- * Inline: set `data` and `labels`. `label_column` is ignored.
41
- * Pre-uploaded: set `data_uploaded=True`, supply `model_id` and
42
- `label_column` (the column in the uploaded table that holds the target).
47
+ Dataset delivery has exactly one of three modes:
48
+ * Inline: set `data` and `labels`. The server creates a transient
49
+ dataset row and uploads the data to the dataset's canonical S3
50
+ location before training begins.
51
+ * Pre-uploaded: set `dataset_id` to the id returned by
52
+ ``POST /v1/uploads``, plus `label_column` (the column in the uploaded
53
+ table that holds the target).
54
+ * Connector: set `data_connector=True`, `connector_id`, and
55
+ `table_name`.
56
+
57
+ The produced model id is returned in the response.
43
58
  """
44
59
 
45
60
  # --- dataset
46
61
  data: list[list[float | str | bool | None]] | None = Field(
47
62
  None,
48
- description="2D feature matrix (n_samples, n_features). Omit when data_uploaded=True.",
63
+ description="2D feature matrix (n_samples, n_features). Omit when dataset_id is set.",
49
64
  )
50
65
  labels: list[float] | None = Field(
51
66
  None,
52
- description="Target values, length n_samples. Omit when data_uploaded=True.",
67
+ description="Target values, length n_samples. Omit when dataset_id is set.",
53
68
  )
54
69
  feature_names: list[str] | None = Field(
55
70
  None,
@@ -61,24 +76,23 @@ class TrainerRunRequest(BaseModel):
61
76
  description="Optional per-column schema for inline data: "
62
77
  "{name: {dtype: 'float' | 'int' | 'bool' | 'categorical'}}.",
63
78
  )
64
- data_uploaded: bool = Field(
65
- False,
66
- description="If true, read the dataset already at "
67
- "traces/{org_id}/{model_id}/training_data.{pkl|csv|parquet}.",
79
+ dataset_id: str | None = Field(
80
+ None,
81
+ description="Identifier of a previously uploaded dataset (from "
82
+ "``POST /v1/uploads``). Mutually exclusive with inline data and "
83
+ "data_connector.",
68
84
  )
69
85
  label_column: str | None = Field(
70
86
  None,
71
- description="Target column name in the uploaded table. Required when data_uploaded=True.",
72
- )
73
- model_id: str | None = Field(
74
- None, description="Custom model ID; required when data_uploaded=True."
87
+ description="Target column name in the dataset. Required when dataset_id "
88
+ "is set (unless teacher_predict_url is also set).",
75
89
  )
76
90
 
77
91
  # --- connector-based data source
78
92
  data_connector: bool = Field(
79
93
  False,
80
94
  description="If true, read the dataset from a registered connector. "
81
- "Mutually exclusive with inline data and data_uploaded.",
95
+ "Mutually exclusive with inline data and dataset_id.",
82
96
  )
83
97
  connector_id: str | None = Field(
84
98
  None,
@@ -106,7 +120,7 @@ class TrainerRunRequest(BaseModel):
106
120
  "random",
107
121
  description="HPO strategy: 'random' or 'optuna'. Resolved server-side.",
108
122
  )
109
- n_trials: int = Field(4, description="Number of HPO trials per matrix row.")
123
+ n_trials: int = Field(4, ge=1, description="Number of HPO trials per matrix row.")
110
124
  n_splits: int | None = Field(
111
125
  None,
112
126
  description="K-fold cross-validation folds. None means a single holdout split.",
@@ -149,11 +163,13 @@ class TrainerRunRequest(BaseModel):
149
163
 
150
164
  @model_validator(mode="after")
151
165
  def _check_dataset_source(self):
152
- # Exactly one of three modes: inline, uploaded, or connector.
153
- if self.data_connector and self.data_uploaded:
154
- raise ValueError("data_connector and data_uploaded are mutually exclusive")
166
+ # Exactly one of three modes: inline, dataset_id, or connector.
167
+ if self.data_connector and self.dataset_id is not None:
168
+ raise ValueError("data_connector and dataset_id are mutually exclusive")
155
169
  if self.data_connector and self.data is not None:
156
170
  raise ValueError("data_connector and inline data are mutually exclusive")
171
+ if self.dataset_id is not None and self.data is not None:
172
+ raise ValueError("dataset_id and inline data are mutually exclusive")
157
173
 
158
174
  if self.data_connector:
159
175
  if not self.connector_id:
@@ -165,22 +181,21 @@ class TrainerRunRequest(BaseModel):
165
181
  "label_column is required when data_connector=True (unless "
166
182
  "teacher_predict_url is set)"
167
183
  )
168
- elif self.data_uploaded:
169
- if not self.model_id:
170
- raise ValueError("model_id is required when data_uploaded=True")
184
+ elif self.dataset_id is not None:
171
185
  if not self.label_column and not self.teacher_predict_url:
172
186
  raise ValueError(
173
- "label_column is required when data_uploaded=True (unless "
187
+ "label_column is required when dataset_id is set (unless "
174
188
  "teacher_predict_url is set, in which case the teacher provides "
175
189
  "the training target)"
176
190
  )
177
191
  else:
178
192
  if self.data is None:
179
- raise ValueError("data is required when data_uploaded is False")
193
+ raise ValueError(
194
+ "data is required for inline mode (or supply dataset_id / data_connector=True)"
195
+ )
180
196
  if self.labels is None and self.teacher_predict_url is None:
181
197
  raise ValueError(
182
- "labels is required when data_uploaded is False (unless "
183
- "teacher_predict_url is set)"
198
+ "labels is required for inline mode (unless teacher_predict_url is set)"
184
199
  )
185
200
  return self
186
201
 
@@ -192,6 +207,22 @@ class TrainerRunRequest(BaseModel):
192
207
  raise ValueError("feature_names is required when data contains non-numeric values")
193
208
  return self
194
209
 
210
+ @model_validator(mode="after")
211
+ def _check_hyperopt_budget(self):
212
+ # When model_types is None the server picks a default set, so the
213
+ # client-visible budget is just n_trials. We cap that as a lower
214
+ # bound on the real budget; the server should re-check after
215
+ # resolving the default model list.
216
+ n_models = max(1, len(self.model_types)) if self.model_types else 1
217
+ budget = self.n_trials * n_models
218
+ if budget > MAX_HYPEROPT_BUDGET:
219
+ raise ValueError(
220
+ f"n_trials x len(model_types) = {budget} exceeds the per-request "
221
+ f"cap of {MAX_HYPEROPT_BUDGET}. Reduce n_trials or shrink "
222
+ "model_types."
223
+ )
224
+ return self
225
+
195
226
 
196
227
  class TrainerRunResponse(JobResponse):
197
228
  """POST /v1/trainer/run -- async trainer job submission response."""
@@ -0,0 +1,58 @@
1
+ """Request/response schemas for the presigned-upload endpoint."""
2
+
3
+ from typing import Literal
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ # --- Upload size policy --------------------------------------------------- #
8
+ # Tunable in review. These constants are the single source of truth for the
9
+ # SDK's client-side guards. Server-side enforcement of MAX_UPLOAD_BYTES will
10
+ # be wired when the upload flow switches from presigned PUT to presigned POST
11
+ # (where S3's `content-length-range` condition becomes available).
12
+
13
+ MAX_UPLOAD_BYTES = 10_000_000_000
14
+ """Hard cap on uploaded file size, all formats.
15
+
16
+ Picked to comfortably hold 1M x 1k Parquet workloads with headroom for
17
+ ~10x growth. Enforced today by the SDK before upload starts. When the
18
+ presigned POST switch lands, S3 will enforce server-side via
19
+ `content-length-range`."""
20
+
21
+ MAX_CSV_UPLOAD_BYTES = 3_000_000_000
22
+ """Hard cap on CSV uploads specifically.
23
+
24
+ CSV is ~10-20x larger than the same data as Parquet, so a CSV near
25
+ MAX_UPLOAD_BYTES would almost always be a misformatted workload. Reject
26
+ early with a clear message rather than burn upload bandwidth on a file
27
+ that should have been Parquet."""
28
+
29
+ CSV_UPLOAD_WARN_BYTES = 500_000_000
30
+ """SDK emits a UserWarning above this CSV size, suggesting Parquet.
31
+
32
+ Soft signal — at this scale Parquet would be ~50-100 MB for the same
33
+ data, and subsequent training reads are also faster."""
34
+
35
+
36
+ class CreateUploadRequest(BaseModel):
37
+ """POST /v1/uploads -- request a presigned URL for direct-to-S3 upload."""
38
+
39
+ file_format: Literal["pkl", "csv", "parquet"] = Field(
40
+ ...,
41
+ description=(
42
+ "Format of the dataset you will PUT to the returned URL. "
43
+ "'pkl' = a pickled pandas DataFrame, 'csv' = RFC4180 CSV with a "
44
+ "header row, 'parquet' = Apache Parquet. The label column must be "
45
+ "present in the uploaded table and its name is supplied on the "
46
+ "subsequent /v1/trainer/run or /v1/reasoning/fit call as "
47
+ "`label_column`."
48
+ ),
49
+ )
50
+
51
+
52
+ class CreateUploadResponse(BaseModel):
53
+ dataset_id: str
54
+ upload_url: str
55
+ upload_key: str
56
+ file_format: Literal["pkl", "csv", "parquet"]
57
+ content_type: str
58
+ expires_in: int
@@ -1,59 +0,0 @@
1
- """Request/response schemas for segmentation endpoints."""
2
-
3
- from typing import Any
4
-
5
- from pydantic import BaseModel, ConfigDict, Field
6
-
7
-
8
- class ClusterPersonaSchema(BaseModel):
9
- """One cluster's persona description as exposed over HTTP.
10
-
11
- Mirrors the shape written into ``segments.json`` by the segment Lambda;
12
- the API repo aliases that S3 wire type onto this HTTP type at the
13
- response boundary.
14
- """
15
-
16
- model_config = ConfigDict(from_attributes=True)
17
-
18
- cluster_id: int
19
- persona_name: str
20
- persona_description: str
21
- stats: dict[str, Any]
22
- differentiating_features: list[dict[str, Any]] | None = None
23
-
24
-
25
- class SegmentRequest(BaseModel):
26
- """POST /v1/models/{model_id}/segment -- Supervised segmentation (async)."""
27
-
28
- data: list[list[float | str | bool | None]] | None = Field(
29
- None,
30
- description="Dataset to segment; uses training data if omitted. "
31
- "Cells may be numeric, string (categorical), or bool.",
32
- )
33
- target_values: list[float] | None = None
34
- feature_names: list[str] | None = None
35
- min_clusters: int = 4
36
- max_clusters: int | None = 10
37
- n_search_steps: int = 50
38
- use_agent: bool | None = None
39
- kpi_field: str | None = None
40
- problem_context: str | None = None
41
-
42
-
43
- class SegmentResultResponse(BaseModel):
44
- """GET /v1/models/{model_id}/segments -- Retrieve segmentation results.
45
-
46
- Result fields are Optional because pending/running/failed jobs return
47
- only model_id + status; populated only once the Lambda has uploaded
48
- segments.json and the SegmentsResult is available.
49
- """
50
-
51
- model_id: str
52
- status: str
53
- n_clusters: int | None = None
54
- cluster_ids: list[int] | None = None
55
- resolution: float | None = None
56
- quality: float | None = None
57
- personas: list[ClusterPersonaSchema] | None = None
58
- agent_score: float | None = None
59
- agent_reasoning: str | None = None
@@ -1,34 +0,0 @@
1
- """Request/response schemas for the presigned-upload endpoint."""
2
-
3
- from typing import Literal
4
-
5
- from pydantic import BaseModel, Field
6
-
7
-
8
- class CreateUploadRequest(BaseModel):
9
- """POST /v1/uploads -- request a presigned URL for direct-to-S3 upload."""
10
-
11
- model_id: str | None = Field(
12
- None,
13
- description="Custom model ID; auto-generated if omitted.",
14
- )
15
- file_format: Literal["pkl", "csv", "parquet"] = Field(
16
- ...,
17
- description=(
18
- "Format of the dataset you will PUT to the returned URL. "
19
- "'pkl' = a pickled pandas DataFrame, 'csv' = RFC4180 CSV with a "
20
- "header row, 'parquet' = Apache Parquet. The label column must be "
21
- "present in the uploaded table and its name is supplied on the "
22
- "subsequent /v1/trainer/run or /v1/reasoning/fit call as "
23
- "`label_column`."
24
- ),
25
- )
26
-
27
-
28
- class CreateUploadResponse(BaseModel):
29
- model_id: str
30
- upload_url: str
31
- upload_key: str
32
- file_format: Literal["pkl", "csv", "parquet"]
33
- content_type: str
34
- expires_in: int