outerproduct-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,153 @@
+ """OuterProduct — a Python SDK for training and explaining tabular ML models.
+
+ The package exposes a small, flat surface organised around the following
+ core concepts:
+
+ - :class:`~outerproduct.dataset.Dataset` — a tabular dataset with an optional label column and
+   per-column schema (:class:`~outerproduct.dataset.Column`).
+ - :class:`~outerproduct.connector.Connector` — a source of data (:class:`~outerproduct.connector.FileUploadConnector`,
+   :class:`~outerproduct.connector.S3Connector`, …) that produces a :class:`~outerproduct.dataset.Dataset`.
+ - :class:`~outerproduct.trainer.Trainer` — orchestrates general-purpose training and
+   returns a :class:`~outerproduct.model.Model` via :meth:`~outerproduct.trainer.Trainer.configure` and
+   :meth:`~outerproduct.trainer.Trainer.run`.
+ - :class:`~outerproduct.model.Model` — a trained predictor with :meth:`~outerproduct.model.Model.predict`.
+ - :class:`~outerproduct.model.Predictor` — a duck-typed wrapper around an external
+   prediction endpoint, used to hand black-box models into
+   :func:`~outerproduct.reasoning.fit` for distillation.
+ - :class:`~outerproduct.model.ReasoningModel` — a :class:`~outerproduct.model.Model` that also supports
+   :meth:`~outerproduct.model.ReasoningModel.explain`, :meth:`~outerproduct.model.ReasoningModel.interpret`,
+   :meth:`~outerproduct.model.ReasoningModel.scenario`, and :meth:`~outerproduct.model.ReasoningModel.segment`.
+   Produced by :func:`~outerproduct.reasoning.fit`.
+ - :class:`~outerproduct.reasoning.Reasoning` / :class:`~outerproduct.reasoning.ReasoningTrace` — the typed results of
+   reasoning calls.
+
+ Quickstart
+ ----------
+ .. code-block:: python
+
+     import outerproduct as op
+
+     op.init(api_key="...")
+     connector = op.FileUploadConnector()
+     dataset = connector.upload("customers.csv", label_column="churn")
+     reasoning_model = op.reasoning.fit(dataset)
+     predictions = reasoning_model.predict(X_new)
+     reasoning = reasoning_model.explain(X_new)
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from importlib.metadata import version as _pkg_version
+
+ from outerproduct_http_types import ConnectorType
+
+ from outerproduct import agentic
+ from outerproduct.connector import (
+     Connector,
+     DatabricksConnector,
+     FileUploadConnector,
+     S3Connector,
+     SnowflakeConnector,
+ )
+ from outerproduct.dataset import Column, Dataset
+ from outerproduct.job import Job, ReasoningJob, TrainerJob
+ from outerproduct.model import Model, ReasoningModel, predict
+ from outerproduct.reasoning import Reasoning, ReasoningTrace
+ from outerproduct.trainer import (
+     Hardware,
+     Metric,
+     ModalHardware,
+     Trainer,
+ )
+
+ try:
+     __version__ = _pkg_version("outerproduct")
+ except Exception:
+     __version__ = "unknown"
+
+
+ @dataclass
+ class _GlobalState:
+     """Module-level state populated by :func:`~outerproduct.init`. Internal."""
+
+     api_key: str | None = None
+     base_url: str | None = None
+     hardware: Hardware | None = None
+     extra: dict[str, object] = field(default_factory=dict)
+
+
+ _state = _GlobalState()
+
+
+ def init(
+     api_key: str | None = None,
+     *,
+     base_url: str | None = None,
+     hardware: Hardware | None = None,
+ ) -> None:
+     """Configure the OuterProduct SDK for the current Python process.
+
+     Call once near the top of your program. Subsequent operations
+     (training, prediction, uploads) read credentials, the API base URL,
+     and the default execution backend from the state recorded here.
+
+     Examples
+     --------
+     .. code-block:: python
+
+         import outerproduct as op
+         op.init(api_key="op_live_...", hardware=op.ModalHardware())
+
+     Parameters
+     ----------
+     api_key : str, optional
+         Bearer token used to authenticate against the OuterProduct API.
+         If omitted, the ``OUTERPRODUCT_API_KEY`` environment variable is
+         consulted at use time.
+     base_url : str, optional
+         Override for the API base URL. If omitted, the SDK uses the
+         production endpoint.
+     hardware : Hardware, optional
+         Default execution backend used by :class:`~outerproduct.trainer.Trainer`
+         and :func:`~outerproduct.reasoning.fit`. If omitted, the API
+         server falls back to a managed Modal worker.
+     """
+     _state.api_key = api_key
+     _state.base_url = base_url
+     _state.hardware = hardware
+
+
+ __all__ = [
+     # Bootstrap
+     "init",
+     "__version__",
+     # Dataset
+     "Dataset",
+     "Column",
+     # Jobs
+     "Job",
+     "TrainerJob",
+     "ReasoningJob",
+     # Model
+     "Model",
+     "ReasoningModel",
+     "predict",
+     # Reasoning
+     "Reasoning",
+     "ReasoningTrace",
+     # Trainer
+     "Trainer",
+     "Hardware",
+     "ModalHardware",
+     "Metric",
+     # Connector
+     "Connector",
+     "ConnectorType",
+     "FileUploadConnector",
+     "S3Connector",
+     "SnowflakeConnector",
+     "DatabricksConnector",
+     # Agentic feature sourcing (submodule, accessed as op.agentic.*)
+     "agentic",
+ ]
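
A minimal bootstrap sketch assembled from the docstrings above: the first call mirrors the Examples block of ``init``, and the second form leans on the documented ``OUTERPRODUCT_API_KEY`` fallback. The key values are placeholders::

    import os

    import outerproduct as op

    # Explicit configuration, per the Examples block of init().
    op.init(api_key="op_live_...", hardware=op.ModalHardware())

    # Equivalent fallback: with api_key omitted, the SDK consults the
    # OUTERPRODUCT_API_KEY environment variable at use time.
    os.environ["OUTERPRODUCT_API_KEY"] = "op_live_..."
    op.init()
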
@@ -0,0 +1,16 @@
1
+ """Agent-driven feature sourcing.
2
+
3
+ Everything under :mod:`outerproduct.agentic` is part of an agent-driven flow
4
+ that turns unstructured or semi-structured sources into typed tabular
5
+ features. The boundary is the *flow*, not whether each individual function
6
+ makes an LLM call — deterministic helpers (PDF parsing, batching, schema
7
+ persistence) live here too because they support the same contract.
8
+
9
+ Submodules
10
+ ----------
11
+ - :mod:`outerproduct.agentic.documents` — files (PDF, image, text).
12
+ """
13
+
14
+ from outerproduct.agentic import documents
15
+
16
+ __all__ = ["documents"]
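
Both access paths below resolve to the same module object, since importing :mod:`outerproduct` imports ``agentic``, which in turn imports ``documents``. The directory name is a placeholder::

    import outerproduct as op
    from outerproduct.agentic import documents

    assert op.agentic.documents is documents
    docs = documents.DocumentSet.from_directory("./contracts")
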
@@ -0,0 +1,514 @@
+ """Document tabularization — agent-driven feature sourcing for files.
+
+ One submodule of :mod:`outerproduct.agentic`. Treats files (PDF, image,
+ text) as a feature source: induce a frozen schema of survey questions
+ for the use case, then have the agent extract every document against
+ that schema. The tabularized rows live in object storage on the
+ OuterProduct backend; the SDK returns a :class:`DocumentDataset` that
+ references that remote table by ``model_id`` and slots into
+ :meth:`outerproduct.Trainer.configure` and
+ :func:`outerproduct.reasoning.fit` exactly like any other uploaded
+ :class:`~outerproduct.Dataset`.
+
+ Typical flow::
+
+     docs = DocumentSet.from_directory("./invoices")
+     refs = upload(docs)            # one-time upload
+     schema = induce_schema(refs, use_case="audit", skill="invoice")
+     ds = tabularize(refs, schema)  # remote DocumentDataset
+     trainer.run(ds)                # uses data_uploaded=True
+
+ ``upload`` is explicit so the same documents can feed both
+ :func:`induce_schema` and :func:`tabularize` without re-uploading.
+ For one-shot calls, ``induce_schema`` and ``tabularize`` also accept a
+ :class:`DocumentSet` directly and upload internally.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import random as _random
+ import re
+ from collections.abc import Iterable, Iterator
+ from dataclasses import dataclass, field
+ from enum import StrEnum
+ from pathlib import Path
+ from typing import Any, Literal
+
+ import httpx
+ from outerproduct_http_types import (
+     AnswerType as _AnswerTypePayload,
+     CreateDocumentUploadRequest,
+     DocumentRef,
+     InduceSchemaRequest,
+     Question as QuestionPayload,
+     Schema as SchemaPayload,
+     TabularizeRequest,
+ )
+
+ from outerproduct.client import OuterProductClient
+ from outerproduct.dataset import Column, Dataset
+
+ DocumentMediaType = Literal[
+     "application/pdf",
+     "image/png",
+     "image/jpeg",
+     "image/gif",
+     "image/webp",
+     "text/plain",
+ ]
+
+
+ class AnswerType(StrEnum):
+     """The legal answer shapes for a :class:`DocumentQuestion`."""
+
+     BOOLEAN = "boolean"
+     NUMBER = "number"
+     INTEGER = "integer"
+     ENUM = "enum"
+     MULTI_ENUM = "multi_enum"
+     DATE = "date"
+     DATE_RANGE = "date_range"
+     STRING = "string"
+     TEXT = "text"
+
+
+ _ID_RE = re.compile(r"^[a-z][a-z0-9_]*$")
+
+
+ @dataclass(frozen=True)
+ class DocumentQuestion:
+     """One survey question in a :class:`DocumentSchema`."""
+
+     id: str
+     question: str
+     answer_type: AnswerType
+     rationale: str | None = None
+     unit: str | None = None
+     enum: tuple[str, ...] | None = None
+
+     def __post_init__(self) -> None:
+         if not _ID_RE.match(self.id):
+             raise ValueError(f"DocumentQuestion.id={self.id!r} must be snake_case")
+         if self.answer_type in (AnswerType.ENUM, AnswerType.MULTI_ENUM):
+             if not self.enum:
+                 raise ValueError(
+                     f"DocumentQuestion {self.id!r}: answer_type="
+                     f"{self.answer_type.value} requires non-empty `enum`"
+                 )
+         elif self.enum is not None:
+             raise ValueError(
+                 f"DocumentQuestion {self.id!r}: `enum` is only valid for "
+                 "answer_type='enum' or 'multi_enum'"
+             )
+
+     def to_dict(self) -> dict[str, Any]:
+         out: dict[str, Any] = {
+             "id": self.id,
+             "question": self.question,
+             "answer_type": self.answer_type.value,
+         }
+         if self.rationale is not None:
+             out["rationale"] = self.rationale
+         if self.unit is not None:
+             out["unit"] = self.unit
+         if self.enum is not None:
+             out["enum"] = list(self.enum)
+         return out
+
+     @classmethod
+     def from_dict(cls, obj: dict[str, Any]) -> DocumentQuestion:
+         enum = obj.get("enum")
+         return cls(
+             id=obj["id"],
+             question=obj["question"],
+             answer_type=AnswerType(obj["answer_type"]),
+             rationale=obj.get("rationale"),
+             unit=obj.get("unit"),
+             enum=tuple(enum) if enum is not None else None,
+         )
+
+
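
A validation sketch for :class:`DocumentQuestion`; the id and choices are illustrative, but both behaviours follow directly from ``__post_init__`` above::

    from outerproduct.agentic.documents import AnswerType, DocumentQuestion

    # Valid: enum-typed questions must carry their closed set of choices.
    q = DocumentQuestion(
        id="payment_terms",
        question="What payment terms does the invoice specify?",
        answer_type=AnswerType.ENUM,
        enum=("net_15", "net_30", "net_60"),
    )

    # Invalid: ids must match ^[a-z][a-z0-9_]*$, so this raises ValueError.
    try:
        DocumentQuestion(id="Total", question="Total?", answer_type=AnswerType.NUMBER)
    except ValueError as err:
        print(err)
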
+ @dataclass
+ class DocumentSchema:
+     """A frozen list of :class:`DocumentQuestion` for one skill and one
+     use case. Persist with :meth:`save`; reload with :meth:`load`."""
+
+     skill: str
+     use_case: str
+     questions: list[DocumentQuestion]
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def __post_init__(self) -> None:
+         seen: set[str] = set()
+         for q in self.questions:
+             if q.id in seen:
+                 raise ValueError(f"DocumentSchema: duplicate question id {q.id!r}")
+             seen.add(q.id)
+
+     @property
+     def question_ids(self) -> list[str]:
+         return [q.id for q in self.questions]
+
+     def get(self, question_id: str) -> DocumentQuestion:
+         for q in self.questions:
+             if q.id == question_id:
+                 return q
+         raise KeyError(question_id)
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "skill": self.skill,
+             "use_case": self.use_case,
+             "questions": [q.to_dict() for q in self.questions],
+             "metadata": self.metadata,
+             "_format_version": 1,
+         }
+
+     @classmethod
+     def from_dict(cls, obj: dict[str, Any]) -> DocumentSchema:
+         return cls(
+             skill=obj["skill"],
+             use_case=obj["use_case"],
+             questions=[DocumentQuestion.from_dict(q) for q in obj["questions"]],
+             metadata=obj.get("metadata", {}),
+         )
+
+     def save(self, path: str | Path) -> None:
+         Path(path).write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
+
+     @classmethod
+     def load(cls, path: str | Path) -> DocumentSchema:
+         return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))
+
+
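
Continuing the sketch above (reusing ``q``), a :class:`DocumentSchema` round-trips through the JSON layout written by ``save``, which stamps ``_format_version: 1``; the filename is arbitrary::

    from outerproduct.agentic.documents import DocumentSchema

    schema = DocumentSchema(skill="invoice", use_case="audit", questions=[q])
    schema.save("invoice_schema.json")
    reloaded = DocumentSchema.load("invoice_schema.json")
    assert reloaded.question_ids == schema.question_ids
    assert reloaded.get("payment_terms").enum == ("net_15", "net_30", "net_60")
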
+ _PDF_SUFFIXES = {".pdf"}
+ _IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
+ _TEXT_SUFFIXES = {".txt", ".md"}
+ _SUPPORTED_SUFFIXES = _PDF_SUFFIXES | _IMAGE_SUFFIXES | _TEXT_SUFFIXES
+ _MIME_BY_SUFFIX: dict[str, DocumentMediaType] = {
+     ".pdf": "application/pdf",
+     ".png": "image/png",
+     ".jpg": "image/jpeg",
+     ".jpeg": "image/jpeg",
+     ".gif": "image/gif",
+     ".webp": "image/webp",
+     ".txt": "text/plain",
+     ".md": "text/plain",
+ }
+
+
+ @dataclass(frozen=True)
+ class Document:
+     """One document — raw bytes plus identity."""
+
+     document_id: str
+     bytes_: bytes
+     media_type: DocumentMediaType
+     source: Path | None = None
+
+     @classmethod
+     def from_path(cls, path: str | Path, *, document_id: str | None = None) -> Document:
+         p = Path(path)
+         suffix = p.suffix.lower()
+         if suffix not in _MIME_BY_SUFFIX:
+             raise ValueError(
+                 f"Document.from_path: unsupported suffix {suffix!r} for {p}; "
+                 f"supported: {sorted(_SUPPORTED_SUFFIXES)}"
+             )
+         return cls(
+             document_id=document_id or p.stem,
+             bytes_=p.read_bytes(),
+             media_type=_MIME_BY_SUFFIX[suffix],
+             source=p,
+         )
+
+     @classmethod
+     def from_text(cls, text: str, *, document_id: str) -> Document:
+         return cls(
+             document_id=document_id,
+             bytes_=text.encode("utf-8"),
+             media_type="text/plain",
+         )
+
+
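
Constructor behaviour follows the suffix tables above: ``from_path`` infers the media type and defaults ``document_id`` to the file stem, while ``from_text`` wraps an in-memory string. Paths and ids are placeholders::

    from outerproduct.agentic.documents import Document

    pdf = Document.from_path("invoices/acme-0042.pdf")  # document_id == "acme-0042"
    note = Document.from_text("Net 30. Paid late.", document_id="note_1")
    assert pdf.media_type == "application/pdf"
    assert note.media_type == "text/plain" and note.source is None
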
+ @dataclass
+ class DocumentSet:
+     """An ordered collection of :class:`Document`."""
+
+     documents: list[Document] = field(default_factory=list)
+
+     @classmethod
+     def from_directory(
+         cls,
+         path: str | Path,
+         *,
+         glob: str = "*",
+         recursive: bool = False,
+     ) -> DocumentSet:
+         root = Path(path)
+         if not root.is_dir():
+             raise NotADirectoryError(
+                 f"DocumentSet.from_directory: {root} is not a directory"
+             )
+         iterator = root.rglob(glob) if recursive else root.glob(glob)
+         docs = [
+             Document.from_path(p)
+             for p in sorted(iterator)
+             if p.is_file() and p.suffix.lower() in _SUPPORTED_SUFFIXES
+         ]
+         return cls(documents=docs)
+
+     @classmethod
+     def from_paths(cls, paths: Iterable[str | Path]) -> DocumentSet:
+         return cls(documents=[Document.from_path(p) for p in paths])
+
+     def sample(self, n: int, *, seed: int = 0) -> DocumentSet:
+         if n <= 0:
+             raise ValueError(f"sample: n must be positive, got {n}")
+         if n >= len(self.documents):
+             return DocumentSet(documents=list(self.documents))
+         rng = _random.Random(seed)
+         return DocumentSet(documents=rng.sample(self.documents, n))
+
+     def __iter__(self) -> Iterator[Document]:
+         return iter(self.documents)
+
+     def __len__(self) -> int:
+         return len(self.documents)
+
+     def __getitem__(self, index: int) -> Document:
+         return self.documents[index]
+
+     def __repr__(self) -> str:
+         return f"DocumentSet(n={len(self)})"
+
+
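
A collection sketch: ``from_directory`` keeps only supported suffixes and sorts paths for a stable order, and ``sample`` is seeded, so a pilot subset is reproducible across runs. The directory is a placeholder::

    from outerproduct.agentic.documents import DocumentSet

    docs = DocumentSet.from_directory("./invoices", glob="*.pdf", recursive=True)
    pilot = docs.sample(25, seed=7)  # same subset every run; full copy if n >= len
    print(repr(pilot))
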
+ class DocumentDataset(Dataset):
+     """A remote-handle :class:`Dataset` whose rows are tabularized
+     documents and columns are :class:`DocumentSchema` questions.
+
+     Holds no rows locally. The tabularized table lives on the
+     OuterProduct backend under ``model_id`` (the same path used by
+     pre-uploaded datasets), and the trainer/reasoning APIs fetch from
+     that location server-side via the standard ``data_uploaded=True``
+     wire mode.
+
+     Construct directly when you already have a tabularize ``model_id``,
+     or — more commonly — let :func:`tabularize` build one for you.
+     """
+
+     def __init__(
+         self,
+         *,
+         schema: DocumentSchema,
+         document_ids: list[str],
+         model_id: str,
+         label_column: str | None = None,
+         columns: list[Column] | None = None,
+     ) -> None:
+         import pandas as pd
+
+         empty = pd.DataFrame(columns=pd.Index([q.id for q in schema.questions]))
+         super().__init__(empty, label_column=label_column, columns=columns)
+         self._upload_id = model_id
+         self._schema = schema
+         self._document_ids = list(document_ids)
+
+     @property
+     def schema(self) -> DocumentSchema:
+         return self._schema
+
+     @property
+     def document_ids(self) -> list[str]:
+         return list(self._document_ids)
+
+     @property
+     def model_id(self) -> str:
+         """The tabularize-job ``model_id`` that backs this dataset on the server."""
+         assert self._upload_id is not None
+         return self._upload_id
+
+     @property
+     def n_samples(self) -> int:
+         return len(self._document_ids)
+
+
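
Constructing the remote handle directly, as the class docstring allows, when a tabularize ``model_id`` is already known; every literal here is a placeholder, and the elided id is deliberate::

    from outerproduct.agentic.documents import DocumentDataset, DocumentSchema

    schema = DocumentSchema.load("invoice_schema.json")
    ds = DocumentDataset(
        schema=schema,
        document_ids=["acme-0042", "acme-0043"],
        model_id="...",           # model_id from an earlier tabularize job
        label_column=None,
    )
    assert ds.n_samples == 2      # counts document_ids; no rows held locally
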
+ # --------------------------------------------------------------------------- #
+ # Wire conversions between the in-SDK data classes and the http-types models. #
+ # --------------------------------------------------------------------------- #
+
+
+ def _question_to_payload(q: DocumentQuestion) -> QuestionPayload:
+     return QuestionPayload(
+         id=q.id,
+         question=q.question,
+         answer_type=_AnswerTypePayload(q.answer_type.value),
+         rationale=q.rationale,
+         unit=q.unit,
+         enum=list(q.enum) if q.enum is not None else None,
+     )
+
+
+ def _question_from_payload(p: QuestionPayload) -> DocumentQuestion:
+     return DocumentQuestion(
+         id=p.id,
+         question=p.question,
+         answer_type=AnswerType(p.answer_type.value),
+         rationale=p.rationale,
+         unit=p.unit,
+         enum=tuple(p.enum) if p.enum is not None else None,
+     )
+
+
+ def _schema_to_payload(s: DocumentSchema) -> SchemaPayload:
+     return SchemaPayload(
+         skill=s.skill,
+         use_case=s.use_case,
+         questions=[_question_to_payload(q) for q in s.questions],
+         metadata=dict(s.metadata),
+     )
+
+
+ def _schema_from_payload(p: SchemaPayload) -> DocumentSchema:
+     return DocumentSchema(
+         skill=p.skill,
+         use_case=p.use_case,
+         questions=[_question_from_payload(q) for q in p.questions],
+         metadata=dict(p.metadata),
+     )
+
+
+ def _resolve_client(client: OuterProductClient | None) -> OuterProductClient:
+     return client if client is not None else OuterProductClient.from_credentials()
+
+
+ def _coerce_to_refs(
+     documents: DocumentSet | list[DocumentRef],
+     client: OuterProductClient,
+ ) -> list[DocumentRef]:
+     """Accept either a :class:`DocumentSet` (uploaded internally) or a list
+     of pre-uploaded :class:`DocumentRef` (returned as-is)."""
+     if isinstance(documents, DocumentSet):
+         return _upload_document_set(documents, client)
+     return list(documents)
+
+
+ def _upload_document_set(
+     documents: DocumentSet, client: OuterProductClient
+ ) -> list[DocumentRef]:
+     refs: list[DocumentRef] = []
+     for doc in documents:
+         resp = client.uploads_api.create_document(
+             CreateDocumentUploadRequest(
+                 document_id=doc.document_id, media_type=doc.media_type
+             )
+         )
+         put = httpx.put(
+             resp.upload_url,
+             content=doc.bytes_,
+             headers={"Content-Type": doc.media_type},
+         )
+         put.raise_for_status()
+         refs.append(
+             DocumentRef(
+                 document_id=resp.document_id,
+                 upload_key=resp.upload_key,
+                 media_type=resp.media_type,
+             )
+         )
+     return refs
+
+
+ # --------------------------------------------------------------------------- #
+ # Public entry points                                                         #
+ # --------------------------------------------------------------------------- #
+
+
+ def upload(
+     documents: DocumentSet, *, client: OuterProductClient | None = None
+ ) -> list[DocumentRef]:
+     """Upload every document in ``documents`` via per-file presigned URLs.
+
+     Returns a list of :class:`DocumentRef` you can pass to
+     :func:`induce_schema` and :func:`tabularize` — uploading once and
+     reusing the refs avoids re-uploading the same bytes for both jobs.
+     """
+     op = _resolve_client(client)
+     return _upload_document_set(documents, op)
+
+
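
The split the module docstring describes, spelled out: one ``upload`` produces refs that feed both jobs, so the bytes cross the wire once. The ``use_case`` and ``skill`` values are illustrative::

    from outerproduct.agentic.documents import (
        DocumentSet,
        induce_schema,
        tabularize,
        upload,
    )

    docs = DocumentSet.from_directory("./invoices")
    refs = upload(docs)            # one presigned-URL PUT per document
    schema = induce_schema(refs, use_case="audit", skill="invoice")
    ds = tabularize(refs, schema)  # reuses refs; nothing re-uploaded
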
+ def induce_schema(
+     documents: DocumentSet | list[DocumentRef],
+     *,
+     use_case: str,
+     skill: str,
+     client: OuterProductClient | None = None,
+ ) -> DocumentSchema:
+     """Run the agent over a sample of documents to produce a frozen schema.
+
+     Pass either a :class:`DocumentSet` (uploads internally) or a list of
+     :class:`DocumentRef` from a previous :func:`upload` call. Submits the
+     job, polls until completion, and returns the produced
+     :class:`DocumentSchema`.
+     """
+     op = _resolve_client(client)
+     refs = _coerce_to_refs(documents, op)
+     submission = op.agentic_documents_api.induce_schema(
+         InduceSchemaRequest(documents=refs, use_case=use_case, skill=skill)
+     )
+     op.wait_for(submission.model_id)
+     result = op.agentic_documents_api.get_schema(submission.model_id)
+     return _schema_from_payload(result.schema_)
+
+
+ def tabularize(
+     documents: DocumentSet | list[DocumentRef],
+     schema: DocumentSchema,
+     *,
+     web_augmentation: bool = False,
+     concurrency: int = 1,
+     label_column: str | None = None,
+     client: OuterProductClient | None = None,
+ ) -> DocumentDataset:
+     """Extract every document against ``schema`` and return a :class:`DocumentDataset`.
+
+     The tabularized rows are written to object storage on the backend;
+     the returned :class:`DocumentDataset` is a remote handle that
+     :class:`~outerproduct.Trainer` and :func:`~outerproduct.reasoning.fit`
+     consume directly via the standard ``data_uploaded=True`` wire mode.
+     No rows are downloaded to the client.
+     """
+     op = _resolve_client(client)
+     refs = _coerce_to_refs(documents, op)
+     submission = op.agentic_documents_api.tabularize(
+         TabularizeRequest.model_validate(
+             {
+                 "documents": [r.model_dump() for r in refs],
+                 "schema": _schema_to_payload(schema).model_dump(),
+                 "web_augmentation": web_augmentation,
+                 "concurrency": concurrency,
+             }
+         )
+     )
+     op.wait_for(submission.model_id)
+     result = op.agentic_documents_api.get_table(submission.model_id)
+     return DocumentDataset(
+         schema=_schema_from_payload(result.schema_),
+         document_ids=list(result.document_ids),
+         model_id=result.model_id,
+         label_column=label_column,
+     )
+
+
+ __all__ = [
+     "AnswerType",
+     "Document",
+     "DocumentDataset",
+     "DocumentQuestion",
+     "DocumentSchema",
+     "DocumentSet",
+     "induce_schema",
+     "tabularize",
+     "upload",
+ ]
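
Pulling the pieces together, a sketch of the full path from a directory of files to a reasoning model. The label column and other literals are assumptions (the label must be a question id present in the induced schema); ``op.reasoning.fit`` is the consumer named in both module docstrings::

    import outerproduct as op
    from outerproduct.agentic import documents as D

    op.init(api_key="op_live_...")
    docs = D.DocumentSet.from_directory("./invoices")
    refs = D.upload(docs)
    schema = D.induce_schema(refs, use_case="audit", skill="invoice")
    ds = D.tabularize(refs, schema, label_column="paid_on_time", concurrency=4)

    # The remote handle trains like any other Dataset (data_uploaded=True).
    reasoning_model = op.reasoning.fit(ds)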