outerproduct 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- outerproduct/__init__.py +153 -0
- outerproduct/agentic/__init__.py +16 -0
- outerproduct/agentic/documents.py +514 -0
- outerproduct/client/__init__.py +72 -0
- outerproduct/client/_http.py +61 -0
- outerproduct/client/apis.py +240 -0
- outerproduct/client/client.py +232 -0
- outerproduct/client/endpoints.py +267 -0
- outerproduct/client/exceptions.py +48 -0
- outerproduct/connector.py +718 -0
- outerproduct/dataset.py +469 -0
- outerproduct/job.py +173 -0
- outerproduct/model.py +642 -0
- outerproduct/py.typed +0 -0
- outerproduct/reasoning.py +282 -0
- outerproduct/trainer.py +418 -0
- outerproduct/utils.py +66 -0
- outerproduct-0.1.0.dist-info/METADATA +153 -0
- outerproduct-0.1.0.dist-info/RECORD +21 -0
- outerproduct-0.1.0.dist-info/WHEEL +4 -0
- outerproduct-0.1.0.dist-info/licenses/LICENSE +21 -0
outerproduct/__init__.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""OuterProduct — a Python SDK for training and explaining tabular ML models.
|
|
2
|
+
|
|
3
|
+
The package exposes a small, flat surface organised around the following
|
|
4
|
+
core concepts:
|
|
5
|
+
|
|
6
|
+
- :class:`~outerproduct.dataset.Dataset` — a tabular dataset with an optional label column and
|
|
7
|
+
per-column schema (:class:`~outerproduct.dataset.Column`).
|
|
8
|
+
- :class:`~outerproduct.connector.Connector` — a source of data (:class:`~outerproduct.connector.FileUploadConnector`,
|
|
9
|
+
:class:`~outerproduct.connector.S3Connector`, …) that produces a :class:`~outerproduct.dataset.Dataset`.
|
|
10
|
+
- :class:`~outerproduct.trainer.Trainer` — orchestrates general-purpose training and
|
|
11
|
+
returns a :class:`~outerproduct.model.Model` via :meth:`~outerproduct.trainer.Trainer.configure` and
|
|
12
|
+
:meth:`~outerproduct.trainer.Trainer.run`.
|
|
13
|
+
- :class:`~outerproduct.model.Model` — a trained predictor with :meth:`~outerproduct.model.Model.predict`.
|
|
14
|
+
- :class:`~outerproduct.model.Predictor` — a duck-typed wrapper around an external
|
|
15
|
+
prediction endpoint, used to hand black-box models into
|
|
16
|
+
:func:`~outerproduct.reasoning.fit` for distillation.
|
|
17
|
+
- :class:`~outerproduct.model.ReasoningModel` — a :class:`~outerproduct.model.Model` that also supports
|
|
18
|
+
:meth:`~outerproduct.model.ReasoningModel.explain`, :meth:`~outerproduct.model.ReasoningModel.interpret`,
|
|
19
|
+
:meth:`~outerproduct.model.ReasoningModel.scenario`, and :meth:`~outerproduct.model.ReasoningModel.segment`.
|
|
20
|
+
Produced by :func:`~outerproduct.reasoning.fit`.
|
|
21
|
+
- :class:`~outerproduct.reasoning.Reasoning` / :class:`~outerproduct.reasoning.ReasoningTrace` — the typed results of
|
|
22
|
+
reasoning calls.
|
|
23
|
+
|
|
24
|
+
Quickstart
|
|
25
|
+
----------
|
|
26
|
+
.. code-block:: python
|
|
27
|
+
|
|
28
|
+
import outerproduct as op
|
|
29
|
+
|
|
30
|
+
op.init(api_key="...")
|
|
31
|
+
connector = op.FileUploadConnector()
|
|
32
|
+
dataset = connector.upload("customers.csv", label_column="churn")
|
|
33
|
+
reasoning_model = op.reasoning.fit(dataset)
|
|
34
|
+
predictions = reasoning_model.predict(X_new)
|
|
35
|
+
reasoning = reasoning_model.explain(X_new)
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
from dataclasses import dataclass, field
|
|
41
|
+
from importlib.metadata import version as _pkg_version
|
|
42
|
+
|
|
43
|
+
from outerproduct_http_types import ConnectorType
|
|
44
|
+
|
|
45
|
+
from outerproduct import agentic
|
|
46
|
+
from outerproduct.connector import (
|
|
47
|
+
Connector,
|
|
48
|
+
DatabricksConnector,
|
|
49
|
+
FileUploadConnector,
|
|
50
|
+
S3Connector,
|
|
51
|
+
SnowflakeConnector,
|
|
52
|
+
)
|
|
53
|
+
from outerproduct.dataset import Column, Dataset
|
|
54
|
+
from outerproduct.job import Job, ReasoningJob, TrainerJob
|
|
55
|
+
from outerproduct.model import Model, ReasoningModel, predict
|
|
56
|
+
from outerproduct.reasoning import Reasoning, ReasoningTrace
|
|
57
|
+
from outerproduct.trainer import (
|
|
58
|
+
Hardware,
|
|
59
|
+
Metric,
|
|
60
|
+
ModalHardware,
|
|
61
|
+
Trainer,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Resolve the installed distribution's version once at import time.
try:
    __version__ = _pkg_version("outerproduct")
# Deliberately broad: any metadata failure (e.g. the package metadata is
# missing when running from a source checkout) degrades to a sentinel
# value rather than breaking `import outerproduct`.
except Exception:
    __version__ = "unknown"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
|
|
71
|
+
class _GlobalState:
|
|
72
|
+
"""Module-level state populated by :func:`~outerproduct.init`. Internal."""
|
|
73
|
+
|
|
74
|
+
api_key: str | None = None
|
|
75
|
+
base_url: str | None = None
|
|
76
|
+
hardware: Hardware | None = None
|
|
77
|
+
extra: dict[str, object] = field(default_factory=dict)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
_state = _GlobalState()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def init(
    api_key: str | None = None,
    *,
    base_url: str | None = None,
    hardware: Hardware | None = None,
) -> None:
    """Record SDK-wide settings for the current Python process.

    Call once near the top of your program. Later operations (training,
    prediction, uploads) read credentials, the API base URL, and the
    default execution backend from the module-level state written here.

    Parameters
    ----------
    api_key : str, optional
        Bearer token used to authenticate against the OuterProduct API.
        If omitted, the ``OUTERPRODUCT_API_KEY`` environment variable is
        consulted at use time.
    base_url : str, optional
        Override for the API base URL. If omitted, the SDK uses the
        production endpoint.
    hardware : Hardware, optional
        Default execution backend used by :class:`~outerproduct.trainer.Trainer`
        and :func:`~outerproduct.reasoning.fit`. If omitted, the API
        server falls back to a managed Modal worker.

    Examples
    --------
    .. code-block:: python

        import outerproduct as op
        op.init(api_key="op_live_...", hardware=op.ModalHardware())
    """
    _state.api_key, _state.base_url, _state.hardware = api_key, base_url, hardware
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Names re-exported at package top level; grouped by feature area.
__all__ = [
    # Bootstrap
    "init",
    "__version__",
    # Dataset
    "Dataset",
    "Column",
    # Jobs
    "Job",
    "TrainerJob",
    "ReasoningJob",
    # Model
    "Model",
    "ReasoningModel",
    "predict",
    # Reasoning
    "Reasoning",
    "ReasoningTrace",
    # Trainer
    "Trainer",
    "Hardware",
    "ModalHardware",
    "Metric",
    # Connector
    "Connector",
    "ConnectorType",
    "FileUploadConnector",
    "S3Connector",
    "SnowflakeConnector",
    "DatabricksConnector",
    # Agentic feature sourcing (submodule, accessed as op.agentic.*)
    "agentic",
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Agent-driven feature sourcing.
|
|
2
|
+
|
|
3
|
+
Everything under :mod:`outerproduct.agentic` is part of an agent-driven flow
|
|
4
|
+
that turns unstructured or semi-structured sources into typed tabular
|
|
5
|
+
features. The boundary is the *flow*, not whether each individual function
|
|
6
|
+
makes an LLM call — deterministic helpers (PDF parsing, batching, schema
|
|
7
|
+
persistence) live here too because they support the same contract.
|
|
8
|
+
|
|
9
|
+
Submodules
|
|
10
|
+
----------
|
|
11
|
+
- :mod:`outerproduct.agentic.documents` — files (PDF, image, text).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from outerproduct.agentic import documents
|
|
15
|
+
|
|
16
|
+
# The documents submodule is the package's sole public entry point.
__all__ = ["documents"]
|
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
"""Document tabularization — agent-driven feature sourcing for files.
|
|
2
|
+
|
|
3
|
+
One submodule of :mod:`outerproduct.agentic`. Treats files (PDF, image,
|
|
4
|
+
text) as a feature source: induce a frozen schema of survey questions
|
|
5
|
+
for the use case, then have the agent extract every document against
|
|
6
|
+
that schema. The tabularized rows live in object storage on the
|
|
7
|
+
OuterProduct backend; the SDK returns a :class:`DocumentDataset` that
|
|
8
|
+
references that remote table by ``model_id`` and slots into
|
|
9
|
+
:meth:`outerproduct.Trainer.configure` and
|
|
10
|
+
:func:`outerproduct.reasoning.fit` exactly like any other uploaded
|
|
11
|
+
:class:`~outerproduct.Dataset`.
|
|
12
|
+
|
|
13
|
+
Typical flow::
|
|
14
|
+
|
|
15
|
+
docs = DocumentSet.from_directory("./invoices")
|
|
16
|
+
refs = upload(docs) # one-time upload
|
|
17
|
+
schema = induce_schema(refs, use_case="audit", skill="invoice")
|
|
18
|
+
ds = tabularize(refs, schema) # remote DocumentDataset
|
|
19
|
+
trainer.run(ds) # uses data_uploaded=True
|
|
20
|
+
|
|
21
|
+
``upload`` is explicit so the same documents can feed both
|
|
22
|
+
:func:`induce_schema` and :func:`tabularize` without re-uploading.
|
|
23
|
+
For one-shot calls, ``induce_schema`` and ``tabularize`` also accept a
|
|
24
|
+
:class:`DocumentSet` directly and upload internally.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import json
|
|
30
|
+
import random as _random
|
|
31
|
+
import re
|
|
32
|
+
from collections.abc import Iterable, Iterator
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from enum import StrEnum
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Any, Literal
|
|
37
|
+
|
|
38
|
+
import httpx
|
|
39
|
+
from outerproduct_http_types import (
|
|
40
|
+
AnswerType as _AnswerTypePayload,
|
|
41
|
+
CreateDocumentUploadRequest,
|
|
42
|
+
DocumentRef,
|
|
43
|
+
InduceSchemaRequest,
|
|
44
|
+
Question as QuestionPayload,
|
|
45
|
+
Schema as SchemaPayload,
|
|
46
|
+
TabularizeRequest,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
from outerproduct.client import OuterProductClient
|
|
50
|
+
from outerproduct.dataset import Column, Dataset
|
|
51
|
+
|
|
52
|
+
# MIME types the SDK will upload; also used verbatim as the Content-Type
# header of each presigned-URL PUT (see _upload_document_set).
DocumentMediaType = Literal[
    "application/pdf",
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/webp",
    "text/plain",
]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class AnswerType(StrEnum):
|
|
63
|
+
"""The legal answer shapes for a :class:`DocumentQuestion`."""
|
|
64
|
+
|
|
65
|
+
BOOLEAN = "boolean"
|
|
66
|
+
NUMBER = "number"
|
|
67
|
+
INTEGER = "integer"
|
|
68
|
+
ENUM = "enum"
|
|
69
|
+
MULTI_ENUM = "multi_enum"
|
|
70
|
+
DATE = "date"
|
|
71
|
+
DATE_RANGE = "date_range"
|
|
72
|
+
STRING = "string"
|
|
73
|
+
TEXT = "text"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Question ids must be snake_case: a lowercase letter, then [a-z0-9_]*.
_ID_RE = re.compile(r"^[a-z][a-z0-9_]*$")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass(frozen=True)
class DocumentQuestion:
    """A single survey question inside a :class:`DocumentSchema`.

    Validated on construction: ``id`` must be snake_case, and ``enum``
    is required exactly when ``answer_type`` is one of the enum kinds.
    """

    id: str
    question: str
    answer_type: AnswerType
    rationale: str | None = None
    unit: str | None = None
    enum: tuple[str, ...] | None = None

    def __post_init__(self) -> None:
        if _ID_RE.match(self.id) is None:
            raise ValueError(f"DocumentQuestion.id={self.id!r} must be snake_case")
        enum_like = self.answer_type in (AnswerType.ENUM, AnswerType.MULTI_ENUM)
        if enum_like and not self.enum:
            raise ValueError(
                f"DocumentQuestion {self.id!r}: answer_type="
                f"{self.answer_type.value} requires non-empty `enum`"
            )
        if not enum_like and self.enum is not None:
            raise ValueError(
                f"DocumentQuestion {self.id!r}: `enum` is only valid for "
                "answer_type='enum' or 'multi_enum'"
            )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, omitting unset optional fields."""
        payload: dict[str, Any] = {
            "id": self.id,
            "question": self.question,
            "answer_type": self.answer_type.value,
        }
        if self.rationale is not None:
            payload["rationale"] = self.rationale
        if self.unit is not None:
            payload["unit"] = self.unit
        if self.enum is not None:
            payload["enum"] = list(self.enum)
        return payload

    @classmethod
    def from_dict(cls, obj: dict[str, Any]) -> DocumentQuestion:
        """Inverse of :meth:`to_dict`; missing optional keys become None."""
        raw_enum = obj.get("enum")
        return cls(
            id=obj["id"],
            question=obj["question"],
            answer_type=AnswerType(obj["answer_type"]),
            rationale=obj.get("rationale"),
            unit=obj.get("unit"),
            enum=None if raw_enum is None else tuple(raw_enum),
        )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
class DocumentSchema:
    """An ordered, frozen list of :class:`DocumentQuestion` for one skill
    and one use case.

    Question ids must be unique. Persist to disk with :meth:`save` and
    restore with :meth:`load` (JSON, ``_format_version`` 1).
    """

    skill: str
    use_case: str
    questions: list[DocumentQuestion]
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Reject duplicate ids up front so downstream column names are unique.
        seen: set[str] = set()
        for question in self.questions:
            if question.id in seen:
                raise ValueError(f"DocumentSchema: duplicate question id {question.id!r}")
            seen.add(question.id)

    @property
    def question_ids(self) -> list[str]:
        """Ids of all questions, in schema order."""
        return [question.id for question in self.questions]

    def get(self, question_id: str) -> DocumentQuestion:
        """Return the question with ``question_id``; raise KeyError if absent."""
        found = next((q for q in self.questions if q.id == question_id), None)
        if found is None:
            raise KeyError(question_id)
        return found

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, stamping ``_format_version`` = 1."""
        return {
            "skill": self.skill,
            "use_case": self.use_case,
            "questions": [question.to_dict() for question in self.questions],
            "metadata": self.metadata,
            "_format_version": 1,
        }

    @classmethod
    def from_dict(cls, obj: dict[str, Any]) -> DocumentSchema:
        """Inverse of :meth:`to_dict`; ``_format_version`` is ignored."""
        return cls(
            skill=obj["skill"],
            use_case=obj["use_case"],
            questions=[DocumentQuestion.from_dict(q) for q in obj["questions"]],
            metadata=obj.get("metadata", {}),
        )

    def save(self, path: str | Path) -> None:
        """Write the schema as pretty-printed UTF-8 JSON to ``path``."""
        Path(path).write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls, path: str | Path) -> DocumentSchema:
        """Read back a schema previously written by :meth:`save`."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# File suffixes (compared lowercased) the SDK will ingest, by family.
_PDF_SUFFIXES = {".pdf"}
_IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
_TEXT_SUFFIXES = {".txt", ".md"}
_SUPPORTED_SUFFIXES = _PDF_SUFFIXES | _IMAGE_SUFFIXES | _TEXT_SUFFIXES
# Suffix -> MIME type used on upload; note .md is sent as plain text.
_MIME_BY_SUFFIX: dict[str, DocumentMediaType] = {
    ".pdf": "application/pdf",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".txt": "text/plain",
    ".md": "text/plain",
}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@dataclass(frozen=True)
class Document:
    """A single document: identity plus raw bytes and a MIME type."""

    # Stable identifier; defaults to the filename stem when path-loaded.
    document_id: str
    bytes_: bytes
    media_type: DocumentMediaType
    # Originating path when loaded from disk; None for in-memory text.
    source: Path | None = None

    @classmethod
    def from_path(cls, path: str | Path, *, document_id: str | None = None) -> Document:
        """Load one file from disk, inferring the MIME type from its suffix.

        Raises ValueError for suffixes outside the supported set.
        """
        resolved = Path(path)
        suffix = resolved.suffix.lower()
        media_type = _MIME_BY_SUFFIX.get(suffix)
        if media_type is None:
            raise ValueError(
                f"Document.from_path: unsupported suffix {suffix!r} for {resolved}; "
                f"supported: {sorted(_SUPPORTED_SUFFIXES)}"
            )
        return cls(
            # Falsy document_id (None or "") falls back to the file stem.
            document_id=document_id or resolved.stem,
            bytes_=resolved.read_bytes(),
            media_type=media_type,
            source=resolved,
        )

    @classmethod
    def from_text(cls, text: str, *, document_id: str) -> Document:
        """Wrap an in-memory string as a UTF-8 ``text/plain`` document."""
        return cls(
            document_id=document_id,
            bytes_=text.encode("utf-8"),
            media_type="text/plain",
        )
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
@dataclass
|
|
236
|
+
class DocumentSet:
|
|
237
|
+
"""An ordered collection of :class:`Document`."""
|
|
238
|
+
|
|
239
|
+
documents: list[Document] = field(default_factory=list)
|
|
240
|
+
|
|
241
|
+
@classmethod
|
|
242
|
+
def from_directory(
|
|
243
|
+
cls,
|
|
244
|
+
path: str | Path,
|
|
245
|
+
*,
|
|
246
|
+
glob: str = "*",
|
|
247
|
+
recursive: bool = False,
|
|
248
|
+
) -> DocumentSet:
|
|
249
|
+
root = Path(path)
|
|
250
|
+
if not root.is_dir():
|
|
251
|
+
raise NotADirectoryError(
|
|
252
|
+
f"DocumentSet.from_directory: {root} is not a directory"
|
|
253
|
+
)
|
|
254
|
+
iterator = root.rglob(glob) if recursive else root.glob(glob)
|
|
255
|
+
docs = [
|
|
256
|
+
Document.from_path(p)
|
|
257
|
+
for p in sorted(iterator)
|
|
258
|
+
if p.is_file() and p.suffix.lower() in _SUPPORTED_SUFFIXES
|
|
259
|
+
]
|
|
260
|
+
return cls(documents=docs)
|
|
261
|
+
|
|
262
|
+
@classmethod
|
|
263
|
+
def from_paths(cls, paths: Iterable[str | Path]) -> DocumentSet:
|
|
264
|
+
return cls(documents=[Document.from_path(p) for p in paths])
|
|
265
|
+
|
|
266
|
+
def sample(self, n: int, *, seed: int = 0) -> DocumentSet:
|
|
267
|
+
if n <= 0:
|
|
268
|
+
raise ValueError(f"sample: n must be positive, got {n}")
|
|
269
|
+
if n >= len(self.documents):
|
|
270
|
+
return DocumentSet(documents=list(self.documents))
|
|
271
|
+
rng = _random.Random(seed)
|
|
272
|
+
return DocumentSet(documents=rng.sample(self.documents, n))
|
|
273
|
+
|
|
274
|
+
def __iter__(self) -> Iterator[Document]:
|
|
275
|
+
return iter(self.documents)
|
|
276
|
+
|
|
277
|
+
def __len__(self) -> int:
|
|
278
|
+
return len(self.documents)
|
|
279
|
+
|
|
280
|
+
def __getitem__(self, index: int) -> Document:
|
|
281
|
+
return self.documents[index]
|
|
282
|
+
|
|
283
|
+
def __repr__(self) -> str:
|
|
284
|
+
return f"DocumentSet(n={len(self)})"
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class DocumentDataset(Dataset):
    """A remote-handle :class:`Dataset` whose rows are tabularized
    documents and whose columns are :class:`DocumentSchema` questions.

    Holds no rows locally. The tabularized table lives on the
    OuterProduct backend under ``model_id`` (the same path used by
    pre-uploaded datasets), and the trainer/reasoning APIs fetch from
    that location server-side via the standard ``data_uploaded=True``
    wire mode.

    Construct directly when you already have a tabularize ``model_id``,
    or — more commonly — let :func:`tabularize` build one for you.
    """

    def __init__(
        self,
        *,
        schema: DocumentSchema,
        document_ids: list[str],
        model_id: str,
        label_column: str | None = None,
        columns: list[Column] | None = None,
    ) -> None:
        # Local import keeps pandas off the module import path until a
        # dataset is actually constructed.
        import pandas as pd

        # The base Dataset needs a frame; an empty one whose columns are
        # the schema's question ids is enough for a remote handle.
        empty = pd.DataFrame(columns=pd.Index([q.id for q in schema.questions]))
        super().__init__(empty, label_column=label_column, columns=columns)
        self._upload_id = model_id
        self._schema = schema
        self._document_ids = list(document_ids)

    @property
    def schema(self) -> DocumentSchema:
        """The frozen question schema the table was extracted against."""
        return self._schema

    @property
    def document_ids(self) -> list[str]:
        """Ids of the source documents, one per row (defensive copy)."""
        return list(self._document_ids)

    @property
    def model_id(self) -> str:
        """The tabularize-job ``model_id`` that backs this dataset on the server."""
        # Explicit check rather than `assert`: asserts are stripped under
        # `python -O`, which would have let a missing id flow through as None.
        if self._upload_id is None:
            raise RuntimeError("DocumentDataset has no backing model_id")
        return self._upload_id

    @property
    def n_samples(self) -> int:
        """Number of rows, i.e. number of tabularized documents."""
        return len(self._document_ids)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
# --------------------------------------------------------------------------- #
|
|
338
|
+
# Wire conversions between the in-SDK data classes and the http-types models. #
|
|
339
|
+
# --------------------------------------------------------------------------- #
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _question_to_payload(q: DocumentQuestion) -> QuestionPayload:
    """Convert an SDK question into its wire (http-types) model."""
    enum_values = None if q.enum is None else list(q.enum)
    return QuestionPayload(
        id=q.id,
        question=q.question,
        answer_type=_AnswerTypePayload(q.answer_type.value),
        rationale=q.rationale,
        unit=q.unit,
        enum=enum_values,
    )
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _question_from_payload(p: QuestionPayload) -> DocumentQuestion:
    """Convert a wire (http-types) question back into the SDK model."""
    enum_values = None if p.enum is None else tuple(p.enum)
    return DocumentQuestion(
        id=p.id,
        question=p.question,
        answer_type=AnswerType(p.answer_type.value),
        rationale=p.rationale,
        unit=p.unit,
        enum=enum_values,
    )
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _schema_to_payload(s: DocumentSchema) -> SchemaPayload:
    """Convert an SDK schema into its wire (http-types) model."""
    wire_questions = [_question_to_payload(question) for question in s.questions]
    return SchemaPayload(
        skill=s.skill,
        use_case=s.use_case,
        questions=wire_questions,
        metadata=dict(s.metadata),
    )
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _schema_from_payload(p: SchemaPayload) -> DocumentSchema:
    """Convert a wire (http-types) schema back into the SDK model."""
    sdk_questions = [_question_from_payload(question) for question in p.questions]
    return DocumentSchema(
        skill=p.skill,
        use_case=p.use_case,
        questions=sdk_questions,
        metadata=dict(p.metadata),
    )
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _resolve_client(client: OuterProductClient | None) -> OuterProductClient:
    """Return ``client`` unchanged, or build one from ambient credentials."""
    if client is not None:
        return client
    return OuterProductClient.from_credentials()
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _coerce_to_refs(
    documents: DocumentSet | list[DocumentRef],
    client: OuterProductClient,
) -> list[DocumentRef]:
    """Normalize either input form to a list of :class:`DocumentRef`.

    A :class:`DocumentSet` is uploaded internally; a list of refs from a
    prior :func:`upload` call is shallow-copied and returned as-is.
    """
    if not isinstance(documents, DocumentSet):
        return list(documents)
    return _upload_document_set(documents, client)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _upload_document_set(
    documents: DocumentSet, client: OuterProductClient
) -> list[DocumentRef]:
    """Upload each document via its presigned URL and collect the refs.

    For every document: ask the API for a presigned upload slot, PUT the
    raw bytes there with the document's MIME type, and record the
    resulting :class:`DocumentRef`. Raises ``httpx.HTTPStatusError`` if
    any PUT fails.
    """
    refs: list[DocumentRef] = []
    # Reuse one HTTP connection pool for the whole batch instead of the
    # module-level httpx.put, which opens a fresh connection per file.
    with httpx.Client() as http:
        for doc in documents:
            resp = client.uploads_api.create_document(
                CreateDocumentUploadRequest(
                    document_id=doc.document_id, media_type=doc.media_type
                )
            )
            put = http.put(
                resp.upload_url,
                content=doc.bytes_,
                headers={"Content-Type": doc.media_type},
            )
            put.raise_for_status()
            refs.append(
                DocumentRef(
                    document_id=resp.document_id,
                    upload_key=resp.upload_key,
                    media_type=resp.media_type,
                )
            )
    return refs
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
# --------------------------------------------------------------------------- #
|
|
424
|
+
# Public entry points #
|
|
425
|
+
# --------------------------------------------------------------------------- #
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def upload(
    documents: DocumentSet, *, client: OuterProductClient | None = None
) -> list[DocumentRef]:
    """Upload every document in ``documents`` via per-file presigned URLs.

    Returns a list of :class:`DocumentRef` you can pass to
    :func:`induce_schema` and :func:`tabularize` — uploading once and
    reusing the refs avoids re-uploading the same bytes for both jobs.
    """
    return _upload_document_set(documents, _resolve_client(client))
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def induce_schema(
    documents: DocumentSet | list[DocumentRef],
    *,
    use_case: str,
    skill: str,
    client: OuterProductClient | None = None,
) -> DocumentSchema:
    """Run the agent over the documents to produce a frozen schema.

    Pass either a :class:`DocumentSet` (uploaded internally) or a list
    of :class:`DocumentRef` from a previous :func:`upload` call. Submits
    the job, blocks until it completes, and returns the produced
    :class:`DocumentSchema`.
    """
    resolved = _resolve_client(client)
    refs = _coerce_to_refs(documents, resolved)
    request = InduceSchemaRequest(documents=refs, use_case=use_case, skill=skill)
    submission = resolved.agentic_documents_api.induce_schema(request)
    # Poll until the agent job finishes, then fetch its output.
    resolved.wait_for(submission.model_id)
    result = resolved.agentic_documents_api.get_schema(submission.model_id)
    return _schema_from_payload(result.schema_)
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def tabularize(
    documents: DocumentSet | list[DocumentRef],
    schema: DocumentSchema,
    *,
    web_augmentation: bool = False,
    concurrency: int = 1,
    label_column: str | None = None,
    client: OuterProductClient | None = None,
) -> DocumentDataset:
    """Extract every document against ``schema`` and return a :class:`DocumentDataset`.

    The tabularized rows are written to object storage on the backend;
    the returned :class:`DocumentDataset` is a remote handle that
    :class:`~outerproduct.Trainer` and :func:`~outerproduct.reasoning.fit`
    consume directly via the standard ``data_uploaded=True`` wire mode.
    No rows are downloaded to the client.
    """
    resolved = _resolve_client(client)
    refs = _coerce_to_refs(documents, resolved)
    # NOTE(review): the request is built via model_validate on a plain
    # dict — presumably because "schema" clashes with a reserved name on
    # the wire model (its accessor is `schema_`); confirm before changing.
    payload = {
        "documents": [ref.model_dump() for ref in refs],
        "schema": _schema_to_payload(schema).model_dump(),
        "web_augmentation": web_augmentation,
        "concurrency": concurrency,
    }
    submission = resolved.agentic_documents_api.tabularize(
        TabularizeRequest.model_validate(payload)
    )
    # Poll until the extraction job finishes, then fetch the table handle.
    resolved.wait_for(submission.model_id)
    result = resolved.agentic_documents_api.get_table(submission.model_id)
    return DocumentDataset(
        schema=_schema_from_payload(result.schema_),
        document_ids=list(result.document_ids),
        model_id=result.model_id,
        label_column=label_column,
    )
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
# Public names exported by `from outerproduct.agentic.documents import *`.
__all__ = [
    "AnswerType",
    "Document",
    "DocumentDataset",
    "DocumentQuestion",
    "DocumentSchema",
    "DocumentSet",
    "induce_schema",
    "tabularize",
    "upload",
]
|