openaivec 0.14.8__py3-none-any.whl → 0.14.10__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- openaivec/_dynamic.py +350 -0
- openaivec/_schema.py +101 -278
- openaivec/pandas_ext.py +370 -354
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/METADATA +1 -1
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/RECORD +7 -6
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/WHEEL +0 -0
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/licenses/LICENSE +0 -0
openaivec/_schema.py
CHANGED
|
@@ -3,165 +3,89 @@
|
|
|
3
3
|
This (non-public) module converts a small *representative* sample of free‑text
|
|
4
4
|
examples plus a *purpose* statement into:
|
|
5
5
|
|
|
6
|
-
1. A vetted
|
|
7
|
-
|
|
6
|
+
1. A vetted hierarchical object specification (``ObjectSpec``) whose recursively
|
|
7
|
+
defined ``fields`` (``FieldSpec``) capture reliably extractable signals.
|
|
8
8
|
2. A reusable, self‑contained extraction prompt (``inference_prompt``) that
|
|
9
|
-
|
|
10
|
-
3. A dynamically generated Pydantic model
|
|
11
|
-
|
|
9
|
+
freezes the agreed schema contract (no additions / renames / omissions).
|
|
10
|
+
3. A dynamically generated Pydantic model mirroring the hierarchical schema,
|
|
11
|
+
enabling immediate typed parsing with the OpenAI Responses API.
|
|
12
12
|
4. A ``PreparedTask`` wrapper (``InferredSchema.task``) for downstream batched
|
|
13
|
-
|
|
13
|
+
responses / structured extraction flows in pandas or Spark.
|
|
14
14
|
|
|
15
15
|
Core goals:
|
|
16
16
|
* Minimize manual, subjective schema design iterations.
|
|
17
17
|
* Enforce objective naming / typing / enum rules early (guard rails rather than
|
|
18
|
-
|
|
19
|
-
* Provide deterministic reusability: the same prompt + model yield stable
|
|
20
|
-
|
|
18
|
+
after‑the‑fact cleaning).
|
|
19
|
+
* Provide deterministic reusability: the same prompt + model yield stable field
|
|
20
|
+
ordering & types for analytics or feature engineering.
|
|
21
21
|
* Avoid outcome / target label leakage in predictive (feature engineering)
|
|
22
|
-
|
|
22
|
+
contexts by explicitly excluding direct target restatements.
|
|
23
23
|
|
|
24
24
|
This module is intentionally **internal** (``__all__ = []``). Public users
|
|
25
25
|
should interact through higher‑level batch APIs once a schema has been inferred.
|
|
26
26
|
|
|
27
|
-
Design constraints:
|
|
28
|
-
*
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
*
|
|
32
|
-
|
|
33
|
-
|
|
27
|
+
Design constraints (updated):
|
|
28
|
+
* Root: single ``ObjectSpec`` (UpperCamelCase name) containing one or more fields.
|
|
29
|
+
* Field types: string | integer | float | boolean | enum | object |
|
|
30
|
+
string_array | integer_array | float_array | boolean_array | enum_array | object_array
|
|
31
|
+
* Arrays are homogeneous lists of their base type.
|
|
32
|
+
* Nested objects / arrays of objects are allowed when semantically cohesive; keep
|
|
33
|
+
depth shallow and avoid gratuitous nesting.
|
|
34
|
+
* Enumerations use ``enum_spec`` with explicit ``name`` (UpperCamelCase) and 1–24
|
|
35
|
+
raw label values (project constant). Values collapse by uppercasing; order not guaranteed.
|
|
36
|
+
* Field names: lower_snake_case; unique per containing object.
|
|
37
|
+
* Boolean names: affirmative 'is_' prefix.
|
|
38
|
+
* Numeric (integer/float) names encode unit / measure suffix (e.g. *_count, *_ratio, *_ms).
|
|
34
39
|
* Validation retries ensure a structurally coherent suggestion before returning.
|
|
35
40
|
|
|
36
41
|
Example (conceptual):
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
42
|
+
from openai import OpenAI
|
|
43
|
+
client = OpenAI()
|
|
44
|
+
inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
|
|
45
|
+
schema = inferer.infer_schema(
|
|
46
|
+
SchemaInferenceInput(
|
|
47
|
+
examples=["Order #123 delayed due to weather", "Order #456 delivered"],
|
|
48
|
+
purpose="Extract operational status signals for logistics analytics",
|
|
49
|
+
)
|
|
50
|
+
)
|
|
51
|
+
Model = schema.model # dynamic Pydantic model
|
|
52
|
+
task = schema.task # PreparedTask for batch extraction
|
|
48
53
|
|
|
49
54
|
The implementation purposefully does *not* emit or depend on JSON Schema; the
|
|
50
|
-
authoritative contract is the
|
|
55
|
+
authoritative contract is the recursive ``ObjectSpec`` tree.
|
|
51
56
|
"""
|
|
52
57
|
|
|
53
58
|
from dataclasses import dataclass
|
|
54
|
-
from enum import Enum
|
|
55
|
-
from typing import Literal
|
|
56
59
|
|
|
57
60
|
from openai import OpenAI
|
|
58
61
|
from openai.types.responses import ParsedResponse
|
|
59
|
-
from pydantic import BaseModel, Field
|
|
62
|
+
from pydantic import BaseModel, Field
|
|
60
63
|
|
|
64
|
+
from openaivec._dynamic import ObjectSpec, _build_model
|
|
61
65
|
from openaivec._model import PreparedTask
|
|
62
66
|
|
|
63
67
|
# Internal module: explicitly not part of public API
|
|
64
68
|
__all__: list[str] = []
|
|
65
69
|
|
|
66
70
|
|
|
67
|
-
class FieldSpec(BaseModel):
|
|
68
|
-
"""Specification for a single candidate output field.
|
|
69
|
-
|
|
70
|
-
Each ``FieldSpec`` encodes a *flat*, scalar, semantically atomic unit the
|
|
71
|
-
model should extract. These become columns in downstream DataFrames.
|
|
72
|
-
|
|
73
|
-
Validation focuses on: objective naming, primitive typing, and *optional*
|
|
74
|
-
closed categorical vocabularies. Enumerations are intentionally conservative
|
|
75
|
-
(must derive from clear evidence) to reduce over‑fitted schemas.
|
|
76
|
-
|
|
77
|
-
Attributes:
|
|
78
|
-
name: Lower snake_case unique identifier (regex ^[a-z][a-z0-9_]*$). Avoid
|
|
79
|
-
subjective modifiers ("best", "great", "high_quality").
|
|
80
|
-
type: One of ``string|integer|float|boolean``. ``integer`` only if all
|
|
81
|
-
observed numeric values are whole numbers; ``float`` if any decimal
|
|
82
|
-
or ratio appears. ``boolean`` strictly for explicit binary forms.
|
|
83
|
-
description: Concise, objective extraction rule (what qualifies / what
|
|
84
|
-
to ignore). Disambiguate from overlapping fields if needed.
|
|
85
|
-
enum_values: Optional stable closed set of lowercase string labels
|
|
86
|
-
(2–24). Only for *string* type when the vocabulary is clearly
|
|
87
|
-
evidenced; never hallucinate or extrapolate.
|
|
88
|
-
"""
|
|
89
|
-
|
|
90
|
-
name: str = Field(
|
|
91
|
-
description=(
|
|
92
|
-
"Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
|
|
93
|
-
"express the semantic meaning succinctly (no adjectives like 'best', 'great'). For numeric (integer|float) "
|
|
94
|
-
"fields the name MUST include an explicit unit or measure suffix (e.g. _count, _total_count, "
|
|
95
|
-
"_duration_seconds, _ms, _price_usd, _ratio, _score) to eliminate ambiguity. Avoid bare numeric nouns like "
|
|
96
|
-
"'duration' or 'value' without unit/scale. Boolean field names MUST begin with 'is_' followed by a "
|
|
97
|
-
"descriptive predicate (e.g. is_active, is_delayed). Use positive forms (is_active) rather than "
|
|
98
|
-
"negated forms (is_not_active)."
|
|
99
|
-
)
|
|
100
|
-
)
|
|
101
|
-
type: Literal[
|
|
102
|
-
"string",
|
|
103
|
-
"integer",
|
|
104
|
-
"float",
|
|
105
|
-
"boolean",
|
|
106
|
-
"string_array",
|
|
107
|
-
"integer_array",
|
|
108
|
-
"float_array",
|
|
109
|
-
"boolean_array",
|
|
110
|
-
] = Field(
|
|
111
|
-
description=(
|
|
112
|
-
"Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
|
|
113
|
-
"Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
|
|
114
|
-
"explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
|
|
115
|
-
"Array variants (string_array, integer_array, float_array, boolean_array) are ONLY allowed when the value "
|
|
116
|
-
"is a repeatable homogeneous collection whose individual elements would otherwise stand as valid scalar "
|
|
117
|
-
"extractions (e.g. keywords, error_codes, tag_ids). Do not encode objects or mixed-type arrays; flatten or "
|
|
118
|
-
"choose the most informative level."
|
|
119
|
-
)
|
|
120
|
-
)
|
|
121
|
-
description: str = Field(
|
|
122
|
-
description=(
|
|
123
|
-
"Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
|
|
124
|
-
"speculative, or promotional language. If ambiguity exists with another field, clarify the distinction. "
|
|
125
|
-
"Do NOT simply restate an original JSON/key name if the examples are already structured; only include a "
|
|
126
|
-
"raw key verbatim when it is already the minimal, irreducible analytic unit. For derived fields, clearly "
|
|
127
|
-
"state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
|
|
128
|
-
)
|
|
129
|
-
)
|
|
130
|
-
enum_values: list[str] | None = Field(
|
|
131
|
-
default=None,
|
|
132
|
-
description=(
|
|
133
|
-
"Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
|
|
134
|
-
"stable vocabulary (2–24 lowercase tokens) is clearly evidenced or strongly implied by examples. "
|
|
135
|
-
"Do NOT invent labels. Omit if open-ended or ambiguous. Order must be stable and semantically natural."
|
|
136
|
-
),
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
|
|
140
71
|
class InferredSchema(BaseModel):
|
|
141
72
|
"""Result of a schema inference round.
|
|
142
73
|
|
|
143
|
-
Contains the normalized *purpose*,
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
adding novel unstated facts is disallowed to preserve traceability.
|
|
74
|
+
Contains the normalized *purpose*, objective *examples_summary*, the root
|
|
75
|
+
hierarchical ``object_spec`` contract, and the canonical reusable
|
|
76
|
+
``inference_prompt``. The prompt MUST be fully derivable from the other
|
|
77
|
+
components (no new unstated facts) to preserve traceability.
|
|
148
78
|
|
|
149
79
|
Attributes:
|
|
150
|
-
purpose: Unambiguous restatement of the user's objective
|
|
151
|
-
redundancy removed).
|
|
80
|
+
purpose: Unambiguous restatement of the user's objective.
|
|
152
81
|
examples_summary: Neutral description of structural / semantic patterns
|
|
153
|
-
observed in the examples
|
|
154
|
-
examples_purpose_alignment:
|
|
155
|
-
recurring
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
fields: Ordered list of ``FieldSpec`` objects comprising the schema's
|
|
161
|
-
sole authoritative contract.
|
|
162
|
-
inference_prompt: Self-contained extraction instructions enforcing an
|
|
163
|
-
exact field set (names, order, primitive types) with prohibition on
|
|
164
|
-
alterations or subjective flourishes.
|
|
82
|
+
observed in the examples.
|
|
83
|
+
examples_purpose_alignment: Mapping from purpose facets to concrete
|
|
84
|
+
recurring evidence (or explicit gaps) anchoring extraction scope.
|
|
85
|
+
object_spec: Root ``ObjectSpec`` (UpperCamelCase name) whose ``fields``
|
|
86
|
+
recursively define the extraction schema.
|
|
87
|
+
inference_prompt: Canonical instructions enforcing exact field names,
|
|
88
|
+
hierarchy, and types (no additions/removals/renames).
|
|
165
89
|
"""
|
|
166
90
|
|
|
167
91
|
purpose: str = Field(
|
|
@@ -183,20 +107,17 @@ class InferredSchema(BaseModel):
|
|
|
183
107
|
"reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
|
|
184
108
|
)
|
|
185
109
|
)
|
|
186
|
-
|
|
110
|
+
object_spec: ObjectSpec = Field(
|
|
187
111
|
description=(
|
|
188
|
-
"
|
|
189
|
-
"
|
|
112
|
+
"Root ObjectSpec (recursive). Each contained object's field list is unique-name ordered and derived "
|
|
113
|
+
"strictly from observable, repeatable signals aligned with the purpose."
|
|
190
114
|
)
|
|
191
115
|
)
|
|
192
116
|
inference_prompt: str = Field(
|
|
193
117
|
description=(
|
|
194
|
-
"Canonical, reusable extraction prompt
|
|
195
|
-
"
|
|
196
|
-
"
|
|
197
|
-
"and primitive types; (2) forbid adding, removing, or renaming fields; (3) avoid subjective or "
|
|
198
|
-
"marketing language; (4) be self-contained (no TODOs, no external references, no unresolved "
|
|
199
|
-
"placeholders). Intended for direct reuse as the prompt for deterministic alignment with 'fields'."
|
|
118
|
+
"Canonical, reusable extraction prompt. Must be derivable from purpose + summaries + object_spec. Enforces "
|
|
119
|
+
"exact hierarchical field set (names, order per object, types) forbidding additions, removals, renames, or "
|
|
120
|
+
"subjective language. Self-contained (no TODOs, external refs, or placeholders)."
|
|
200
121
|
)
|
|
201
122
|
)
|
|
202
123
|
|
|
@@ -236,53 +157,7 @@ class InferredSchema(BaseModel):
|
|
|
236
157
|
)
|
|
237
158
|
|
|
238
159
|
def build_model(self) -> type[BaseModel]:
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
Implementation details:
|
|
242
|
-
* Maps primitive types: string→``str``, integer→``int``, float→``float``, boolean→``bool``.
|
|
243
|
-
* For enumerated string fields, constructs an ad‑hoc ``Enum`` subclass with
|
|
244
|
-
stable member names (collision‑safe, normalized to ``UPPER_SNAKE``).
|
|
245
|
-
* All fields are required (ellipsis ``...``). Optionality can be
|
|
246
|
-
introduced later by modifying this logic if needed.
|
|
247
|
-
|
|
248
|
-
Returns:
|
|
249
|
-
type[BaseModel]: New (not cached) model type; order matches ``fields``.
|
|
250
|
-
"""
|
|
251
|
-
type_map: dict[str, type] = {
|
|
252
|
-
"string": str,
|
|
253
|
-
"integer": int,
|
|
254
|
-
"float": float,
|
|
255
|
-
"boolean": bool,
|
|
256
|
-
}
|
|
257
|
-
fields: dict[str, tuple[type, object]] = {}
|
|
258
|
-
|
|
259
|
-
for spec in self.fields:
|
|
260
|
-
py_type: type
|
|
261
|
-
if spec.enum_values:
|
|
262
|
-
enum_class_name = "Enum_" + "".join(part.capitalize() for part in spec.name.split("_"))
|
|
263
|
-
members: dict[str, str] = {}
|
|
264
|
-
for raw in spec.enum_values:
|
|
265
|
-
sanitized = raw.upper().replace("-", "_").replace(" ", "_")
|
|
266
|
-
if not sanitized or sanitized[0].isdigit():
|
|
267
|
-
sanitized = f"V_{sanitized}"
|
|
268
|
-
base = sanitized
|
|
269
|
-
i = 2
|
|
270
|
-
while sanitized in members:
|
|
271
|
-
sanitized = f"{base}_{i}"
|
|
272
|
-
i += 1
|
|
273
|
-
members[sanitized] = raw
|
|
274
|
-
enum_cls = Enum(enum_class_name, members) # type: ignore[arg-type]
|
|
275
|
-
py_type = enum_cls
|
|
276
|
-
else:
|
|
277
|
-
if spec.type.endswith("_array"):
|
|
278
|
-
base = spec.type.rsplit("_", 1)[0]
|
|
279
|
-
py_type = list[type_map[base]] # type: ignore[index]
|
|
280
|
-
else:
|
|
281
|
-
py_type = type_map[spec.type]
|
|
282
|
-
fields[spec.name] = (py_type, Field(description=spec.description))
|
|
283
|
-
|
|
284
|
-
model = create_model("InferredSchema", **fields) # type: ignore[call-arg]
|
|
285
|
-
return model
|
|
160
|
+
return _build_model(self.object_spec)
|
|
286
161
|
|
|
287
162
|
def save(self, path: str) -> None:
|
|
288
163
|
"""Persist this inferred schema as pretty‑printed JSON.
|
|
@@ -326,56 +201,41 @@ You are a schema inference engine.
|
|
|
326
201
|
Task:
|
|
327
202
|
1. Normalize the user's purpose (eliminate ambiguity, redundancy, contradictions).
|
|
328
203
|
2. Objectively summarize observable patterns in the example texts.
|
|
329
|
-
3. Produce an "examples_purpose_alignment" explanation
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
This MUST NOT introduce new domain facts beyond the examples & purpose.
|
|
333
|
-
4. Propose a minimal flat set of scalar fields (and ONLY when justified,
|
|
334
|
-
homogeneous primitive arrays) that are reliably extractable.
|
|
204
|
+
3. Produce an "examples_purpose_alignment" explanation mapping purpose facets to concrete recurring evidence (or gaps).
|
|
205
|
+
4. Propose a minimal hierarchical schema (root ObjectSpec) comprised of reliably extractable fields. Use nesting ONLY
|
|
206
|
+
when a group of fields forms a cohesive sub-entity repeated in the data; otherwise keep flat.
|
|
335
207
|
5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
|
|
336
|
-
6. Provide
|
|
337
|
-
|
|
338
|
-
7. If the purpose indicates prediction (predict / probability / likelihood),
|
|
339
|
-
|
|
208
|
+
6. Provide enum_spec ONLY when a small stable closed categorical set (1–{_MAX_ENUM_VALUES} raw tokens) is clearly
|
|
209
|
+
evidenced; never invent unseen categories.
|
|
210
|
+
7. If the purpose indicates prediction (predict / probability / likelihood),
|
|
211
|
+
output only explanatory features (no target restatement).
|
|
340
212
|
|
|
341
213
|
Rules:
|
|
342
|
-
-
|
|
343
|
-
-
|
|
344
|
-
|
|
345
|
-
*
|
|
346
|
-
*
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
- Boolean
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
-
|
|
356
|
-
|
|
357
|
-
- Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
|
|
358
|
-
- enum_values only for string fields with stable closed vocab; omit otherwise.
|
|
359
|
-
- Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
|
|
360
|
-
in predictive / feature engineering contexts.
|
|
361
|
-
- When examples already appear as serialized JSON / key-value records, DO NOT merely relist the
|
|
362
|
-
raw original keys unless each is already an atomic, irreducible analytic signal. Prefer high-signal
|
|
363
|
-
derived / normalized / aggregated features (e.g. sentiment, category, language_code, boolean flags,
|
|
364
|
-
normalized_date, count metrics).
|
|
365
|
-
- Superficial renames (adding trivial prefixes/suffixes like _value, _field, new_) are forbidden; a new
|
|
366
|
-
field name must reflect a semantic transformation.
|
|
367
|
-
- Keep field count focused (typically <= 12) prioritizing reusable analytical / ML features over low-signal
|
|
368
|
-
restatements.
|
|
369
|
-
- If you retain an original raw key unchanged, its description must justify why it is minimal and cannot
|
|
370
|
-
be further decomposed without losing analytical value.
|
|
214
|
+
- Field names: lower snake_case, unique within each object, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
|
|
215
|
+
- Field types: string | integer | float | boolean | enum | object | string_array | integer_array | float_array |
|
|
216
|
+
boolean_array | enum_array | object_array
|
|
217
|
+
* *_array are homogeneous lists of their primitive / enum / object base type.
|
|
218
|
+
* Use object/object_array ONLY for semantically cohesive grouped attributes; avoid gratuitous layers.
|
|
219
|
+
- Enumerations: use enum_spec { name (UpperCamelCase), values [raw_tokens...] }. values length 1–{_MAX_ENUM_VALUES}.
|
|
220
|
+
Use ONLY when closed set is evidenced. Otherwise, use string.
|
|
221
|
+
- Numeric (integer|float) names encode explicit unit/measure suffix (e.g. *_count, *_seconds, *_usd, *_ratio, *_score).
|
|
222
|
+
- Boolean names start with 'is_' followed by positive predicate (no negations like is_not_*).
|
|
223
|
+
- Array field names SHOULD end with '_array' for primitive/enum arrays; object_array
|
|
224
|
+
fields may use plural noun or *_array pattern.
|
|
225
|
+
- Descriptions: concise, objective extraction criteria (no marketing/emotion/speculation).
|
|
226
|
+
- Exclude direct outcome labels in predictive contexts.
|
|
227
|
+
- Avoid superficial renames; semantic transformation only.
|
|
228
|
+
- Keep total field count focused (typically <= 16) optimizing for reusable analytical / ML features.
|
|
371
229
|
|
|
372
230
|
Output contract:
|
|
373
|
-
Return exactly an InferredSchema object with
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
231
|
+
Return exactly an InferredSchema JSON object with keys:
|
|
232
|
+
- purpose (string)
|
|
233
|
+
- examples_summary (string)
|
|
234
|
+
- examples_purpose_alignment (string)
|
|
235
|
+
- object_spec (ObjectSpec: name, fields[list[FieldSpec]])
|
|
236
|
+
- inference_prompt (string)
|
|
237
|
+
Where each FieldSpec includes: name, type, description, optional enum_spec (for
|
|
238
|
+
enum / enum_array), optional object_spec (for object / object_array).
|
|
379
239
|
""".strip()
|
|
380
240
|
|
|
381
241
|
|
|
@@ -401,14 +261,15 @@ class SchemaInferer:
|
|
|
401
261
|
client: OpenAI
|
|
402
262
|
model_name: str
|
|
403
263
|
|
|
404
|
-
def infer_schema(self, data:
|
|
264
|
+
def infer_schema(self, data: SchemaInferenceInput, *args, max_retries: int = 8, **kwargs) -> InferredSchema:
|
|
405
265
|
"""Infer a validated schema from representative examples.
|
|
406
266
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
267
|
+
Workflow:
|
|
268
|
+
1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
|
|
269
|
+
``responses.parse`` requesting an ``InferredSchema`` object.
|
|
270
|
+
2. Attempt dynamic model build (``parsed.build_model()``) which performs recursive
|
|
271
|
+
structural validation (names, types, enum/object specs) via the dynamic layer.
|
|
272
|
+
3. Retry (up to ``max_retries``) on validation failure.
|
|
412
273
|
|
|
413
274
|
Args:
|
|
414
275
|
data (SchemaInferenceInput): Representative examples + purpose.
|
|
@@ -460,55 +321,17 @@ class SchemaInferer:
|
|
|
460
321
|
)
|
|
461
322
|
parsed = response.output_parsed
|
|
462
323
|
try:
|
|
463
|
-
|
|
464
|
-
parsed.build_model()
|
|
324
|
+
# Validate the field list structure
|
|
325
|
+
parsed.build_model()
|
|
326
|
+
return parsed
|
|
465
327
|
except ValueError as e:
|
|
466
328
|
last_err = e
|
|
467
329
|
previous_errors.append(str(e))
|
|
468
330
|
if attempt == max_retries - 1:
|
|
469
|
-
raise
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
if last_err: # pragma: no cover
|
|
473
|
-
raise last_err
|
|
474
|
-
raise RuntimeError("unreachable retry loop state") # pragma: no cover
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
def _basic_field_list_validation(parsed: InferredSchema) -> None:
|
|
478
|
-
"""Lightweight structural validation of an inferred field list.
|
|
331
|
+
raise ValueError(
|
|
332
|
+
f"Schema validation failed after {max_retries} attempts. Last error: {last_err}"
|
|
333
|
+
) from last_err
|
|
479
334
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
* All types in the allowed primitive set.
|
|
484
|
-
* ``enum_values`` only on string fields and size within bounds (2–24).
|
|
485
|
-
|
|
486
|
-
Args:
|
|
487
|
-
parsed (InferredSchema): Candidate ``InferredSchema`` instance.
|
|
488
|
-
|
|
489
|
-
Raises:
|
|
490
|
-
ValueError: Any invariant is violated.
|
|
491
|
-
"""
|
|
492
|
-
names = [f.name for f in parsed.fields]
|
|
493
|
-
if not names:
|
|
494
|
-
raise ValueError("no fields suggested")
|
|
495
|
-
if len(names) != len(set(names)):
|
|
496
|
-
raise ValueError("duplicate field names detected")
|
|
497
|
-
allowed = {
|
|
498
|
-
"string",
|
|
499
|
-
"integer",
|
|
500
|
-
"float",
|
|
501
|
-
"boolean",
|
|
502
|
-
"string_array",
|
|
503
|
-
"integer_array",
|
|
504
|
-
"float_array",
|
|
505
|
-
"boolean_array",
|
|
506
|
-
}
|
|
507
|
-
for f in parsed.fields:
|
|
508
|
-
if f.type not in allowed:
|
|
509
|
-
raise ValueError(f"unsupported field type: {f.type}")
|
|
510
|
-
if f.enum_values is not None:
|
|
511
|
-
if f.type != "string":
|
|
512
|
-
raise ValueError(f"enum_values only allowed for plain string field: {f.name}")
|
|
513
|
-
if not (2 <= len(f.enum_values) <= 24):
|
|
514
|
-
raise ValueError(f"enum_values length out of bounds for field {f.name}")
|
|
335
|
+
if last_err:
|
|
336
|
+
raise last_err
|
|
337
|
+
raise RuntimeError("unreachable retry loop state")
|