openaivec 0.14.8__py3-none-any.whl → 0.14.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/_dynamic.py ADDED
@@ -0,0 +1,350 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from enum import Enum
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, Field, create_model
8
+
9
+ __all__: list[str] = []
10
+
11
+ _MAX_ENUM_VALUES = 24
12
+
13
+
14
+ class FieldSpec(BaseModel):
15
+ name: str = Field(
16
+ description=(
17
+ "Field name in lower_snake_case. Rules: (1) Use only lowercase letters, numbers, and underscores; "
18
+ "must start with a letter. (2) For numeric quantities append an explicit unit (e.g. 'duration_seconds', "
19
+ "'price_usd'). (3) Boolean fields use an affirmative 'is_' prefix (e.g. 'is_active'); avoid negative / "
20
+ "ambiguous forms like 'is_deleted' (prefer 'is_active', 'is_enabled'). (4) Name must be unique within the "
21
+ "containing object."
22
+ )
23
+ )
24
+ type: Literal[
25
+ "string",
26
+ "integer",
27
+ "float",
28
+ "boolean",
29
+ "enum",
30
+ "object",
31
+ "string_array",
32
+ "integer_array",
33
+ "float_array",
34
+ "boolean_array",
35
+ "enum_array",
36
+ "object_array",
37
+ ] = Field(
38
+ description=(
39
+ "Logical data type. Allowed values: string | integer | float | boolean | enum | object | string_array | "
40
+ "integer_array | float_array | boolean_array | enum_array | object_array. *_array variants represent a "
41
+ "homogeneous list of the base type. 'enum' / 'enum_array' require 'enum_spec'. 'object' / 'object_array' "
42
+ "require 'object_spec'. Primitives must not define 'enum_spec' or 'object_spec'."
43
+ )
44
+ )
45
+ description: str = Field(
46
+ description=(
47
+ "Human‑readable, concise explanation of the field's meaning and business intent. Should clarify units, "
48
+ "value semantics, and any domain constraints not captured by type. 1–2 sentences; no implementation notes."
49
+ )
50
+ )
51
+ enum_spec: EnumSpec | None = Field(
52
+ default=None,
53
+ description=(
54
+ "Enumeration specification for 'enum' / 'enum_array'. Must be provided (non-empty) for those types and "
55
+ "omitted for all others. Maximum size enforced by constant."
56
+ ),
57
+ )
58
+ object_spec: ObjectSpec | None = Field(
59
+ default=None,
60
+ description=(
61
+ "Nested object schema. Required for 'object' / 'object_array'; must be omitted for every other type. The "
62
+ "contained 'name' is used to derive the generated nested Pydantic model class name."
63
+ ),
64
+ )
65
+
66
+
67
+ class EnumSpec(BaseModel):
68
+ """Enumeration specification for enum / enum_array field types.
69
+
70
+ Attributes:
71
+ name: Required Enum class name (UpperCamelCase). Must match ^[A-Z][A-Za-z0-9]*$. Previously optional; now
72
+ explicit to remove implicit coupling to the field name and make schemas self‑describing.
73
+ values: Raw label values (1–_MAX_ENUM_VALUES before de‑dup). Values are uppercased then
74
+ de-duplicated using a set; ordering of generated Enum members is not guaranteed. Any
75
+ casing variants collapse silently to a single member.
76
+ """
77
+
78
+ name: str = Field(
79
+ description=("Required Enum class name (UpperCamelCase). Valid pattern: ^[A-Z][A-Za-z0-9]*$."),
80
+ )
81
+ values: list[str] = Field(
82
+ description=(
83
+ f"Raw enum label values (1–{_MAX_ENUM_VALUES}). Uppercased then deduplicated; order of members "
84
+ "not guaranteed."
85
+ )
86
+ )
87
+
88
+
89
+ class ObjectSpec(BaseModel):
90
+ name: str = Field(
91
+ description=(
92
+ "Object model class name in UpperCamelCase (singular noun). Must match ^[A-Z][A-Za-z0-9]*$ and is used "
93
+ "directly as the generated Pydantic model class name (no transformation)."
94
+ )
95
+ )
96
+ fields: list[FieldSpec] = Field(
97
+ description=(
98
+ "Non-empty list of FieldSpec definitions composing the object. Each field name must be unique; order is "
99
+ "preserved in the generated model."
100
+ )
101
+ )
102
+
103
+
104
+ def _build_model(object_spec: ObjectSpec) -> type[BaseModel]:
105
+ lower_sname_pattern = re.compile(r"^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$")
106
+ upper_camel_pattern = re.compile(r"^[A-Z][A-Za-z0-9]*$")
107
+ type_map: dict[str, type] = {
108
+ "string": str,
109
+ "integer": int,
110
+ "float": float,
111
+ "boolean": bool,
112
+ "string_array": list[str],
113
+ "integer_array": list[int],
114
+ "float_array": list[float],
115
+ "boolean_array": list[bool],
116
+ }
117
+ output_fields: dict[str, tuple[type, object]] = {}
118
+
119
+ field_names: list[str] = [field.name for field in object_spec.fields]
120
+
121
+ # Assert that names of fields are not duplicated
122
+ if len(field_names) != len(set(field_names)):
123
+ raise ValueError("Field names must be unique within the object spec.")
124
+
125
+ for field in object_spec.fields:
126
+ # Assert that field names are lower_snake_case
127
+ if not lower_sname_pattern.match(field.name):
128
+ raise ValueError(f"Field name '{field.name}' must be in lower_snake_case format (e.g., 'my_field_name').")
129
+
130
+ # (EnumSpec.name now mandatory; no need to derive a fallback name from the field.)
131
+ match field:
132
+ case FieldSpec(
133
+ name=name,
134
+ type="string"
135
+ | "integer"
136
+ | "float"
137
+ | "boolean"
138
+ | "string_array"
139
+ | "integer_array"
140
+ | "float_array"
141
+ | "boolean_array",
142
+ description=description,
143
+ enum_spec=None,
144
+ object_spec=None,
145
+ ):
146
+ field_type = type_map[field.type]
147
+ output_fields[name] = (field_type, Field(description=description))
148
+
149
+ case FieldSpec(name=name, type="enum", description=description, enum_spec=enum_spec, object_spec=None) if (
150
+ enum_spec
151
+ and 0 < len(enum_spec.values) <= _MAX_ENUM_VALUES
152
+ and upper_camel_pattern.match(enum_spec.name)
153
+ ):
154
+ member_names = list({v.upper() for v in enum_spec.values})
155
+ enum_type = Enum(enum_spec.name, member_names)
156
+ output_fields[name] = (enum_type, Field(description=description))
157
+
158
+ case FieldSpec(
159
+ name=name, type="enum_array", description=description, enum_spec=enum_spec, object_spec=None
160
+ ) if (
161
+ enum_spec
162
+ and 0 < len(enum_spec.values) <= _MAX_ENUM_VALUES
163
+ and upper_camel_pattern.match(enum_spec.name)
164
+ ):
165
+ member_names = list({v.upper() for v in enum_spec.values})
166
+ enum_type = Enum(enum_spec.name, member_names)
167
+ output_fields[name] = (list[enum_type], Field(description=description))
168
+
169
+ case FieldSpec(
170
+ name=name, type="object", description=description, enum_spec=None, object_spec=object_spec
171
+ ) if object_spec and upper_camel_pattern.match(object_spec.name):
172
+ nested_model = _build_model(object_spec)
173
+ output_fields[name] = (nested_model, Field(description=description))
174
+
175
+ case FieldSpec(
176
+ name=name, type="object_array", description=description, enum_spec=None, object_spec=object_spec
177
+ ) if object_spec and upper_camel_pattern.match(object_spec.name):
178
+ nested_model = _build_model(object_spec)
179
+ output_fields[name] = (list[nested_model], Field(description=description))
180
+
181
+ # ---- Error cases (explicit reasons) ----
182
+ # Enum type without enum_spec (None or empty)
183
+ case FieldSpec(
184
+ name=name,
185
+ type="enum",
186
+ enum_spec=enum_spec,
187
+ object_spec=None,
188
+ ) if not enum_spec or not enum_spec.values:
189
+ raise ValueError(f"Field '{name}': enum type requires non-empty enum_spec values list.")
190
+ # Enum type exceeding max length
191
+ case FieldSpec(
192
+ name=name,
193
+ type="enum",
194
+ enum_spec=enum_spec,
195
+ object_spec=None,
196
+ ) if enum_spec and len(enum_spec.values) > _MAX_ENUM_VALUES:
197
+ raise ValueError(
198
+ (
199
+ f"Field '{name}': enum type supports at most {_MAX_ENUM_VALUES} enum_spec values "
200
+ f"(got {len(enum_spec.values)})."
201
+ )
202
+ )
203
+ # Enum type invalid explicit name pattern
204
+ case FieldSpec(
205
+ name=name,
206
+ type="enum",
207
+ enum_spec=enum_spec,
208
+ object_spec=None,
209
+ ) if enum_spec and not upper_camel_pattern.match(enum_spec.name):
210
+ raise ValueError(
211
+ (f"Field '{name}': enum_spec.name '{enum_spec.name}' invalid – must match ^[A-Z][A-Za-z0-9]*$")
212
+ )
213
+ # Enum type incorrectly provides an object_spec
214
+ case FieldSpec(
215
+ name=name,
216
+ type="enum",
217
+ enum_spec=enum_spec,
218
+ object_spec=object_spec,
219
+ ) if object_spec is not None:
220
+ raise ValueError(
221
+ f"Field '{name}': enum type must not provide object_spec (got object_spec={object_spec!r})."
222
+ )
223
+ # Enum array type without enum_spec
224
+ case FieldSpec(
225
+ name=name,
226
+ type="enum_array",
227
+ enum_spec=enum_spec,
228
+ object_spec=None,
229
+ ) if not enum_spec or not enum_spec.values:
230
+ raise ValueError(f"Field '{name}': enum_array type requires non-empty enum_spec values list.")
231
+ # Enum array type exceeding max length
232
+ case FieldSpec(
233
+ name=name,
234
+ type="enum_array",
235
+ enum_spec=enum_spec,
236
+ object_spec=None,
237
+ ) if enum_spec and len(enum_spec.values) > _MAX_ENUM_VALUES:
238
+ raise ValueError(
239
+ (
240
+ f"Field '{name}': enum_array type supports at most {_MAX_ENUM_VALUES} enum_spec values "
241
+ f"(got {len(enum_spec.values)})."
242
+ )
243
+ )
244
+ # Enum array type invalid explicit name pattern
245
+ case FieldSpec(
246
+ name=name,
247
+ type="enum_array",
248
+ enum_spec=enum_spec,
249
+ object_spec=None,
250
+ ) if enum_spec and not upper_camel_pattern.match(enum_spec.name):
251
+ raise ValueError(
252
+ (f"Field '{name}': enum_spec.name '{enum_spec.name}' invalid – must match ^[A-Z][A-Za-z0-9]*$")
253
+ )
254
+ # Enum array type incorrectly provides an object_spec
255
+ case FieldSpec(
256
+ name=name,
257
+ type="enum_array",
258
+ enum_spec=enum_spec,
259
+ object_spec=object_spec,
260
+ ) if object_spec is not None:
261
+ raise ValueError(
262
+ f"Field '{name}': enum_array type must not provide object_spec (got object_spec={object_spec!r})."
263
+ )
264
+ # Object type missing object_spec
265
+ case FieldSpec(
266
+ name=name,
267
+ type="object",
268
+ enum_spec=enum_spec,
269
+ object_spec=None,
270
+ ):
271
+ raise ValueError(f"Field '{name}': object type requires object_spec (got object_spec=None).")
272
+ # Object array type missing object_spec
273
+ case FieldSpec(
274
+ name=name,
275
+ type="object_array",
276
+ enum_spec=enum_spec,
277
+ object_spec=None,
278
+ ):
279
+ raise ValueError(f"Field '{name}': object_array type requires object_spec (got object_spec=None).")
280
+ # Object/object_array provided but invalid name pattern
281
+ case FieldSpec(
282
+ name=name,
283
+ type="object" | "object_array",
284
+ enum_spec=enum_spec,
285
+ object_spec=object_spec,
286
+ ) if object_spec is not None and not upper_camel_pattern.match(object_spec.name):
287
+ raise ValueError(
288
+ (
289
+ f"Field '{name}': object_spec.name '{object_spec.name}' must be UpperCamelCase "
290
+ "(regex ^[A-Z][A-Za-z0-9]*$) and contain only letters and digits."
291
+ )
292
+ )
293
+ # Object/object_array types must not provide enum_spec
294
+ case FieldSpec(
295
+ name=name,
296
+ type="object" | "object_array",
297
+ enum_spec=enum_spec,
298
+ object_spec=object_spec,
299
+ ) if enum_spec is not None:
300
+ raise ValueError(
301
+ f"Field '{name}': {field.type} must not define enum_spec (got enum_spec={enum_spec!r})."
302
+ )
303
+ # Primitive / simple array types must not have enum_spec
304
+ case FieldSpec(
305
+ name=name,
306
+ type="string"
307
+ | "integer"
308
+ | "float"
309
+ | "boolean"
310
+ | "string_array"
311
+ | "integer_array"
312
+ | "float_array"
313
+ | "boolean_array",
314
+ enum_spec=enum_spec,
315
+ object_spec=object_spec,
316
+ ) if enum_spec is not None:
317
+ raise ValueError(
318
+ (f"Field '{name}': type '{field.type}' must not define enum_spec (got enum_spec={enum_spec!r}).")
319
+ )
320
+ # Primitive / simple array types must not have object_spec
321
+ case FieldSpec(
322
+ name=name,
323
+ type="string"
324
+ | "integer"
325
+ | "float"
326
+ | "boolean"
327
+ | "string_array"
328
+ | "integer_array"
329
+ | "float_array"
330
+ | "boolean_array",
331
+ enum_spec=None,
332
+ object_spec=object_spec,
333
+ ) if object_spec is not None:
334
+ raise ValueError(
335
+ (
336
+ f"Field '{name}': type '{field.type}' must not define object_spec "
337
+ f"(got object_spec={object_spec!r})."
338
+ )
339
+ )
340
+ # Any other unmatched combination
341
+ case FieldSpec() as f:
342
+ raise ValueError(
343
+ (
344
+ "Field configuration invalid / unrecognized combination: "
345
+ f"name={f.name!r}, type={f.type!r}, enum_spec={'set' if f.enum_spec else None}, "
346
+ f"object_spec={'set' if f.object_spec else None}."
347
+ )
348
+ )
349
+
350
+ return create_model(object_spec.name, **output_fields)
openaivec/_schema.py CHANGED
@@ -3,165 +3,89 @@
3
3
  This (non-public) module converts a small *representative* sample of free‑text
4
4
  examples plus a *purpose* statement into:
5
5
 
6
- 1. A vetted, flat list of scalar field specifications (``FieldSpec``) that can
7
- be *reliably* extracted across similar future inputs.
6
+ 1. A vetted hierarchical object specification (``ObjectSpec``) whose recursively
7
+ defined ``fields`` (``FieldSpec``) capture reliably extractable signals.
8
8
  2. A reusable, self‑contained extraction prompt (``inference_prompt``) that
9
- freezes the agreed schema contract (no additions / renames / omissions).
10
- 3. A dynamically generated Pydantic model whose fields mirror the inferred
11
- schema, enabling immediate typed parsing with the OpenAI Responses API.
9
+ freezes the agreed schema contract (no additions / renames / omissions).
10
+ 3. A dynamically generated Pydantic model mirroring the hierarchical schema,
11
+ enabling immediate typed parsing with the OpenAI Responses API.
12
12
  4. A ``PreparedTask`` wrapper (``InferredSchema.task``) for downstream batched
13
- responses/structured extraction flows in pandas or Spark.
13
+ responses / structured extraction flows in pandas or Spark.
14
14
 
15
15
  Core goals:
16
16
  * Minimize manual, subjective schema design iterations.
17
17
  * Enforce objective naming / typing / enum rules early (guard rails rather than
18
- after‑the‑fact cleaning).
19
- * Provide deterministic reusability: the same prompt + model yield stable
20
- column ordering & types for analytics or feature engineering.
18
+ after‑the‑fact cleaning).
19
+ * Provide deterministic reusability: the same prompt + model yield stable field
20
+ ordering & types for analytics or feature engineering.
21
21
  * Avoid outcome / target label leakage in predictive (feature engineering)
22
- contexts by explicitly excluding direct target restatements.
22
+ contexts by explicitly excluding direct target restatements.
23
23
 
24
24
  This module is intentionally **internal** (``__all__ = []``). Public users
25
25
  should interact through higher‑level batch APIs once a schema has been inferred.
26
26
 
27
- Design constraints:
28
- * Flat schema only (no nested objects). Top-level arrays permitted ONLY as homogeneous arrays of primitives
29
- (e.g. array of strings) represented via specialized primitive array type names
30
- (string_array, integer_array, float_array, boolean_array).
31
- * Primitive scalar types limited to {string, integer, float, boolean}; optional array variants
32
- {string_array, integer_array, float_array, boolean_array}.
33
- * Optional enumerations for *closed*, *observed* categorical sets only.
27
+ Design constraints (updated):
28
+ * Root: single ``ObjectSpec`` (UpperCamelCase name) containing one or more fields.
29
+ * Field types: string | integer | float | boolean | enum | object |
30
+ string_array | integer_array | float_array | boolean_array | enum_array | object_array
31
+ * Arrays are homogeneous lists of their base type.
32
+ * Nested objects / arrays of objects are allowed when semantically cohesive; keep
33
+ depth shallow and avoid gratuitous nesting.
34
+ * Enumerations use ``enum_spec`` with explicit ``name`` (UpperCamelCase) and 1–24
35
+ raw label values (project constant). Values collapse by uppercasing; order not guaranteed.
36
+ * Field names: lower_snake_case; unique per containing object.
37
+ * Boolean names: affirmative 'is_' prefix.
38
+ * Numeric (integer/float) names encode unit / measure suffix (e.g. *_count, *_ratio, *_ms).
34
39
  * Validation retries ensure a structurally coherent suggestion before returning.
35
40
 
36
41
  Example (conceptual):
37
- from openai import OpenAI
38
- client = OpenAI()
39
- inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
40
- schema = inferer.infer_schema(
41
- SchemaInferenceInput(
42
- examples=["Order #123 delayed due to weather", "Order #456 delivered"],
43
- purpose="Extract operational status signals for logistics analytics",
44
- )
45
- )
46
- Model = schema.model # dynamic Pydantic model
47
- task = schema.task # PreparedTask for batch extraction
42
+ from openai import OpenAI
43
+ client = OpenAI()
44
+ inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
45
+ schema = inferer.infer_schema(
46
+ SchemaInferenceInput(
47
+ examples=["Order #123 delayed due to weather", "Order #456 delivered"],
48
+ purpose="Extract operational status signals for logistics analytics",
49
+ )
50
+ )
51
+ Model = schema.model # dynamic Pydantic model
52
+ task = schema.task # PreparedTask for batch extraction
48
53
 
49
54
  The implementation purposefully does *not* emit or depend on JSON Schema; the
50
- authoritative contract is the ordered list of ``FieldSpec`` instances.
55
+ authoritative contract is the recursive ``ObjectSpec`` tree.
51
56
  """
52
57
 
53
58
  from dataclasses import dataclass
54
- from enum import Enum
55
- from typing import Literal
56
59
 
57
60
  from openai import OpenAI
58
61
  from openai.types.responses import ParsedResponse
59
- from pydantic import BaseModel, Field, create_model
62
+ from pydantic import BaseModel, Field
60
63
 
64
+ from openaivec._dynamic import ObjectSpec, _build_model
61
65
  from openaivec._model import PreparedTask
62
66
 
63
67
  # Internal module: explicitly not part of public API
64
68
  __all__: list[str] = []
65
69
 
66
70
 
67
- class FieldSpec(BaseModel):
68
- """Specification for a single candidate output field.
69
-
70
- Each ``FieldSpec`` encodes a *flat*, scalar, semantically atomic unit the
71
- model should extract. These become columns in downstream DataFrames.
72
-
73
- Validation focuses on: objective naming, primitive typing, and *optional*
74
- closed categorical vocabularies. Enumerations are intentionally conservative
75
- (must derive from clear evidence) to reduce over‑fitted schemas.
76
-
77
- Attributes:
78
- name: Lower snake_case unique identifier (regex ^[a-z][a-z0-9_]*$). Avoid
79
- subjective modifiers ("best", "great", "high_quality").
80
- type: One of ``string|integer|float|boolean``. ``integer`` only if all
81
- observed numeric values are whole numbers; ``float`` if any decimal
82
- or ratio appears. ``boolean`` strictly for explicit binary forms.
83
- description: Concise, objective extraction rule (what qualifies / what
84
- to ignore). Disambiguate from overlapping fields if needed.
85
- enum_values: Optional stable closed set of lowercase string labels
86
- (2–24). Only for *string* type when the vocabulary is clearly
87
- evidenced; never hallucinate or extrapolate.
88
- """
89
-
90
- name: str = Field(
91
- description=(
92
- "Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
93
- "express the semantic meaning succinctly (no adjectives like 'best', 'great'). For numeric (integer|float) "
94
- "fields the name MUST include an explicit unit or measure suffix (e.g. _count, _total_count, "
95
- "_duration_seconds, _ms, _price_usd, _ratio, _score) to eliminate ambiguity. Avoid bare numeric nouns like "
96
- "'duration' or 'value' without unit/scale. Boolean field names MUST begin with 'is_' followed by a "
97
- "descriptive predicate (e.g. is_active, is_delayed). Use positive forms (is_active) rather than "
98
- "negated forms (is_not_active)."
99
- )
100
- )
101
- type: Literal[
102
- "string",
103
- "integer",
104
- "float",
105
- "boolean",
106
- "string_array",
107
- "integer_array",
108
- "float_array",
109
- "boolean_array",
110
- ] = Field(
111
- description=(
112
- "Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
113
- "Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
114
- "explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
115
- "Array variants (string_array, integer_array, float_array, boolean_array) are ONLY allowed when the value "
116
- "is a repeatable homogeneous collection whose individual elements would otherwise stand as valid scalar "
117
- "extractions (e.g. keywords, error_codes, tag_ids). Do not encode objects or mixed-type arrays; flatten or "
118
- "choose the most informative level."
119
- )
120
- )
121
- description: str = Field(
122
- description=(
123
- "Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
124
- "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction. "
125
- "Do NOT simply restate an original JSON/key name if the examples are already structured; only include a "
126
- "raw key verbatim when it is already the minimal, irreducible analytic unit. For derived fields, clearly "
127
- "state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
128
- )
129
- )
130
- enum_values: list[str] | None = Field(
131
- default=None,
132
- description=(
133
- "Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
134
- "stable vocabulary (2–24 lowercase tokens) is clearly evidenced or strongly implied by examples. "
135
- "Do NOT invent labels. Omit if open-ended or ambiguous. Order must be stable and semantically natural."
136
- ),
137
- )
138
-
139
-
140
71
  class InferredSchema(BaseModel):
141
72
  """Result of a schema inference round.
142
73
 
143
- Contains the normalized *purpose*, an objective *examples_summary*, the
144
- ordered ``fields`` contract, and the canonical reusable ``inference_prompt``.
145
-
146
- The prompt is constrained to be fully derivable from the other components;
147
- adding novel unstated facts is disallowed to preserve traceability.
74
+ Contains the normalized *purpose*, objective *examples_summary*, the root
75
+ hierarchical ``object_spec`` contract, and the canonical reusable
76
+ ``inference_prompt``. The prompt MUST be fully derivable from the other
77
+ components (no new unstated facts) to preserve traceability.
148
78
 
149
79
  Attributes:
150
- purpose: Unambiguous restatement of the user's objective (noise &
151
- redundancy removed).
80
+ purpose: Unambiguous restatement of the user's objective.
152
81
  examples_summary: Neutral description of structural / semantic patterns
153
- observed in the examples (domain, recurring signals, constraints).
154
- examples_purpose_alignment: Analytical explanation of how the concrete
155
- recurring patterns in the provided examples *justify*, *constrain*,
156
- or *refine* the stated purpose. Should map purpose facets to
157
- observed evidence (or explicitly note gaps) to discourage
158
- hallucinated fields and anchor extraction scope. This is an
159
- internal quality aid — downstream consumers typically ignore it.
160
- fields: Ordered list of ``FieldSpec`` objects comprising the schema's
161
- sole authoritative contract.
162
- inference_prompt: Self-contained extraction instructions enforcing an
163
- exact field set (names, order, primitive types) with prohibition on
164
- alterations or subjective flourishes.
82
+ observed in the examples.
83
+ examples_purpose_alignment: Mapping from purpose facets to concrete
84
+ recurring evidence (or explicit gaps) anchoring extraction scope.
85
+ object_spec: Root ``ObjectSpec`` (UpperCamelCase name) whose ``fields``
86
+ recursively define the extraction schema.
87
+ inference_prompt: Canonical instructions enforcing exact field names,
88
+ hierarchy, and types (no additions/removals/renames).
165
89
  """
166
90
 
167
91
  purpose: str = Field(
@@ -183,20 +107,17 @@ class InferredSchema(BaseModel):
183
107
  "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
184
108
  )
185
109
  )
186
- fields: list[FieldSpec] = Field(
110
+ object_spec: ObjectSpec = Field(
187
111
  description=(
188
- "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
189
- "examples and aligned with the purpose."
112
+ "Root ObjectSpec (recursive). Each contained object's field list is unique-name ordered and derived "
113
+ "strictly from observable, repeatable signals aligned with the purpose."
190
114
  )
191
115
  )
192
116
  inference_prompt: str = Field(
193
117
  description=(
194
- "Canonical, reusable extraction prompt for structuring future inputs with this schema. "
195
- "Must be fully derivable from 'purpose', 'examples_summary', and 'fields' (no new unstated facts or "
196
- "speculation). It MUST: (1) instruct the model to output only the listed fields with the exact names "
197
- "and primitive types; (2) forbid adding, removing, or renaming fields; (3) avoid subjective or "
198
- "marketing language; (4) be self-contained (no TODOs, no external references, no unresolved "
199
- "placeholders). Intended for direct reuse as the prompt for deterministic alignment with 'fields'."
118
+ "Canonical, reusable extraction prompt. Must be derivable from purpose + summaries + object_spec. Enforces "
119
+ "exact hierarchical field set (names, order per object, types) forbidding additions, removals, renames, or "
120
+ "subjective language. Self-contained (no TODOs, external refs, or placeholders)."
200
121
  )
201
122
  )
202
123
 
@@ -236,53 +157,7 @@ class InferredSchema(BaseModel):
236
157
  )
237
158
 
238
159
  def build_model(self) -> type[BaseModel]:
239
- """Create a new dynamic ``BaseModel`` class adhering to this schema.
240
-
241
- Implementation details:
242
- * Maps primitive types: string→``str``, integer→``int``, float→``float``, boolean→``bool``.
243
- * For enumerated string fields, constructs an ad‑hoc ``Enum`` subclass with
244
- stable member names (collision‑safe, normalized to ``UPPER_SNAKE``).
245
- * All fields are required (ellipsis ``...``). Optionality can be
246
- introduced later by modifying this logic if needed.
247
-
248
- Returns:
249
- type[BaseModel]: New (not cached) model type; order matches ``fields``.
250
- """
251
- type_map: dict[str, type] = {
252
- "string": str,
253
- "integer": int,
254
- "float": float,
255
- "boolean": bool,
256
- }
257
- fields: dict[str, tuple[type, object]] = {}
258
-
259
- for spec in self.fields:
260
- py_type: type
261
- if spec.enum_values:
262
- enum_class_name = "Enum_" + "".join(part.capitalize() for part in spec.name.split("_"))
263
- members: dict[str, str] = {}
264
- for raw in spec.enum_values:
265
- sanitized = raw.upper().replace("-", "_").replace(" ", "_")
266
- if not sanitized or sanitized[0].isdigit():
267
- sanitized = f"V_{sanitized}"
268
- base = sanitized
269
- i = 2
270
- while sanitized in members:
271
- sanitized = f"{base}_{i}"
272
- i += 1
273
- members[sanitized] = raw
274
- enum_cls = Enum(enum_class_name, members) # type: ignore[arg-type]
275
- py_type = enum_cls
276
- else:
277
- if spec.type.endswith("_array"):
278
- base = spec.type.rsplit("_", 1)[0]
279
- py_type = list[type_map[base]] # type: ignore[index]
280
- else:
281
- py_type = type_map[spec.type]
282
- fields[spec.name] = (py_type, Field(description=spec.description))
283
-
284
- model = create_model("InferredSchema", **fields) # type: ignore[call-arg]
285
- return model
160
+ return _build_model(self.object_spec)
286
161
 
287
162
  def save(self, path: str) -> None:
288
163
  """Persist this inferred schema as pretty‑printed JSON.
@@ -326,56 +201,41 @@ You are a schema inference engine.
326
201
  Task:
327
202
  1. Normalize the user's purpose (eliminate ambiguity, redundancy, contradictions).
328
203
  2. Objectively summarize observable patterns in the example texts.
329
- 3. Produce an "examples_purpose_alignment" explanation that explicitly maps purpose facets
330
- to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
331
- sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
332
- This MUST NOT introduce new domain facts beyond the examples & purpose.
333
- 4. Propose a minimal flat set of scalar fields (and ONLY when justified,
334
- homogeneous primitive arrays) that are reliably extractable.
204
+ 3. Produce an "examples_purpose_alignment" explanation mapping purpose facets to concrete recurring evidence (or gaps).
205
+ 4. Propose a minimal hierarchical schema (root ObjectSpec) comprised of reliably extractable fields. Use nesting ONLY
206
+ when a group of fields forms a cohesive sub-entity repeated in the data; otherwise keep flat.
335
207
  5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
336
- 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
337
- is clearly evidenced; never invent.
338
- 7. If the purpose indicates prediction (predict / probability / likelihood), output only
339
- explanatory features (no target restatement).
208
+ 6. Provide enum_spec ONLY when a small stable closed categorical set (1–{_MAX_ENUM_VALUES} raw tokens) is clearly
209
+ evidenced; never invent unseen categories.
210
+ 7. If the purpose indicates prediction (predict / probability / likelihood),
211
+ output only explanatory features (no target restatement).
340
212
 
341
213
  Rules:
342
- - Names: lower snake_case, unique, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
343
- - Types: string | integer | float | boolean
344
- * integer = all whole numbers
345
- * float = any decimals / ratios
346
- * boolean = explicit binary
347
- * else use string
348
- - Numeric (integer|float) field names MUST encode an explicit unit / scale / measure suffix
349
- (e.g. *_count, *_seconds, *_ms, *_usd, *_ratio, *_score). Avoid ambiguous bare numeric names.
350
- - Boolean field names MUST start with 'is_' followed by a positive predicate (e.g. is_active,
351
- is_delayed). Avoid negated forms.
352
- - No nested objects or mixed-type arrays. Homogeneous primitive arrays are allowed ONLY if each element is an atomic
353
- scalar signal (use *_array types: string_array, integer_array, float_array, boolean_array). The array is expected to
354
- contain 0..N such elements per record.
355
- - Array field names MUST end with '_array' (e.g. keywords_array, tag_ids_array). Do not use plural-only forms
356
- (e.g. keywords) for arrays; the suffix makes container semantics explicit.
357
- - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
358
- - enum_values only for string fields with stable closed vocab; omit otherwise.
359
- - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
360
- in predictive / feature engineering contexts.
361
- - When examples already appear as serialized JSON / key-value records, DO NOT merely relist the
362
- raw original keys unless each is already an atomic, irreducible analytic signal. Prefer high-signal
363
- derived / normalized / aggregated features (e.g. sentiment, category, language_code, boolean flags,
364
- normalized_date, count metrics).
365
- - Superficial renames (adding trivial prefixes/suffixes like _value, _field, new_) are forbidden; a new
366
- field name must reflect a semantic transformation.
367
- - Keep field count focused (typically <= 12) prioritizing reusable analytical / ML features over low-signal
368
- restatements.
369
- - If you retain an original raw key unchanged, its description must justify why it is minimal and cannot
370
- be further decomposed without losing analytical value.
214
+ - Field names: lower snake_case, unique within each object, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
215
+ - Field types: string | integer | float | boolean | enum | object | string_array | integer_array | float_array |
216
+ boolean_array | enum_array | object_array
217
+ * *_array are homogeneous lists of their primitive / enum / object base type.
218
+ * Use object/object_array ONLY for semantically cohesive grouped attributes; avoid gratuitous layers.
219
+ - Enumerations: use enum_spec { name (UpperCamelCase), values [raw_tokens...] }. values length 1–{_MAX_ENUM_VALUES}.
220
+ Use ONLY when closed set is evidenced. Otherwise, use string.
221
+ - Numeric (integer|float) names encode explicit unit/measure suffix (e.g. *_count, *_seconds, *_usd, *_ratio, *_score).
222
+ - Boolean names start with 'is_' followed by positive predicate (no negations like is_not_*).
223
+ - Array field names SHOULD end with '_array' for primitive/enum arrays; object_array
224
+ fields may use plural noun or *_array pattern.
225
+ - Descriptions: concise, objective extraction criteria (no marketing/emotion/speculation).
226
+ - Exclude direct outcome labels in predictive contexts.
227
+ - Avoid superficial renames; semantic transformation only.
228
+ - Keep total field count focused (typically <= 16) optimizing for reusable analytical / ML features.
371
229
 
372
230
  Output contract:
373
- Return exactly an InferredSchema object with JSON keys:
374
- - purpose (string)
375
- - examples_summary (string)
376
- - examples_purpose_alignment (string)
377
- - fields (array of FieldSpec objects: name, type, description, enum_values?)
378
- - inference_prompt (string)
231
+ Return exactly an InferredSchema JSON object with keys:
232
+ - purpose (string)
233
+ - examples_summary (string)
234
+ - examples_purpose_alignment (string)
235
+ - object_spec (ObjectSpec: name, fields[list[FieldSpec]])
236
+ - inference_prompt (string)
237
+ Where each FieldSpec includes: name, type, description, optional enum_spec (for
238
+ enum / enum_array), optional object_spec (for object / object_array).
379
239
  """.strip()
380
240
 
381
241
 
@@ -401,14 +261,15 @@ class SchemaInferer:
401
261
  client: OpenAI
402
262
  model_name: str
403
263
 
404
- def infer_schema(self, data: "SchemaInferenceInput", *args, max_retries: int = 3, **kwargs) -> "InferredSchema":
264
+ def infer_schema(self, data: SchemaInferenceInput, *args, max_retries: int = 8, **kwargs) -> InferredSchema:
405
265
  """Infer a validated schema from representative examples.
406
266
 
407
- Workflow:
408
- 1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
409
- ``responses.parse`` requesting an ``InferredSchema`` object.
410
- 2. Validate the returned field list with ``_basic_field_list_validation``.
411
- 3. Retry (up to ``max_retries``) if validation fails.
267
+ Workflow:
268
+ 1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
269
+ ``responses.parse`` requesting an ``InferredSchema`` object.
270
+ 2. Attempt dynamic model build (``parsed.build_model()``) which performs recursive
271
+ structural validation (names, types, enum/object specs) via the dynamic layer.
272
+ 3. Retry (up to ``max_retries``) on validation failure.
412
273
 
413
274
  Args:
414
275
  data (SchemaInferenceInput): Representative examples + purpose.
@@ -460,55 +321,17 @@ class SchemaInferer:
460
321
  )
461
322
  parsed = response.output_parsed
462
323
  try:
463
- _basic_field_list_validation(parsed)
464
- parsed.build_model() # ensure dynamic model creation succeeds
324
+ # Validate the field list structure
325
+ parsed.build_model()
326
+ return parsed
465
327
  except ValueError as e:
466
328
  last_err = e
467
329
  previous_errors.append(str(e))
468
330
  if attempt == max_retries - 1:
469
- raise
470
- continue
471
- return parsed
472
- if last_err: # pragma: no cover
473
- raise last_err
474
- raise RuntimeError("unreachable retry loop state") # pragma: no cover
475
-
476
-
477
- def _basic_field_list_validation(parsed: InferredSchema) -> None:
478
- """Lightweight structural validation of an inferred field list.
331
+ raise ValueError(
332
+ f"Schema validation failed after {max_retries} attempts. Last error: {last_err}"
333
+ ) from last_err
479
334
 
480
- Checks:
481
- * Non-empty field set.
482
- * No duplicate names.
483
- * All types in the allowed primitive set.
484
- * ``enum_values`` only on string fields and size within bounds (2–24).
485
-
486
- Args:
487
- parsed (InferredSchema): Candidate ``InferredSchema`` instance.
488
-
489
- Raises:
490
- ValueError: Any invariant is violated.
491
- """
492
- names = [f.name for f in parsed.fields]
493
- if not names:
494
- raise ValueError("no fields suggested")
495
- if len(names) != len(set(names)):
496
- raise ValueError("duplicate field names detected")
497
- allowed = {
498
- "string",
499
- "integer",
500
- "float",
501
- "boolean",
502
- "string_array",
503
- "integer_array",
504
- "float_array",
505
- "boolean_array",
506
- }
507
- for f in parsed.fields:
508
- if f.type not in allowed:
509
- raise ValueError(f"unsupported field type: {f.type}")
510
- if f.enum_values is not None:
511
- if f.type != "string":
512
- raise ValueError(f"enum_values only allowed for plain string field: {f.name}")
513
- if not (2 <= len(f.enum_values) <= 24):
514
- raise ValueError(f"enum_values length out of bounds for field {f.name}")
335
+ if last_err:
336
+ raise last_err
337
+ raise RuntimeError("unreachable retry loop state")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openaivec
3
- Version: 0.14.8
3
+ Version: 0.14.9
4
4
  Summary: Generative mutation for tabular calculation
5
5
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
6
6
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -1,5 +1,6 @@
1
1
  openaivec/__init__.py,sha256=mXCGNNTjYbmE4CAXGvAs78soxUsoy_mxxnvaCk_CL6Y,361
2
2
  openaivec/_di.py,sha256=1MXaBzaH_ZenQnWKQzBY2z-egHwiteMvg7byoUH3ZZI,10658
3
+ openaivec/_dynamic.py,sha256=7ZaC59w2Edemnao57XeZVO4qmSOA-Kus6TchZC3Dd5o,14821
3
4
  openaivec/_embeddings.py,sha256=upCjl8m9h1CihP6t7wvIH_vivOAPSgmgooAxIhnUMUw,7449
4
5
  openaivec/_log.py,sha256=LHNs6AbJzM4weaRARZFroigxR6D148d7WSIMLk1IhbU,1439
5
6
  openaivec/_model.py,sha256=toS2oBubrJa9jrdYy-87Fb2XivjXUlk_8Zn5gKUAcFI,3345
@@ -8,7 +9,7 @@ openaivec/_prompt.py,sha256=zLv13q47CKV3jnETUyWAIlnjXFSEMs70c8m0yN7_Hek,20820
8
9
  openaivec/_provider.py,sha256=YLrEcb4aWBD1fj0n6PNcJpCtEXK6jkUuRH_WxcLDCuI,7145
9
10
  openaivec/_proxy.py,sha256=AiGuC1MCFjZCRXCac-pHUI3Np3nf1HIpWY6nC9ZVCFY,29671
10
11
  openaivec/_responses.py,sha256=lVJRa_Uc7hQJnYJRgumqwBbu6GToZqsLFS6tIAFO1Fc,24014
11
- openaivec/_schema.py,sha256=fVsFkCZWSbh2-fiGxnT8cSVrlUQOYWJX5EeL2F6aX4s,24039
12
+ openaivec/_schema.py,sha256=RKjDPqet1TlReYibah0R0NIvCV1VWN5SZxiaBeV0gCY,15492
12
13
  openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
13
14
  openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
14
15
  openaivec/pandas_ext.py,sha256=rCkh8g9eqHn0gUG8j_-jdppQt_Yq_1Wg6FmsCEcpv3k,85985
@@ -30,7 +31,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=Np-yY0d4Kr5WEjGjq4tNFHDNarBLajJr
30
31
  openaivec/task/nlp/translation.py,sha256=VYgiXtr2TL1tbqZkBpyVAy4ahrgd8UO4ZjhIL6xMdkI,6609
31
32
  openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
32
33
  openaivec/task/table/fillna.py,sha256=g_CpLnLzK1C5rCiVq15L3X0kywJK6CtSrKRYxQFuhn8,6606
33
- openaivec-0.14.8.dist-info/METADATA,sha256=ItqzTCNsPigyX9fe5WBQHih3gzT68XjdwFsAOa9-qrI,27566
34
- openaivec-0.14.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
35
- openaivec-0.14.8.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
36
- openaivec-0.14.8.dist-info/RECORD,,
34
+ openaivec-0.14.9.dist-info/METADATA,sha256=C7UqwVFLIVYiMJdRdUMTuUbhamacXoM2EHfS1nIxROQ,27566
35
+ openaivec-0.14.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
+ openaivec-0.14.9.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
37
+ openaivec-0.14.9.dist-info/RECORD,,