openaivec 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

openaivec/_schema.py ADDED
@@ -0,0 +1,454 @@
+ """Internal schema inference & dynamic model materialization utilities.
+
+ This (non-public) module converts a small *representative* sample of free‑text
+ examples plus a *purpose* statement into:
+
+ 1. A vetted, flat list of scalar field specifications (``FieldSpec``) that can
+    be *reliably* extracted across similar future inputs.
+ 2. A reusable, self‑contained extraction prompt (``inference_prompt``) that
+    freezes the agreed schema contract (no additions / renames / omissions).
+ 3. A dynamically generated Pydantic model whose fields mirror the inferred
+    schema, enabling immediate typed parsing with the OpenAI Responses API.
+ 4. A ``PreparedTask`` wrapper (``InferredSchema.task``) for downstream batched
+    responses/structured extraction flows in pandas or Spark.
+
+ Core goals:
+ * Minimize manual, subjective schema design iterations.
+ * Enforce objective naming / typing / enum rules early (guard rails rather than
+   after‑the‑fact cleaning).
+ * Provide deterministic reusability: the same prompt + model yield stable
+   column ordering & types for analytics or feature engineering.
+ * Avoid outcome / target label leakage in predictive (feature engineering)
+   contexts by explicitly excluding direct target restatements.
+
+ This module is intentionally **internal** (``__all__ = []``). Public users
+ should interact through higher‑level batch APIs once a schema has been inferred.
+
+ Design constraints:
+ * Flat schema only (no nesting / arrays) to simplify Spark & pandas alignment.
+ * Primitive types limited to {string, integer, float, boolean}.
+ * Optional enumerations for *closed*, *observed* categorical sets only.
+ * Validation retries ensure a structurally coherent suggestion before returning.
+
+ Example (conceptual):
+     from openai import OpenAI
+     client = OpenAI()
+     inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
+     schema = inferer.infer_schema(
+         SchemaInferenceInput(
+             examples=["Order #123 delayed due to weather", "Order #456 delivered"],
+             purpose="Extract operational status signals for logistics analytics",
+         )
+     )
+     Model = schema.model  # dynamic Pydantic model
+     task = schema.task  # PreparedTask for batch extraction
+
+ The implementation purposefully does *not* emit or depend on JSON Schema; the
+ authoritative contract is the ordered list of ``FieldSpec`` instances.
+ """
+
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import List, Literal, Optional, Type
+
+ from openai import OpenAI
+ from openai.types.responses import ParsedResponse
+ from pydantic import BaseModel, Field, create_model
+
+ from openaivec._model import PreparedTask
+
+ # Internal module: explicitly not part of public API
+ __all__: list[str] = []
+
+
+ class FieldSpec(BaseModel):
+     """Specification for a single candidate output field.
+
+     Each ``FieldSpec`` encodes a *flat*, scalar, semantically atomic unit the
+     model should extract. These become columns in downstream DataFrames.
+
+     Validation focuses on: objective naming, primitive typing, and *optional*
+     closed categorical vocabularies. Enumerations are intentionally conservative
+     (must derive from clear evidence) to reduce over‑fitted schemas.
+
+     Attributes:
+         name: Lower snake_case unique identifier (regex ^[a-z][a-z0-9_]*$). Avoid
+             subjective modifiers ("best", "great", "high_quality").
+         type: One of ``string|integer|float|boolean``. ``integer`` only if all
+             observed numeric values are whole numbers; ``float`` if any decimal
+             or ratio appears. ``boolean`` strictly for explicit binary forms.
+         description: Concise, objective extraction rule (what qualifies / what
+             to ignore). Disambiguate from overlapping fields if needed.
+         enum_values: Optional stable closed set of lowercase string labels
+             (2–24). Only for *string* type when the vocabulary is clearly
+             evidenced; never hallucinate or extrapolate.
+     """
+
+     name: str = Field(
+         description=(
+             "Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
+             "express the semantic meaning succinctly (no adjectives like 'best', 'great')."
+         )
+     )
+     type: Literal["string", "integer", "float", "boolean"] = Field(
+         description=(
+             "Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
+             "Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
+             "explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
+             "Never output arrays, objects, or composite encodings; flatten to the most specific scalar value."
+         )
+     )
+     description: str = Field(
+         description=(
+             "Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
+             "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction."
+         )
+     )
+     enum_values: Optional[List[str]] = Field(
+         default=None,
+         description=(
+             "Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
+             "stable vocabulary (2–24 lowercase tokens) is clearly evidenced or strongly implied by examples. "
+             "Do NOT invent labels. Omit if open-ended or ambiguous. Order must be stable and semantically natural."
+         ),
+     )
+
+
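+ # A minimal sketch of a hand-written FieldSpec (the field name and labels below
+ # are hypothetical, not part of the package): a closed sentiment vocabulary on
+ # a string field satisfies the structural checks in
+ # `_basic_field_list_validation`, whereas enum_values on a non-string field
+ # would be rejected.
+ #
+ #     sentiment = FieldSpec(
+ #         name="sentiment",
+ #         type="string",
+ #         description="Overall sentiment expressed in the text; ignore quoted third-party opinions.",
+ #         enum_values=["positive", "neutral", "negative"],
+ #     )
+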
+ class InferredSchema(BaseModel):
+     """Result of a schema inference round.
+
+     Contains the normalized *purpose*, an objective *examples_summary*, the
+     ordered ``fields`` contract, and the canonical reusable ``inference_prompt``.
+
+     The prompt is constrained to be fully derivable from the other components;
+     adding novel unstated facts is disallowed to preserve traceability.
+
+     Attributes:
+         purpose: Unambiguous restatement of the user's objective (noise &
+             redundancy removed).
+         examples_summary: Neutral description of structural / semantic patterns
+             observed in the examples (domain, recurring signals, constraints).
+         examples_purpose_alignment: Analytical explanation of how the concrete
+             recurring patterns in the provided examples *justify*, *constrain*,
+             or *refine* the stated purpose. Should map purpose facets to
+             observed evidence (or explicitly note gaps) to discourage
+             hallucinated fields and anchor extraction scope. This is an
+             internal quality aid; downstream consumers typically ignore it.
+         fields: Ordered list of ``FieldSpec`` objects comprising the schema's
+             sole authoritative contract.
+         inference_prompt: Self-contained extraction instructions enforcing an
+             exact field set (names, order, primitive types) with a prohibition
+             on alterations or subjective flourishes.
+     """
+
+     purpose: str = Field(
+         description=(
+             "Normalized, unambiguous restatement of the user objective with redundant, vague, or "
+             "conflicting phrasing removed."
+         )
+     )
+     examples_summary: str = Field(
+         description=(
+             "Objective characterization of the provided examples: content domain, structure, recurring "
+             "patterns, and notable constraints."
+         )
+     )
+     examples_purpose_alignment: str = Field(
+         description=(
+             "Explanation of how observable recurring patterns in the examples substantiate and bound the stated "
+             "purpose. Should reference purpose facets and cite supporting example evidence (or note any gaps) to "
+             "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
+         )
+     )
+     fields: List[FieldSpec] = Field(
+         description=(
+             "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
+             "examples and aligned with the purpose."
+         )
+     )
+     inference_prompt: str = Field(
+         description=(
+             "Canonical, reusable extraction prompt for structuring future inputs with this schema. "
+             "Must be fully derivable from 'purpose', 'examples_summary', and 'fields' (no new unstated facts or "
+             "speculation). It MUST: (1) instruct the model to output only the listed fields with the exact names "
+             "and primitive types; (2) forbid adding, removing, or renaming fields; (3) avoid subjective or "
+             "marketing language; (4) be self-contained (no TODOs, no external references, no unresolved "
+             "placeholders). Intended for direct reuse as the prompt for deterministic alignment with 'fields'."
+         )
+     )
+
+     @classmethod
+     def load(cls, path: str) -> "InferredSchema":
+         """Load an inferred schema from a JSON file.
+
+         Args:
+             path (str): Path to a UTF‑8 JSON document previously produced via ``save``.
+
+         Returns:
+             InferredSchema: Reconstructed instance.
+         """
+         with open(path, "r", encoding="utf-8") as f:
+             return cls.model_validate_json(f.read())
+
+     @property
+     def model(self) -> Type[BaseModel]:
+         """Dynamically materialized Pydantic model for the inferred schema.
+
+         Equivalent to calling :meth:`build_model` on each access (not cached).
+
+         Returns:
+             Type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
+         """
+         return self.build_model()
+
+     @property
+     def task(self) -> PreparedTask:
+         """PreparedTask integrating the schema's extraction prompt & model.
+
+         Returns:
+             PreparedTask: Ready for batched structured extraction calls.
+         """
+         return PreparedTask(
+             instructions=self.inference_prompt, response_format=self.model, top_p=None, temperature=None
+         )
+
+     def build_model(self) -> Type[BaseModel]:
+         """Create a new dynamic ``BaseModel`` class adhering to this schema.
+
+         Implementation details:
+         * Maps primitive types: string→``str``, integer→``int``, float→``float``, boolean→``bool``.
+         * For enumerated string fields, constructs an ad‑hoc ``Enum`` subclass with
+           stable member names (collision‑safe, normalized to ``UPPER_SNAKE``).
+         * All fields are required (ellipsis ``...``). Optionality can be
+           introduced later by modifying this logic if needed.
+
+         Returns:
+             Type[BaseModel]: New (not cached) model type; order matches ``fields``.
+         """
+         type_map: dict[str, type] = {"string": str, "integer": int, "float": float, "boolean": bool}
+         fields: dict[str, tuple[type, object]] = {}
+
+         for spec in self.fields:
+             py_type: type
+             if spec.enum_values:
+                 enum_class_name = "Enum_" + "".join(part.capitalize() for part in spec.name.split("_"))
+                 members: dict[str, str] = {}
+                 for raw in spec.enum_values:
+                     sanitized = raw.upper().replace("-", "_").replace(" ", "_")
+                     if not sanitized or sanitized[0].isdigit():
+                         sanitized = f"V_{sanitized}"
+                     base = sanitized
+                     i = 2
+                     while sanitized in members:
+                         sanitized = f"{base}_{i}"
+                         i += 1
+                     members[sanitized] = raw
+                 enum_cls = Enum(enum_class_name, members)  # type: ignore[arg-type]
+                 py_type = enum_cls
+             else:
+                 py_type = type_map[spec.type]
+             fields[spec.name] = (py_type, Field(description=spec.description))
+
+         model = create_model("InferredSchema", **fields)  # type: ignore[call-arg]
+         return model
+
+     def save(self, path: str) -> None:
+         """Persist this inferred schema as pretty‑printed JSON.
+
+         Args:
+             path (str): Destination filesystem path.
+         """
+         with open(path, "w", encoding="utf-8") as f:
+             f.write(self.model_dump_json(indent=2))
+
+
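+ # A minimal sketch of what `build_model` materializes (schema values below are
+ # hypothetical): a FieldSpec carrying enum_values becomes an ad-hoc Enum-typed
+ # field, and `save`/`load` round-trip the schema as JSON.
+ #
+ #     schema = InferredSchema(
+ #         purpose="Extract delivery status",
+ #         examples_summary="Short order status notices.",
+ #         examples_purpose_alignment="status facet -> explicit status word in every example",
+ #         fields=[
+ #             FieldSpec(name="order_id", type="integer", description="Numeric order identifier."),
+ #             FieldSpec(
+ #                 name="status",
+ #                 type="string",
+ #                 description="Delivery status keyword.",
+ #                 enum_values=["delayed", "delivered"],
+ #             ),
+ #         ],
+ #         inference_prompt="Extract exactly the fields order_id and status ...",
+ #     )
+ #     Model = schema.build_model()
+ #     # Model.model_fields preserves the order ["order_id", "status"]; "status"
+ #     # is typed as an Enum with members DELAYED and DELIVERED.
+ #     schema.save("/tmp/schema.json")
+ #     restored = InferredSchema.load("/tmp/schema.json")
+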
+ class SchemaInferenceInput(BaseModel):
+     """Input payload for schema inference.
+
+     Attributes:
+         examples: Representative sample texts restricted to the in‑scope
+             distribution (exclude outliers / noise). Size should be *minimal*
+             yet sufficient to surface recurring patterns.
+         purpose: Plain language description of downstream usage (analytics,
+             filtering, enrichment, feature engineering, etc.). Guides field
+             relevance & exclusion of outcome labels.
+     """
+
+     examples: List[str] = Field(
+         description=(
+             "Representative sample texts (strings). Provide only data the schema should generalize over; "
+             "exclude outliers not in scope."
+         )
+     )
+     purpose: str = Field(
+         description=(
+             "Plain language statement describing the downstream use of the extracted structured data (e.g. "
+             "analytics, filtering, enrichment)."
+         )
+     )
+
+
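+ # A minimal sketch of an inference payload, mirroring the module docstring
+ # example: keep the sample small but representative, and state the downstream
+ # use so irrelevant or outcome-label fields are excluded.
+ #
+ #     payload = SchemaInferenceInput(
+ #         examples=[
+ #             "Order #123 delayed due to weather",
+ #             "Order #456 delivered",
+ #         ],
+ #         purpose="Extract operational status signals for logistics analytics",
+ #     )
+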
+ _INFER_INSTRUCTIONS = """
+ You are a schema inference engine.
+
+ Task:
+ 1. Normalize the user's purpose (eliminate ambiguity, redundancy, contradictions).
+ 2. Objectively summarize observable patterns in the example texts.
+ 3. Produce an "examples_purpose_alignment" explanation that explicitly maps purpose facets
+    to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
+    sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
+    This MUST NOT introduce new domain facts beyond the examples & purpose.
+ 4. Propose a minimal flat set of scalar fields (no nesting / arrays) that are reliably extractable.
+ 5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
+ 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
+    is clearly evidenced; never invent.
+ 7. If the purpose indicates prediction (predict / probability / likelihood), output only
+    explanatory features (no target restatement).
+
+ Rules:
+ - Names: lower snake_case, unique, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
+ - Types: string | integer | float | boolean
+   * integer = all whole numbers
+   * float = any decimals / ratios
+   * boolean = explicit binary
+   * else use string
+ - No arrays, objects, composite encodings, or merged multi-concept fields.
+ - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
+ - enum_values only for string fields with stable closed vocab; omit otherwise.
+ - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
+   in predictive / feature engineering contexts.
+
+ Output contract:
+ Return exactly an InferredSchema object with JSON keys:
+ - purpose (string)
+ - examples_summary (string)
+ - examples_purpose_alignment (string)
+ - fields (array of FieldSpec objects: name, type, description, enum_values?)
+ - inference_prompt (string)
+ """.strip()
+
+
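+ # A hypothetical response satisfying the output contract above (all values are
+ # illustrative only, not produced by the package): every key maps onto an
+ # InferredSchema field, and enum_values appears only on a closed string field.
+ #
+ #     {
+ #       "purpose": "Extract operational status signals for logistics analytics",
+ #       "examples_summary": "Short order notices with an order number and a status phrase.",
+ #       "examples_purpose_alignment": "status facet -> status word present in each example",
+ #       "fields": [
+ #         {"name": "order_id", "type": "integer", "description": "Numeric order identifier."},
+ #         {"name": "status", "type": "string", "description": "Delivery status keyword.",
+ #          "enum_values": ["delayed", "delivered"]}
+ #       ],
+ #       "inference_prompt": "Extract exactly the fields order_id and status ..."
+ #     }
+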
+ @dataclass(frozen=True)
+ class SchemaInferer:
+     """High-level orchestrator for schema inference against the Responses API.
+
+     Responsibilities:
+     * Issue a structured parsing request with strict instructions.
+     * Retry (up to ``max_retries``) when the produced field list violates
+       baseline structural rules (duplicate names, unsupported types, etc.).
+     * Return a fully validated ``InferredSchema`` ready for dynamic model
+       generation & downstream batch extraction.
+
+     The inferred schema intentionally avoids JSON Schema intermediates; the
+     authoritative contract is the ordered ``FieldSpec`` list.
+
+     Attributes:
+         client: OpenAI client for calling ``responses.parse``.
+         model_name: Model / deployment identifier.
+     """
+
+     client: OpenAI
+     model_name: str
+
+     def infer_schema(self, data: "SchemaInferenceInput", *args, max_retries: int = 3, **kwargs) -> "InferredSchema":
+         """Infer a validated schema from representative examples.
+
+         Workflow:
+         1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
+            ``responses.parse`` requesting an ``InferredSchema`` object.
+         2. Validate the returned field list with ``_basic_field_list_validation``.
+         3. Retry (up to ``max_retries``) if validation fails.
+
+         Args:
+             data (SchemaInferenceInput): Representative examples + purpose.
+             *args: Positional passthrough to ``client.responses.parse``.
+             max_retries (int, optional): Attempts before surfacing the last validation error
+                 (must be >= 1). Defaults to 3.
+             **kwargs: Keyword passthrough to ``client.responses.parse``.
+
+         Returns:
+             InferredSchema: Fully validated schema (purpose, examples summary,
+             ordered fields, extraction prompt).
+
+         Raises:
+             ValueError: Validation still fails after exhausting retries.
+         """
+         if max_retries < 1:
+             raise ValueError("max_retries must be >= 1")
+
+         last_err: Exception | None = None
+         previous_errors: list[str] = []
+         for attempt in range(max_retries):
+             if attempt == 0:
+                 instructions = _INFER_INSTRUCTIONS
+             else:
+                 # Provide structured feedback for correction. Keep concise and prohibit speculative expansion.
+                 feedback_lines = [
+                     "--- PRIOR VALIDATION FEEDBACK ---",
+                 ]
+                 for i, err in enumerate(previous_errors[-5:], 1):  # include up to the last 5 errors
+                     feedback_lines.append(f"{i}. {err}")
+                 feedback_lines.extend(
+                     [
+                         "Adjust ONLY listed issues; avoid adding brand-new fields unless essential.",
+                         "Don't hallucinate or broaden enum_values unless enum rule caused failure.",
+                         "Duplicate names: minimally rename; keep semantics.",
+                         "Unsupported type: change to string|integer|float|boolean (no new facts).",
+                         "Bad enum length: drop enum or constrain to 2–24 evidenced tokens.",
+                     ]
+                 )
+                 instructions = _INFER_INSTRUCTIONS + "\n\n" + "\n".join(feedback_lines)
+
+             response: ParsedResponse[InferredSchema] = self.client.responses.parse(
+                 model=self.model_name,
+                 instructions=instructions,
+                 input=data.model_dump_json(),
+                 text_format=InferredSchema,
+                 *args,
+                 **kwargs,
+             )
+             parsed = response.output_parsed
+             try:
+                 _basic_field_list_validation(parsed)
+                 parsed.build_model()  # ensure dynamic model creation succeeds
+             except ValueError as e:
+                 last_err = e
+                 previous_errors.append(str(e))
+                 if attempt == max_retries - 1:
+                     raise
+                 continue
+             return parsed
+         if last_err:  # pragma: no cover
+             raise last_err
+         raise RuntimeError("unreachable retry loop state")  # pragma: no cover
+
+
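+ # A minimal end-to-end sketch (mirrors the module docstring example; assumes an
+ # OpenAI API key is configured in the environment):
+ #
+ #     client = OpenAI()
+ #     inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
+ #     schema = inferer.infer_schema(
+ #         SchemaInferenceInput(
+ #             examples=["Order #123 delayed due to weather", "Order #456 delivered"],
+ #             purpose="Extract operational status signals for logistics analytics",
+ #         ),
+ #         max_retries=3,
+ #     )
+ #     task = schema.task  # PreparedTask reusing schema.inference_prompt + schema.model
+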
+ def _basic_field_list_validation(parsed: InferredSchema) -> None:
+     """Lightweight structural validation of an inferred field list.
+
+     Checks:
+     * Non-empty field set.
+     * No duplicate names.
+     * All types in the allowed primitive set.
+     * ``enum_values`` only on string fields and size within bounds (2–24).
+
+     Args:
+         parsed (InferredSchema): Candidate ``InferredSchema`` instance.
+
+     Raises:
+         ValueError: If any invariant is violated.
+     """
+     names = [f.name for f in parsed.fields]
+     if not names:
+         raise ValueError("no fields suggested")
+     if len(names) != len(set(names)):
+         raise ValueError("duplicate field names detected")
+     allowed = {"string", "integer", "float", "boolean"}
+     for f in parsed.fields:
+         if f.type not in allowed:
+             raise ValueError(f"unsupported field type: {f.type}")
+         if f.enum_values is not None:
+             if f.type != "string":
+                 raise ValueError(f"enum_values only allowed for string field: {f.name}")
+             if not (2 <= len(f.enum_values) <= 24):
+                 raise ValueError(f"enum_values length out of bounds for field {f.name}")