openaivec 0.14.7__py3-none-any.whl → 0.14.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/_schema.py CHANGED
@@ -3,150 +3,89 @@
  This (non-public) module converts a small *representative* sample of free‑text
  examples plus a *purpose* statement into:

- 1. A vetted, flat list of scalar field specifications (``FieldSpec``) that can
- be *reliably* extracted across similar future inputs.
+ 1. A vetted hierarchical object specification (``ObjectSpec``) whose recursively
+ defined ``fields`` (``FieldSpec``) capture reliably extractable signals.
  2. A reusable, self‑contained extraction prompt (``inference_prompt``) that
- freezes the agreed schema contract (no additions / renames / omissions).
- 3. A dynamically generated Pydantic model whose fields mirror the inferred
- schema, enabling immediate typed parsing with the OpenAI Responses API.
+ freezes the agreed schema contract (no additions / renames / omissions).
+ 3. A dynamically generated Pydantic model mirroring the hierarchical schema,
+ enabling immediate typed parsing with the OpenAI Responses API.
  4. A ``PreparedTask`` wrapper (``InferredSchema.task``) for downstream batched
- responses/structured extraction flows in pandas or Spark.
+ responses / structured extraction flows in pandas or Spark.

  Core goals:
  * Minimize manual, subjective schema design iterations.
  * Enforce objective naming / typing / enum rules early (guard rails rather than
- after‑the‑fact cleaning).
- * Provide deterministic reusability: the same prompt + model yield stable
- column ordering & types for analytics or feature engineering.
+ after‑the‑fact cleaning).
+ * Provide deterministic reusability: the same prompt + model yield stable field
+ ordering & types for analytics or feature engineering.
  * Avoid outcome / target label leakage in predictive (feature engineering)
- contexts by explicitly excluding direct target restatements.
+ contexts by explicitly excluding direct target restatements.

  This module is intentionally **internal** (``__all__ = []``). Public users
  should interact through higher‑level batch APIs once a schema has been inferred.

- Design constraints:
- * Flat schema only (no nesting / arrays) to simplify Spark & pandas alignment.
- * Primitive types limited to {string, integer, float, boolean}.
- * Optional enumerations for *closed*, *observed* categorical sets only.
+ Design constraints (updated):
+ * Root: single ``ObjectSpec`` (UpperCamelCase name) containing one or more fields.
+ * Field types: string | integer | float | boolean | enum | object |
+ string_array | integer_array | float_array | boolean_array | enum_array | object_array
+ * Arrays are homogeneous lists of their base type.
+ * Nested objects / arrays of objects are allowed when semantically cohesive; keep
+ depth shallow and avoid gratuitous nesting.
+ * Enumerations use ``enum_spec`` with explicit ``name`` (UpperCamelCase) and 1–24
+ raw label values (project constant). Values collapse by uppercasing; order not guaranteed.
+ * Field names: lower_snake_case; unique per containing object.
+ * Boolean names: affirmative 'is_' prefix.
+ * Numeric (integer/float) names encode unit / measure suffix (e.g. *_count, *_ratio, *_ms).
  * Validation retries ensure a structurally coherent suggestion before returning.

  Example (conceptual):
- from openai import OpenAI
- client = OpenAI()
- inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
- schema = inferer.infer_schema(
- SchemaInferenceInput(
- examples=["Order #123 delayed due to weather", "Order #456 delivered"],
- purpose="Extract operational status signals for logistics analytics",
- )
- )
- Model = schema.model # dynamic Pydantic model
- task = schema.task # PreparedTask for batch extraction
+ from openai import OpenAI
+ client = OpenAI()
+ inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
+ schema = inferer.infer_schema(
+ SchemaInferenceInput(
+ examples=["Order #123 delayed due to weather", "Order #456 delivered"],
+ purpose="Extract operational status signals for logistics analytics",
+ )
+ )
+ Model = schema.model # dynamic Pydantic model
+ task = schema.task # PreparedTask for batch extraction

  The implementation purposefully does *not* emit or depend on JSON Schema; the
- authoritative contract is the ordered list of ``FieldSpec`` instances.
+ authoritative contract is the recursive ``ObjectSpec`` tree.
  """

  from dataclasses import dataclass
- from enum import Enum
- from typing import List, Literal, Optional, Type

  from openai import OpenAI
  from openai.types.responses import ParsedResponse
- from pydantic import BaseModel, Field, create_model
+ from pydantic import BaseModel, Field

+ from openaivec._dynamic import ObjectSpec, _build_model
  from openaivec._model import PreparedTask

  # Internal module: explicitly not part of public API
  __all__: list[str] = []


- class FieldSpec(BaseModel):
- """Specification for a single candidate output field.
-
- Each ``FieldSpec`` encodes a *flat*, scalar, semantically atomic unit the
- model should extract. These become columns in downstream DataFrames.
-
- Validation focuses on: objective naming, primitive typing, and *optional*
- closed categorical vocabularies. Enumerations are intentionally conservative
- (must derive from clear evidence) to reduce over‑fitted schemas.
-
- Attributes:
- name: Lower snake_case unique identifier (regex ^[a-z][a-z0-9_]*$). Avoid
- subjective modifiers ("best", "great", "high_quality").
- type: One of ``string|integer|float|boolean``. ``integer`` only if all
- observed numeric values are whole numbers; ``float`` if any decimal
- or ratio appears. ``boolean`` strictly for explicit binary forms.
- description: Concise, objective extraction rule (what qualifies / what
- to ignore). Disambiguate from overlapping fields if needed.
- enum_values: Optional stable closed set of lowercase string labels
- (2–24). Only for *string* type when the vocabulary is clearly
- evidenced; never hallucinate or extrapolate.
- """
-
- name: str = Field(
- description=(
- "Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
- "express the semantic meaning succinctly (no adjectives like 'best', 'great'). For numeric (integer|float) "
- "fields the name MUST include an explicit unit or measure suffix (e.g. _count, _total_count, "
- "_duration_seconds, _ms, _price_usd, _ratio, _score) to eliminate ambiguity. Avoid bare numeric nouns like "
- "'duration' or 'value' without unit/scale. Boolean field names MUST begin with 'is_' followed by a "
- "descriptive predicate (e.g. is_active, is_delayed). Use positive forms (is_active) rather than "
- "negated forms (is_not_active)."
- )
- )
- type: Literal["string", "integer", "float", "boolean"] = Field(
- description=(
- "Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
- "Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
- "explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
- "Never output arrays, objects, or composite encodings; flatten to the most specific scalar value."
- )
- )
- description: str = Field(
- description=(
- "Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
- "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction. "
- "Do NOT simply restate an original JSON/key name if the examples are already structured; only include a "
- "raw key verbatim when it is already the minimal, irreducible analytic unit. For derived fields, clearly "
- "state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
- )
- )
- enum_values: Optional[List[str]] = Field(
- default=None,
- description=(
- "Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
- "stable vocabulary (2–24 lowercase tokens) is clearly evidenced or strongly implied by examples. "
- "Do NOT invent labels. Omit if open-ended or ambiguous. Order must be stable and semantically natural."
- ),
- )
-
-
  class InferredSchema(BaseModel):
  """Result of a schema inference round.

- Contains the normalized *purpose*, an objective *examples_summary*, the
- ordered ``fields`` contract, and the canonical reusable ``inference_prompt``.
-
- The prompt is constrained to be fully derivable from the other components;
- adding novel unstated facts is disallowed to preserve traceability.
+ Contains the normalized *purpose*, objective *examples_summary*, the root
+ hierarchical ``object_spec`` contract, and the canonical reusable
+ ``inference_prompt``. The prompt MUST be fully derivable from the other
+ components (no new unstated facts) to preserve traceability.

  Attributes:
- purpose: Unambiguous restatement of the user's objective (noise &
- redundancy removed).
+ purpose: Unambiguous restatement of the user's objective.
  examples_summary: Neutral description of structural / semantic patterns
- observed in the examples (domain, recurring signals, constraints).
- examples_purpose_alignment: Analytical explanation of how the concrete
- recurring patterns in the provided examples *justify*, *constrain*,
- or *refine* the stated purpose. Should map purpose facets to
- observed evidence (or explicitly note gaps) to discourage
- hallucinated fields and anchor extraction scope. This is an
- internal quality aid; downstream consumers typically ignore it.
- fields: Ordered list of ``FieldSpec`` objects comprising the schema's
- sole authoritative contract.
- inference_prompt: Self-contained extraction instructions enforcing an
- exact field set (names, order, primitive types) with prohibition on
- alterations or subjective flourishes.
+ observed in the examples.
+ examples_purpose_alignment: Mapping from purpose facets to concrete
+ recurring evidence (or explicit gaps) anchoring extraction scope.
+ object_spec: Root ``ObjectSpec`` (UpperCamelCase name) whose ``fields``
+ recursively define the extraction schema.
+ inference_prompt: Canonical instructions enforcing exact field names,
+ hierarchy, and types (no additions/removals/renames).
  """

  purpose: str = Field(
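
To make the new contract concrete, here is a minimal sketch of a hierarchical spec as this hunk describes it. The keyword arguments mirror the output contract later in this diff (name, type, description, enum_spec); the actual ``ObjectSpec``/``FieldSpec``/``EnumSpec`` constructors live in ``openaivec._dynamic``, which this diff does not show, so the import line and exact signatures are assumptions.

    # Hypothetical construction; the real signatures are defined in openaivec._dynamic.
    from openaivec._dynamic import EnumSpec, FieldSpec, ObjectSpec

    order_spec = ObjectSpec(
        name="OrderStatus",  # root object name: UpperCamelCase
        fields=[
            # numeric names carry an explicit unit / measure suffix
            FieldSpec(name="delay_hours_count", type="integer",
                      description="Whole hours of reported delay; 0 if none."),
            # boolean names use an affirmative 'is_' prefix
            FieldSpec(name="is_delayed", type="boolean",
                      description="True when the text states the order is delayed."),
            # enum only for a closed, observed label set
            FieldSpec(name="status", type="enum",
                      description="Operational status label taken from the text.",
                      enum_spec=EnumSpec(name="Status", values=["delayed", "delivered"])),
        ],
    )
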
@@ -168,20 +107,17 @@ class InferredSchema(BaseModel):
  "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
  )
  )
- fields: List[FieldSpec] = Field(
+ object_spec: ObjectSpec = Field(
  description=(
- "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
- "examples and aligned with the purpose."
+ "Root ObjectSpec (recursive). Each contained object's field list is unique-name ordered and derived "
+ "strictly from observable, repeatable signals aligned with the purpose."
  )
  )
  inference_prompt: str = Field(
  description=(
- "Canonical, reusable extraction prompt for structuring future inputs with this schema. "
- "Must be fully derivable from 'purpose', 'examples_summary', and 'fields' (no new unstated facts or "
- "speculation). It MUST: (1) instruct the model to output only the listed fields with the exact names "
- "and primitive types; (2) forbid adding, removing, or renaming fields; (3) avoid subjective or "
- "marketing language; (4) be self-contained (no TODOs, no external references, no unresolved "
- "placeholders). Intended for direct reuse as the prompt for deterministic alignment with 'fields'."
+ "Canonical, reusable extraction prompt. Must be derivable from purpose + summaries + object_spec. Enforces "
+ "exact hierarchical field set (names, order per object, types) forbidding additions, removals, renames, or "
+ "subjective language. Self-contained (no TODOs, external refs, or placeholders)."
  )
  )

@@ -199,13 +135,13 @@ class InferredSchema(BaseModel):
  return cls.model_validate_json(f.read())

  @property
- def model(self) -> Type[BaseModel]:
+ def model(self) -> type[BaseModel]:
  """Dynamically materialized Pydantic model for the inferred schema.

  Equivalent to calling :meth:`build_model` each access (not cached).

  Returns:
- Type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
+ type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
  """
  return self.build_model()

@@ -220,45 +156,8 @@
  instructions=self.inference_prompt, response_format=self.model, top_p=None, temperature=None
  )

- def build_model(self) -> Type[BaseModel]:
- """Create a new dynamic ``BaseModel`` class adhering to this schema.
-
- Implementation details:
- * Maps primitive types: string→``str``, integer→``int``, float→``float``, boolean→``bool``.
- * For enumerated string fields, constructs an ad‑hoc ``Enum`` subclass with
- stable member names (collision‑safe, normalized to ``UPPER_SNAKE``).
- * All fields are required (ellipsis ``...``). Optionality can be
- introduced later by modifying this logic if needed.
-
- Returns:
- Type[BaseModel]: New (not cached) model type; order matches ``fields``.
- """
- type_map: dict[str, type] = {"string": str, "integer": int, "float": float, "boolean": bool}
- fields: dict[str, tuple[type, object]] = {}
-
- for spec in self.fields:
- py_type: type
- if spec.enum_values:
- enum_class_name = "Enum_" + "".join(part.capitalize() for part in spec.name.split("_"))
- members: dict[str, str] = {}
- for raw in spec.enum_values:
- sanitized = raw.upper().replace("-", "_").replace(" ", "_")
- if not sanitized or sanitized[0].isdigit():
- sanitized = f"V_{sanitized}"
- base = sanitized
- i = 2
- while sanitized in members:
- sanitized = f"{base}_{i}"
- i += 1
- members[sanitized] = raw
- enum_cls = Enum(enum_class_name, members) # type: ignore[arg-type]
- py_type = enum_cls
- else:
- py_type = type_map[spec.type]
- fields[spec.name] = (py_type, Field(description=spec.description))
-
- model = create_model("InferredSchema", **fields) # type: ignore[call-arg]
- return model
+ def build_model(self) -> type[BaseModel]:
+ return _build_model(self.object_spec)

  def save(self, path: str) -> None:
  """Persist this inferred schema as pretty‑printed JSON.
@@ -282,7 +181,7 @@ class SchemaInferenceInput(BaseModel):
  relevance & exclusion of outcome labels.
  """

- examples: List[str] = Field(
+ examples: list[str] = Field(
  description=(
  "Representative sample texts (strings). Provide only data the schema should generalize over; "
  "exclude outliers not in scope."
@@ -302,51 +201,41 @@ You are a schema inference engine.
  Task:
  1. Normalize the user's purpose (eliminate ambiguity, redundancy, contradictions).
  2. Objectively summarize observable patterns in the example texts.
- 3. Produce an "examples_purpose_alignment" explanation that explicitly maps purpose facets
- to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
- sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
- This MUST NOT introduce new domain facts beyond the examples & purpose.
- 4. Propose a minimal flat set of scalar fields (no nesting / arrays) that are reliably extractable.
+ 3. Produce an "examples_purpose_alignment" explanation mapping purpose facets to concrete recurring evidence (or gaps).
+ 4. Propose a minimal hierarchical schema (root ObjectSpec) comprised of reliably extractable fields. Use nesting ONLY
+ when a group of fields forms a cohesive sub-entity repeated in the data; otherwise keep flat.
  5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
- 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
- is clearly evidenced; never invent.
- 7. If the purpose indicates prediction (predict / probability / likelihood), output only
- explanatory features (no target restatement).
+ 6. Provide enum_spec ONLY when a small stable closed categorical set (1–{_MAX_ENUM_VALUES} raw tokens) is clearly
+ evidenced; never invent unseen categories.
+ 7. If the purpose indicates prediction (predict / probability / likelihood),
+ output only explanatory features (no target restatement).

  Rules:
- - Names: lower snake_case, unique, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
- - Types: string | integer | float | boolean
- * integer = all whole numbers
- * float = any decimals / ratios
- * boolean = explicit binary
- * else use string
- - Numeric (integer|float) field names MUST encode an explicit unit / scale / measure suffix
- (e.g. *_count, *_seconds, *_ms, *_usd, *_ratio, *_score). Avoid ambiguous bare numeric names.
- - Boolean field names MUST start with 'is_' followed by a positive predicate (e.g. is_active,
- is_delayed). Avoid negated forms.
- - No arrays, objects, composite encodings, or merged multi-concept fields.
- - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
- - enum_values only for string fields with stable closed vocab; omit otherwise.
- - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
- in predictive / feature engineering contexts.
- - When examples already appear as serialized JSON / key-value records, DO NOT merely relist the
- raw original keys unless each is already an atomic, irreducible analytic signal. Prefer high-signal
- derived / normalized / aggregated features (e.g. sentiment, category, language_code, boolean flags,
- normalized_date, count metrics).
- - Superficial renames (adding trivial prefixes/suffixes like _value, _field, new_) are forbidden; a new
- field name must reflect a semantic transformation.
- - Keep field count focused (typically <= 12) prioritizing reusable analytical / ML features over low-signal
- restatements.
- - If you retain an original raw key unchanged, its description must justify why it is minimal and cannot
- be further decomposed without losing analytical value.
+ - Field names: lower snake_case, unique within each object, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
+ - Field types: string | integer | float | boolean | enum | object | string_array | integer_array | float_array |
+ boolean_array | enum_array | object_array
+ * *_array are homogeneous lists of their primitive / enum / object base type.
+ * Use object/object_array ONLY for semantically cohesive grouped attributes; avoid gratuitous layers.
+ - Enumerations: use enum_spec { name (UpperCamelCase), values [raw_tokens...] }. values length 1–{_MAX_ENUM_VALUES}.
+ Use ONLY when closed set is evidenced. Otherwise, use string.
+ - Numeric (integer|float) names encode explicit unit/measure suffix (e.g. *_count, *_seconds, *_usd, *_ratio, *_score).
+ - Boolean names start with 'is_' followed by positive predicate (no negations like is_not_*).
+ - Array field names SHOULD end with '_array' for primitive/enum arrays; object_array
+ fields may use plural noun or *_array pattern.
+ - Descriptions: concise, objective extraction criteria (no marketing/emotion/speculation).
+ - Exclude direct outcome labels in predictive contexts.
+ - Avoid superficial renames; semantic transformation only.
+ - Keep total field count focused (typically <= 16) optimizing for reusable analytical / ML features.

  Output contract:
- Return exactly an InferredSchema object with JSON keys:
- - purpose (string)
- - examples_summary (string)
- - examples_purpose_alignment (string)
- - fields (array of FieldSpec objects: name, type, description, enum_values?)
- - inference_prompt (string)
+ Return exactly an InferredSchema JSON object with keys:
+ - purpose (string)
+ - examples_summary (string)
+ - examples_purpose_alignment (string)
+ - object_spec (ObjectSpec: name, fields[list[FieldSpec]])
+ - inference_prompt (string)
+ Where each FieldSpec includes: name, type, description, optional enum_spec (for
+ enum / enum_array), optional object_spec (for object / object_array).
  """.strip()

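Spelled out, this output contract asks the model for a payload shaped like the following hedged illustration (every value here is invented; only the key spelling comes from the contract lines above):

    {
        "purpose": "Extract operational status signals for logistics analytics",
        "examples_summary": "Short order-status sentences containing an id and a status phrase",
        "examples_purpose_alignment": "status facet -> 'delayed'/'delivered' phrases; id facet -> 'Order #123'",
        "object_spec": {
            "name": "OrderStatus",
            "fields": [
                {"name": "is_delayed", "type": "boolean",
                 "description": "True when the text states the order is delayed."},
                {"name": "status", "type": "enum",
                 "description": "Operational status label.",
                 "enum_spec": {"name": "Status", "values": ["delayed", "delivered"]}},
            ],
        },
        "inference_prompt": "Extract exactly the fields defined below; do not add, remove, or rename...",
    }
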
@@ -372,14 +261,15 @@ class SchemaInferer:
  client: OpenAI
  model_name: str

- def infer_schema(self, data: "SchemaInferenceInput", *args, max_retries: int = 3, **kwargs) -> "InferredSchema":
+ def infer_schema(self, data: SchemaInferenceInput, *args, max_retries: int = 8, **kwargs) -> InferredSchema:
  """Infer a validated schema from representative examples.

- Workflow:
- 1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
- ``responses.parse`` requesting an ``InferredSchema`` object.
- 2. Validate the returned field list with ``_basic_field_list_validation``.
- 3. Retry (up to ``max_retries``) if validation fails.
+ Workflow:
+ 1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
+ ``responses.parse`` requesting an ``InferredSchema`` object.
+ 2. Attempt dynamic model build (``parsed.build_model()``) which performs recursive
+ structural validation (names, types, enum/object specs) via the dynamic layer.
+ 3. Retry (up to ``max_retries``) on validation failure.

  Args:
  data (SchemaInferenceInput): Representative examples + purpose.
@@ -431,46 +321,17 @@ class SchemaInferer:
  )
  parsed = response.output_parsed
  try:
- _basic_field_list_validation(parsed)
- parsed.build_model() # ensure dynamic model creation succeeds
+ # Validate the field list structure
+ parsed.build_model()
+ return parsed
  except ValueError as e:
  last_err = e
  previous_errors.append(str(e))
  if attempt == max_retries - 1:
- raise
- continue
- return parsed
- if last_err: # pragma: no cover
- raise last_err
- raise RuntimeError("unreachable retry loop state") # pragma: no cover
-
-
- def _basic_field_list_validation(parsed: InferredSchema) -> None:
- """Lightweight structural validation of an inferred field list.
+ raise ValueError(
+ f"Schema validation failed after {max_retries} attempts. Last error: {last_err}"
+ ) from last_err

- Checks:
- * Non-empty field set.
- * No duplicate names.
- * All types in the allowed primitive set.
- * ``enum_values`` only on string fields and size within bounds (2–24).
-
- Args:
- parsed (InferredSchema): Candidate ``InferredSchema`` instance.
-
- Raises:
- ValueError: Any invariant is violated.
- """
- names = [f.name for f in parsed.fields]
- if not names:
- raise ValueError("no fields suggested")
- if len(names) != len(set(names)):
- raise ValueError("duplicate field names detected")
- allowed = {"string", "integer", "float", "boolean"}
- for f in parsed.fields:
- if f.type not in allowed:
- raise ValueError(f"unsupported field type: {f.type}")
- if f.enum_values is not None:
- if f.type != "string":
- raise ValueError(f"enum_values only allowed for string field: {f.name}")
- if not (2 <= len(f.enum_values) <= 24):
- raise ValueError(f"enum_values length out of bounds for field {f.name}")
+ if last_err:
+ raise last_err
+ raise RuntimeError("unreachable retry loop state")
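
The rewritten loop returns as soon as ``parsed.build_model()`` succeeds (the recursive structural checks having replaced ``_basic_field_list_validation``) and, on the final attempt, raises a wrapped ``ValueError`` rather than re-raising bare. Callers therefore see either a validated schema or one summarizing error. A hedged usage sketch, reusing the docstring's example inputs:

    from openai import OpenAI

    payload = SchemaInferenceInput(
        examples=["Order #123 delayed due to weather", "Order #456 delivered"],
        purpose="Extract operational status signals for logistics analytics",
    )
    inferer = SchemaInferer(client=OpenAI(), model_name="gpt-4.1-mini")
    try:
        schema = inferer.infer_schema(payload, max_retries=8)  # default raised from 3 to 8
    except ValueError as err:
        # message shape: "Schema validation failed after 8 attempts. Last error: ..."
        print(f"inference did not converge: {err}")
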
openaivec/_serialize.py CHANGED
@@ -4,19 +4,19 @@ This module provides utilities for converting Pydantic BaseModel classes
  to and from JSON schema representations with simplified, maintainable code.
  """

- from typing import Any, Dict, List, Literal, Tuple, Type, Union
+ from typing import Any, Literal

  from pydantic import BaseModel, Field, create_model

  __all__ = []


- def serialize_base_model(obj: Type[BaseModel]) -> Dict[str, Any]:
+ def serialize_base_model(obj: type[BaseModel]) -> dict[str, Any]:
  """Serialize a Pydantic BaseModel to JSON schema."""
  return obj.model_json_schema()


- def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
+ def dereference_json_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
  """Dereference JSON schema by resolving $ref pointers with circular reference protection."""
  model_map = json_schema.get("$defs", {})

@@ -61,7 +61,7 @@ def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
  # ============================================================================


- def _resolve_union_type(union_options: List[Dict[str, Any]]) -> Type:
+ def _resolve_union_type(union_options: list[dict[str, Any]]) -> type:
  """Resolve anyOf/oneOf to Union type."""
  union_types = []
  for option in union_options:
@@ -75,12 +75,14 @@ def _resolve_union_type(union_options: List[Dict[str, Any]]) -> Type:
  elif len(union_types) == 2 and type(None) in union_types:
  # Optional type: T | None
  non_none_type = next(t for t in union_types if t is not type(None))
- return Union[non_none_type, type(None)] # type: ignore[return-value]
+ return non_none_type | None # type: ignore[return-value]
  else:
+ from typing import Union
+
  return Union[tuple(union_types)] # type: ignore[return-value]


- def _resolve_basic_type(type_name: str, field_def: Dict[str, Any]) -> Type:
+ def _resolve_basic_type(type_name: str, field_def: dict[str, Any]) -> type:
  """Resolve basic JSON schema types to Python types."""
  type_mapping = {
  "string": str,
@@ -101,14 +103,14 @@ def _resolve_basic_type(type_name: str, field_def: Dict[str, Any]) -> Type:
  elif type_name == "array":
  if "items" in field_def:
  inner_type = parse_field(field_def["items"])
- return List[inner_type]
+ return list[inner_type]
  else:
- return List[Any]
+ return list[Any]
  else:
  raise ValueError(f"Unsupported type: {type_name}")


- def parse_field(field_def: Dict[str, Any]) -> Type:
+ def parse_field(field_def: dict[str, Any]) -> type:
  """Parse a JSON schema field definition to a Python type.

  Simplified version with clear separation of concerns.
@@ -141,17 +143,19 @@ def _create_field_info(description: str | None, default_value: Any, is_required:
  return Field(default=default_value, description=description) if description else Field(default=default_value)


- def _make_optional_if_needed(field_type: Type, is_required: bool, has_default: bool) -> Type:
+ def _make_optional_if_needed(field_type: type, is_required: bool, has_default: bool) -> type:
  """Make field type optional if needed."""
  if is_required or has_default:
  return field_type

  # Check if already nullable
+ from typing import Union
+
  if hasattr(field_type, "__origin__") and field_type.__origin__ is Union and type(None) in field_type.__args__:
  return field_type

  # Make optional
- return Union[field_type, type(None)] # type: ignore[return-value]
+ return field_type | None # type: ignore[return-value]


  # ============================================================================
@@ -159,7 +163,7 @@ def _make_optional_if_needed(field_type: Type, is_required: bool, has_default: b
  # ============================================================================


- def _process_enum_field(field_name: str, field_def: Dict[str, Any], is_required: bool) -> Tuple[Type, Field]: # type: ignore[type-arg]
+ def _process_enum_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]: # type: ignore[type-arg]
  """Process enum field with Literal type."""
  enum_values = field_def["enum"]

@@ -175,14 +179,14 @@ def _process_enum_field(field_name: str, field_def: Dict[str, Any], is_required:
  has_default = default_value is not None

  if not is_required and not has_default:
- literal_type = Union[literal_type, type(None)] # type: ignore[assignment]
+ literal_type = literal_type | None # type: ignore[assignment]
  default_value = None

  field_info = _create_field_info(description, default_value, is_required)
  return literal_type, field_info # type: ignore[return-value]


- def _process_regular_field(field_name: str, field_def: Dict[str, Any], is_required: bool) -> Tuple[Type, Field]: # type: ignore[type-arg]
+ def _process_regular_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]: # type: ignore[type-arg]
  """Process regular (non-enum) field."""
  field_type = parse_field(field_def)
  description = field_def.get("description")

@@ -204,7 +208,7 @@ def _process_regular_field(field_name: str, field_def: Dict[str, Any], is_requir
  # ============================================================================


- def deserialize_base_model(json_schema: Dict[str, Any]) -> Type[BaseModel]:
+ def deserialize_base_model(json_schema: dict[str, Any]) -> type[BaseModel]:
  """Deserialize a JSON schema to a Pydantic BaseModel class.

  Refactored version with clear separation of concerns and simplified logic.
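
Together with ``serialize_base_model`` at the top of the file, this supports a schema round trip. A minimal sketch under the signatures shown in this diff (the model and field names are invented; the import works despite ``__all__ = []`` because Python does not enforce it on direct imports):

    from typing import Any
    from pydantic import BaseModel, Field

    from openaivec._serialize import deserialize_base_model, serialize_base_model

    class Ticket(BaseModel):
        subject: str = Field(description="Short summary line")
        priority_score: float | None = None  # PEP 604 union, matching the file's new style

    json_schema: dict[str, Any] = serialize_base_model(Ticket)  # thin wrapper over model_json_schema()
    Rebuilt = deserialize_base_model(json_schema)                # dynamic BaseModel subclass
    row = Rebuilt.model_validate({"subject": "printer jam", "priority_score": 0.7})
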