openaivec 0.12.5__py3-none-any.whl → 1.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. openaivec/__init__.py +13 -4
  2. openaivec/_cache/__init__.py +12 -0
  3. openaivec/_cache/optimize.py +109 -0
  4. openaivec/_cache/proxy.py +806 -0
  5. openaivec/{di.py → _di.py} +36 -12
  6. openaivec/_embeddings.py +203 -0
  7. openaivec/{log.py → _log.py} +2 -2
  8. openaivec/_model.py +113 -0
  9. openaivec/{prompt.py → _prompt.py} +95 -28
  10. openaivec/_provider.py +207 -0
  11. openaivec/_responses.py +511 -0
  12. openaivec/_schema/__init__.py +9 -0
  13. openaivec/_schema/infer.py +340 -0
  14. openaivec/_schema/spec.py +350 -0
  15. openaivec/_serialize.py +234 -0
  16. openaivec/{util.py → _util.py} +25 -85
  17. openaivec/pandas_ext.py +1496 -318
  18. openaivec/spark.py +485 -183
  19. openaivec/task/__init__.py +9 -7
  20. openaivec/task/customer_support/__init__.py +9 -15
  21. openaivec/task/customer_support/customer_sentiment.py +17 -15
  22. openaivec/task/customer_support/inquiry_classification.py +23 -22
  23. openaivec/task/customer_support/inquiry_summary.py +14 -13
  24. openaivec/task/customer_support/intent_analysis.py +21 -19
  25. openaivec/task/customer_support/response_suggestion.py +16 -16
  26. openaivec/task/customer_support/urgency_analysis.py +24 -25
  27. openaivec/task/nlp/__init__.py +4 -4
  28. openaivec/task/nlp/dependency_parsing.py +10 -12
  29. openaivec/task/nlp/keyword_extraction.py +11 -14
  30. openaivec/task/nlp/morphological_analysis.py +12 -14
  31. openaivec/task/nlp/named_entity_recognition.py +16 -18
  32. openaivec/task/nlp/sentiment_analysis.py +14 -11
  33. openaivec/task/nlp/translation.py +6 -9
  34. openaivec/task/table/__init__.py +2 -2
  35. openaivec/task/table/fillna.py +11 -11
  36. openaivec-1.0.10.dist-info/METADATA +399 -0
  37. openaivec-1.0.10.dist-info/RECORD +39 -0
  38. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
  39. openaivec/embeddings.py +0 -172
  40. openaivec/model.py +0 -67
  41. openaivec/provider.py +0 -45
  42. openaivec/responses.py +0 -393
  43. openaivec/serialize.py +0 -225
  44. openaivec-0.12.5.dist-info/METADATA +0 -696
  45. openaivec-0.12.5.dist-info/RECORD +0 -33
  46. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,340 @@
1
+ """Internal schema inference & dynamic model materialization utilities.
2
+
3
+ This (non-public) module converts a small *representative* sample of free‑text
4
+ examples plus an *instructions* statement into:
5
+
6
+ 1. A vetted hierarchical object specification (``ObjectSpec``) whose recursively
7
+ defined ``fields`` (``FieldSpec``) capture reliably extractable signals.
8
+ 2. A reusable, self‑contained extraction prompt (``inference_prompt``) that
9
+ freezes the agreed schema contract (no additions / renames / omissions).
10
+ 3. A dynamically generated Pydantic model mirroring the hierarchical schema,
11
+ enabling immediate typed parsing with the OpenAI Responses API.
12
+ 4. A ``PreparedTask`` wrapper (``InferredSchema.task``) for downstream batched
13
+ responses / structured extraction flows in pandas or Spark.
14
+
15
+ Core goals:
16
+ * Minimize manual, subjective schema design iterations.
17
+ * Enforce objective naming / typing / enum rules early (guard rails rather than
18
+ after‑the‑fact cleaning).
19
+ * Provide deterministic reusability: the same prompt + model yield stable field
20
+ ordering & types for analytics or feature engineering.
21
+ * Avoid outcome / target label leakage in predictive (feature engineering)
22
+ contexts by explicitly excluding direct target restatements.
23
+
24
+ This module is intentionally **internal** (``__all__ = []``). Public users
25
+ should interact through higher‑level batch APIs once a schema has been inferred.
26
+
27
+ Design constraints (updated):
28
+ * Root: single ``ObjectSpec`` (UpperCamelCase name) containing one or more fields.
29
+ * Field types: string | integer | float | boolean | enum | object |
30
+ string_array | integer_array | float_array | boolean_array | enum_array | object_array
31
+ * Arrays are homogeneous lists of their base type.
32
+ * Nested objects / arrays of objects are allowed when semantically cohesive; keep
33
+ depth shallow and avoid gratuitous nesting.
34
+ * Enumerations use ``enum_spec`` with explicit ``name`` (UpperCamelCase) and 1–24
35
+ raw label values (project constant). Values collapse by uppercasing; order not guaranteed.
36
+ * Field names: lower_snake_case; unique per containing object.
37
+ * Boolean names: affirmative 'is_' prefix.
38
+ * Numeric (integer/float) names encode unit / measure suffix (e.g. *_count, *_ratio, *_ms).
39
+ * Validation retries ensure a structurally coherent suggestion before returning.
40
+
41
+ Example (conceptual):
42
+ from openai import OpenAI
43
+ client = OpenAI()
44
+ inferer = SchemaInferer(client=client, model_name="gpt-4.1-mini")
45
+ schema = inferer.infer_schema(
46
+ SchemaInferenceInput(
47
+ examples=["Order #123 delayed due to weather", "Order #456 delivered"],
48
+ instructions="Extract operational status signals for logistics analytics",
49
+ )
50
+ )
51
+ Model = schema.model # dynamic Pydantic model
52
+ task = schema.task # PreparedTask for batch extraction
53
+
54
+ The implementation purposefully does *not* emit or depend on JSON Schema; the
55
+ authoritative contract is the recursive ``ObjectSpec`` tree.
56
+ """
57
+
58
+ from dataclasses import dataclass
59
+
60
+ from openai import OpenAI
61
+ from openai.types.responses import ParsedResponse
62
+ from pydantic import BaseModel, Field
63
+
64
+ from openaivec._model import PreparedTask
65
+ from openaivec._schema.spec import ObjectSpec, _build_model
66
+
67
+ # Internal module: explicitly not part of public API
68
+ __all__: list[str] = []
69
+
70
+
71
class SchemaInferenceOutput(BaseModel):
    """Result of a schema inference round.

    Contains the normalized *instructions*, objective *examples_summary*, the root
    hierarchical ``object_spec`` contract, and the canonical reusable
    ``inference_prompt``. The prompt MUST be fully derivable from the other
    components (no new unstated facts) to preserve traceability.

    Attributes:
        instructions: Unambiguous restatement of the user's objective.
        examples_summary: Neutral description of structural / semantic patterns
            observed in the examples.
        examples_instructions_alignment: Mapping from instructions facets to concrete
            recurring evidence (or explicit gaps) anchoring extraction scope.
        object_spec: Root ``ObjectSpec`` (UpperCamelCase name) whose ``fields``
            recursively define the extraction schema.
        inference_prompt: Canonical instructions enforcing exact field names,
            hierarchy, and types (no additions/removals/renames).
    """

    instructions: str = Field(
        description=(
            "Normalized, unambiguous restatement of the user objective with redundant, vague, or "
            "conflicting phrasing removed."
        )
    )
    examples_summary: str = Field(
        description=(
            "Objective characterization of the provided examples: content domain, structure, recurring "
            "patterns, and notable constraints."
        )
    )
    examples_instructions_alignment: str = Field(
        description=(
            "Explanation of how observable recurring patterns in the examples substantiate and bound the stated "
            "instructions. Should reference instructions facets and cite supporting example evidence (or note any "
            "gaps) to reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream "
            "extraction."
        )
    )
    object_spec: ObjectSpec = Field(
        description=(
            "Root ObjectSpec (recursive). Each contained object's field list is unique-name ordered and derived "
            "strictly from observable, repeatable signals aligned with the instructions."
        )
    )
    inference_prompt: str = Field(
        description=(
            "Canonical, reusable extraction prompt. Must be derivable from instructions + summaries + object_spec. "
            "Enforces exact hierarchical field set (names, order per object, types) forbidding additions, removals, "
            "renames, or subjective language. Self-contained (no TODOs, external refs, or placeholders)."
        )
    )

    @classmethod
    def load(cls, path: str) -> "SchemaInferenceOutput":
        """Load an inferred schema from a JSON file.

        Args:
            path (str): Path to a UTF‑8 JSON document previously produced via ``save``.

        Returns:
            SchemaInferenceOutput: Reconstructed instance.
        """
        with open(path, "r", encoding="utf-8") as f:
            return cls.model_validate_json(f.read())

    @property
    def model(self) -> type[BaseModel]:
        """Dynamically materialized Pydantic model for the inferred schema.

        Equivalent to calling :meth:`build_model` each access (not cached).

        Returns:
            type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
        """
        return self.build_model()

    @property
    def task(self) -> PreparedTask:
        """PreparedTask integrating the schema's extraction prompt & model.

        Returns:
            PreparedTask: Ready for batched structured extraction calls.
        """
        return PreparedTask(
            instructions=self.inference_prompt,
            response_format=self.model,
        )

    def build_model(self) -> type[BaseModel]:
        """Materialize (and structurally validate) the Pydantic model for ``object_spec``.

        Returns:
            type[BaseModel]: Newly created model class.

        Raises:
            ValueError: If ``object_spec`` violates structural rules.
        """
        return _build_model(self.object_spec)

    def save(self, path: str) -> None:
        """Persist this inferred schema as pretty‑printed JSON.

        Args:
            path (str): Destination filesystem path.
        """
        with open(path, "w", encoding="utf-8") as f:
            f.write(self.model_dump_json(indent=2))
172
+
173
+
174
class SchemaInferenceInput(BaseModel):
    """Input payload for schema inference.

    Attributes:
        examples: Representative sample texts restricted to the in‑scope
            distribution (exclude outliers / noise). Size should be *minimal*
            yet sufficient to surface recurring patterns.
        instructions: Plain language description of downstream usage (analytics,
            filtering, enrichment, feature engineering, etc.). Guides field
            relevance & exclusion of outcome labels.
    """

    # Consumed by SchemaInferer.infer_schema, which serializes this payload via
    # model_dump_json() and sends it as the Responses API input.
    examples: list[str] = Field(
        description=(
            "Representative sample texts (strings). Provide only data the schema should generalize over; "
            "exclude outliers not in scope."
        )
    )
    instructions: str = Field(
        description=(
            "Plain language statement describing the downstream use of the extracted structured data (e.g. "
            "analytics, filtering, enrichment)."
        )
    )
198
+
199
+
200
+ _INFER_INSTRUCTIONS = """
201
+ You are a schema inference engine.
202
+
203
+ Task:
204
+ 1. Normalize the user's instructions (eliminate ambiguity, redundancy, contradictions).
205
+ 2. Objectively summarize observable patterns in the example texts.
206
+ 3. Produce an "examples_instructions_alignment" explanation mapping instructions facets to concrete recurring
207
+ evidence (or gaps).
208
+ 4. Propose a minimal hierarchical schema (root ObjectSpec) comprised of reliably extractable fields. Use nesting ONLY
209
+ when a group of fields forms a cohesive sub-entity repeated in the data; otherwise keep flat.
210
+ 5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
211
+ 6. Provide enum_spec ONLY when a small stable closed categorical set (1–{_MAX_ENUM_VALUES} raw tokens) is clearly
212
+ evidenced; never invent unseen categories.
213
+ 7. If the instructions indicate prediction (predict / probability / likelihood),
214
+ output only explanatory features (no target restatement).
215
+
216
+ Rules:
217
+ - Field names: lower snake_case, unique within each object, regex ^[a-z][a-z0-9_]*$, no subjective adjectives.
218
+ - Field types: string | integer | float | boolean | enum | object | string_array | integer_array | float_array |
219
+ boolean_array | enum_array | object_array
220
+ * *_array are homogeneous lists of their primitive / enum / object base type.
221
+ * Use object/object_array ONLY for semantically cohesive grouped attributes; avoid gratuitous layers.
222
+ - Enumerations: use enum_spec { name (UpperCamelCase), values [raw_tokens...] }. values length 1–{_MAX_ENUM_VALUES}.
223
+ Use ONLY when closed set is evidenced. Otherwise, use string.
224
+ - Numeric (integer|float) names encode explicit unit/measure suffix (e.g. *_count, *_seconds, *_usd, *_ratio, *_score).
225
+ - Boolean names start with 'is_' followed by positive predicate (no negations like is_not_*).
226
+ - Array field names SHOULD end with '_array' for primitive/enum arrays; object_array
227
+ fields may use plural noun or *_array pattern.
228
+ - Descriptions: concise, objective extraction criteria (no marketing/emotion/speculation).
229
+ - Exclude direct outcome labels in predictive contexts.
230
+ - Avoid superficial renames; semantic transformation only.
231
+ - Keep total field count focused (typically <= 16) optimizing for reusable analytical / ML features.
232
+
233
+ Output contract:
234
+ Return exactly an InferredSchema JSON object with keys:
235
+ - instructions (string)
236
+ - examples_summary (string)
237
+ - examples_instructions_alignment (string)
238
+ - object_spec (ObjectSpec: name, fields[list[FieldSpec]])
239
+ - inference_prompt (string)
240
+ Where each FieldSpec includes: name, type, description, optional enum_spec (for
241
+ enum / enum_array), optional object_spec (for object / object_array).
242
+ """.strip()
243
+
244
+
245
@dataclass(frozen=True)
class SchemaInferer:
    """High-level orchestrator for schema inference against the Responses API.

    Responsibilities:
        * Issue a structured parsing request with strict instructions.
        * Retry (up to ``max_retries``) when the produced field list violates
          baseline structural rules (duplicate names, unsupported types, etc.).
        * Return a fully validated ``SchemaInferenceOutput`` ready for dynamic
          model generation & downstream batch extraction.

    The inferred schema intentionally avoids JSON Schema intermediates; the
    authoritative contract is the recursive ``ObjectSpec`` tree.

    Attributes:
        client: OpenAI client for calling ``responses.parse``.
        model_name: Model / deployment identifier.
    """

    client: OpenAI
    model_name: str

    def infer_schema(self, data: SchemaInferenceInput, *args, max_retries: int = 8, **kwargs) -> SchemaInferenceOutput:
        """Infer a validated schema from representative examples.

        Workflow:
            1. Submit ``SchemaInferenceInput`` (JSON) + instructions via
               ``responses.parse`` requesting a ``SchemaInferenceOutput`` object.
            2. Attempt dynamic model build (``parsed.build_model()``) which performs
               recursive structural validation (names, types, enum/object specs).
            3. Retry (up to ``max_retries``) on validation failure, feeding the most
               recent validation errors back into the instructions.

        Args:
            data (SchemaInferenceInput): Representative examples + instructions.
            *args: Positional passthrough to ``client.responses.parse``.
            max_retries (int, optional): Attempts before surfacing the last
                validation error (must be >= 1). Defaults to 8.
            **kwargs: Keyword passthrough to ``client.responses.parse``.

        Returns:
            SchemaInferenceOutput: Fully validated schema (instructions, examples
                summary, object spec, extraction prompt).

        Raises:
            ValueError: Validation still fails after exhausting retries, or the
                final attempt produced no parsable output.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        last_err: Exception | None = None
        previous_errors: list[str] = []
        for attempt in range(max_retries):
            if attempt == 0:
                instructions = _INFER_INSTRUCTIONS
            else:
                # Provide structured feedback for correction. Keep concise and prohibit speculative expansion.
                feedback_lines = [
                    "--- PRIOR VALIDATION FEEDBACK ---",
                ]
                for i, err in enumerate(previous_errors[-5:], 1):  # include last up to 5 errors
                    feedback_lines.append(f"{i}. {err}")
                feedback_lines.extend(
                    [
                        "Adjust ONLY listed issues; avoid adding brand-new fields unless essential.",
                        "Don't hallucinate or broaden enum_values unless enum rule caused failure.",
                        "Duplicate names: minimally rename; keep semantics.",
                        "Unsupported type: change to string|integer|float|boolean (no new facts).",
                        "Bad enum length: drop enum or constrain to 2–24 evidenced tokens.",
                    ]
                )
                instructions = _INFER_INSTRUCTIONS + "\n\n" + "\n".join(feedback_lines)

            # Positional passthrough goes before the keyword arguments: identical
            # binding, but avoids the confusing `f(kw=..., *args)` call form.
            response: ParsedResponse[SchemaInferenceOutput] = self.client.responses.parse(
                *args,
                model=self.model_name,
                instructions=instructions,
                input=data.model_dump_json(),
                text_format=SchemaInferenceOutput,
                **kwargs,
            )
            parsed = response.output_parsed
            try:
                # output_parsed may be None (e.g. refusal / truncation); treat it
                # as a retryable validation failure instead of letting an
                # AttributeError escape the ValueError handler below.
                if parsed is None:
                    raise ValueError("Response contained no parsed SchemaInferenceOutput.")
                # Validate the field list structure
                parsed.build_model()
                return parsed
            except ValueError as e:
                last_err = e
                previous_errors.append(str(e))
                if attempt == max_retries - 1:
                    raise ValueError(
                        f"Schema validation failed after {max_retries} attempts. Last error: {last_err}"
                    ) from last_err

        if last_err:
            raise last_err
        raise RuntimeError("unreachable retry loop state")
@@ -0,0 +1,350 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from enum import Enum
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, Field, create_model
8
+
9
# Internal module: nothing is exported as public API.
__all__: list[str] = []

# Upper bound on the number of raw enum label values allowed in an EnumSpec
# (enforced by _build_model for both 'enum' and 'enum_array' fields).
_MAX_ENUM_VALUES = 24
12
+
13
+
14
class FieldSpec(BaseModel):
    """Specification of a single field within an (optionally nested) object schema.

    Declarative only: structural rules (name casing, the required enum_spec /
    object_spec pairing per declared type, enum size bounds) are enforced later
    by ``_build_model``, not by this model.
    """

    name: str = Field(
        description=(
            "Field name in lower_snake_case. Rules: (1) Use only lowercase letters, numbers, and underscores; "
            "must start with a letter. (2) For numeric quantities append an explicit unit (e.g. 'duration_seconds', "
            "'price_usd'). (3) Boolean fields use an affirmative 'is_' prefix (e.g. 'is_active'); avoid negative / "
            "ambiguous forms like 'is_deleted' (prefer 'is_active', 'is_enabled'). (4) Name must be unique within the "
            "containing object."
        )
    )
    type: Literal[
        "string",
        "integer",
        "float",
        "boolean",
        "enum",
        "object",
        "string_array",
        "integer_array",
        "float_array",
        "boolean_array",
        "enum_array",
        "object_array",
    ] = Field(
        description=(
            "Logical data type. Allowed values: string | integer | float | boolean | enum | object | string_array | "
            "integer_array | float_array | boolean_array | enum_array | object_array. *_array variants represent a "
            "homogeneous list of the base type. 'enum' / 'enum_array' require 'enum_spec'. 'object' / 'object_array' "
            "require 'object_spec'. Primitives must not define 'enum_spec' or 'object_spec'."
        )
    )
    description: str = Field(
        description=(
            "Human‑readable, concise explanation of the field's meaning and business intent. Should clarify units, "
            "value semantics, and any domain constraints not captured by type. 1–2 sentences; no implementation notes."
        )
    )
    # Forward references to EnumSpec / ObjectSpec (defined below) resolve lazily
    # thanks to `from __future__ import annotations` at module top.
    enum_spec: EnumSpec | None = Field(
        default=None,
        description=(
            "Enumeration specification for 'enum' / 'enum_array'. Must be provided (non-empty) for those types and "
            "omitted for all others. Maximum size enforced by constant."
        ),
    )
    object_spec: ObjectSpec | None = Field(
        default=None,
        description=(
            "Nested object schema. Required for 'object' / 'object_array'; must be omitted for every other type. The "
            "contained 'name' is used to derive the generated nested Pydantic model class name."
        ),
    )
65
+
66
+
67
class EnumSpec(BaseModel):
    """Enumeration specification for enum / enum_array field types.

    Attributes:
        name: Required Enum class name (UpperCamelCase). Must match ^[A-Z][A-Za-z0-9]*$. Previously optional; now
            explicit to remove implicit coupling to the field name and make schemas self‑describing.
        values: Raw label values (1–_MAX_ENUM_VALUES before de‑dup). Values are uppercased then
            de-duplicated using a set; ordering of generated Enum members is not guaranteed. Any
            casing variants collapse silently to a single member.
    """

    # Pattern compliance and size limits are enforced in _build_model, not here.
    name: str = Field(
        description=("Required Enum class name (UpperCamelCase). Valid pattern: ^[A-Z][A-Za-z0-9]*$."),
    )
    values: list[str] = Field(
        description=(
            f"Raw enum label values (1–{_MAX_ENUM_VALUES}). Uppercased then deduplicated; order of members "
            "not guaranteed."
        )
    )
87
+
88
+
89
class ObjectSpec(BaseModel):
    """Recursive object schema node: a class name plus an ordered field list.

    Serves as the root contract consumed by ``_build_model`` and may also appear
    nested via ``FieldSpec.object_spec`` for object / object_array fields.
    """

    name: str = Field(
        description=(
            "Object model class name in UpperCamelCase (singular noun). Must match ^[A-Z][A-Za-z0-9]*$ and is used "
            "directly as the generated Pydantic model class name (no transformation)."
        )
    )
    fields: list[FieldSpec] = Field(
        description=(
            "Non-empty list of FieldSpec definitions composing the object. Each field name must be unique; order is "
            "preserved in the generated model."
        )
    )
102
+
103
+
104
+ def _build_model(object_spec: ObjectSpec) -> type[BaseModel]:
105
+ lower_sname_pattern = re.compile(r"^[a-z][a-z0-9]*(?:_[a-z0-9]+)*$")
106
+ upper_camel_pattern = re.compile(r"^[A-Z][A-Za-z0-9]*$")
107
+ type_map: dict[str, type] = {
108
+ "string": str,
109
+ "integer": int,
110
+ "float": float,
111
+ "boolean": bool,
112
+ "string_array": list[str],
113
+ "integer_array": list[int],
114
+ "float_array": list[float],
115
+ "boolean_array": list[bool],
116
+ }
117
+ output_fields: dict[str, tuple[type, object]] = {}
118
+
119
+ field_names: list[str] = [field.name for field in object_spec.fields]
120
+
121
+ # Assert that names of fields are not duplicated
122
+ if len(field_names) != len(set(field_names)):
123
+ raise ValueError("Field names must be unique within the object spec.")
124
+
125
+ for field in object_spec.fields:
126
+ # Assert that field names are lower_snake_case
127
+ if not lower_sname_pattern.match(field.name):
128
+ raise ValueError(f"Field name '{field.name}' must be in lower_snake_case format (e.g., 'my_field_name').")
129
+
130
+ # (EnumSpec.name now mandatory; no need to derive a fallback name from the field.)
131
+ match field:
132
+ case FieldSpec(
133
+ name=name,
134
+ type="string"
135
+ | "integer"
136
+ | "float"
137
+ | "boolean"
138
+ | "string_array"
139
+ | "integer_array"
140
+ | "float_array"
141
+ | "boolean_array",
142
+ description=description,
143
+ enum_spec=None,
144
+ object_spec=None,
145
+ ):
146
+ field_type = type_map[field.type]
147
+ output_fields[name] = (field_type, Field(description=description))
148
+
149
+ case FieldSpec(name=name, type="enum", description=description, enum_spec=enum_spec, object_spec=None) if (
150
+ enum_spec
151
+ and 0 < len(enum_spec.values) <= _MAX_ENUM_VALUES
152
+ and upper_camel_pattern.match(enum_spec.name)
153
+ ):
154
+ member_names = list({v.upper() for v in enum_spec.values})
155
+ enum_type = Enum(enum_spec.name, member_names)
156
+ output_fields[name] = (enum_type, Field(description=description))
157
+
158
+ case FieldSpec(
159
+ name=name, type="enum_array", description=description, enum_spec=enum_spec, object_spec=None
160
+ ) if (
161
+ enum_spec
162
+ and 0 < len(enum_spec.values) <= _MAX_ENUM_VALUES
163
+ and upper_camel_pattern.match(enum_spec.name)
164
+ ):
165
+ member_names = list({v.upper() for v in enum_spec.values})
166
+ enum_type = Enum(enum_spec.name, member_names)
167
+ output_fields[name] = (list[enum_type], Field(description=description))
168
+
169
+ case FieldSpec(
170
+ name=name, type="object", description=description, enum_spec=None, object_spec=object_spec
171
+ ) if object_spec and upper_camel_pattern.match(object_spec.name):
172
+ nested_model = _build_model(object_spec)
173
+ output_fields[name] = (nested_model, Field(description=description))
174
+
175
+ case FieldSpec(
176
+ name=name, type="object_array", description=description, enum_spec=None, object_spec=object_spec
177
+ ) if object_spec and upper_camel_pattern.match(object_spec.name):
178
+ nested_model = _build_model(object_spec)
179
+ output_fields[name] = (list[nested_model], Field(description=description))
180
+
181
+ # ---- Error cases (explicit reasons) ----
182
+ # Enum type without enum_spec (None or empty)
183
+ case FieldSpec(
184
+ name=name,
185
+ type="enum",
186
+ enum_spec=enum_spec,
187
+ object_spec=None,
188
+ ) if not enum_spec or not enum_spec.values:
189
+ raise ValueError(f"Field '{name}': enum type requires non-empty enum_spec values list.")
190
+ # Enum type exceeding max length
191
+ case FieldSpec(
192
+ name=name,
193
+ type="enum",
194
+ enum_spec=enum_spec,
195
+ object_spec=None,
196
+ ) if enum_spec and len(enum_spec.values) > _MAX_ENUM_VALUES:
197
+ raise ValueError(
198
+ (
199
+ f"Field '{name}': enum type supports at most {_MAX_ENUM_VALUES} enum_spec values "
200
+ f"(got {len(enum_spec.values)})."
201
+ )
202
+ )
203
+ # Enum type invalid explicit name pattern
204
+ case FieldSpec(
205
+ name=name,
206
+ type="enum",
207
+ enum_spec=enum_spec,
208
+ object_spec=None,
209
+ ) if enum_spec and not upper_camel_pattern.match(enum_spec.name):
210
+ raise ValueError(
211
+ (f"Field '{name}': enum_spec.name '{enum_spec.name}' invalid – must match ^[A-Z][A-Za-z0-9]*$")
212
+ )
213
+ # Enum type incorrectly provides an object_spec
214
+ case FieldSpec(
215
+ name=name,
216
+ type="enum",
217
+ enum_spec=enum_spec,
218
+ object_spec=object_spec,
219
+ ) if object_spec is not None:
220
+ raise ValueError(
221
+ f"Field '{name}': enum type must not provide object_spec (got object_spec={object_spec!r})."
222
+ )
223
+ # Enum array type without enum_spec
224
+ case FieldSpec(
225
+ name=name,
226
+ type="enum_array",
227
+ enum_spec=enum_spec,
228
+ object_spec=None,
229
+ ) if not enum_spec or not enum_spec.values:
230
+ raise ValueError(f"Field '{name}': enum_array type requires non-empty enum_spec values list.")
231
+ # Enum array type exceeding max length
232
+ case FieldSpec(
233
+ name=name,
234
+ type="enum_array",
235
+ enum_spec=enum_spec,
236
+ object_spec=None,
237
+ ) if enum_spec and len(enum_spec.values) > _MAX_ENUM_VALUES:
238
+ raise ValueError(
239
+ (
240
+ f"Field '{name}': enum_array type supports at most {_MAX_ENUM_VALUES} enum_spec values "
241
+ f"(got {len(enum_spec.values)})."
242
+ )
243
+ )
244
+ # Enum array type invalid explicit name pattern
245
+ case FieldSpec(
246
+ name=name,
247
+ type="enum_array",
248
+ enum_spec=enum_spec,
249
+ object_spec=None,
250
+ ) if enum_spec and not upper_camel_pattern.match(enum_spec.name):
251
+ raise ValueError(
252
+ (f"Field '{name}': enum_spec.name '{enum_spec.name}' invalid – must match ^[A-Z][A-Za-z0-9]*$")
253
+ )
254
+ # Enum array type incorrectly provides an object_spec
255
+ case FieldSpec(
256
+ name=name,
257
+ type="enum_array",
258
+ enum_spec=enum_spec,
259
+ object_spec=object_spec,
260
+ ) if object_spec is not None:
261
+ raise ValueError(
262
+ f"Field '{name}': enum_array type must not provide object_spec (got object_spec={object_spec!r})."
263
+ )
264
+ # Object type missing object_spec
265
+ case FieldSpec(
266
+ name=name,
267
+ type="object",
268
+ enum_spec=enum_spec,
269
+ object_spec=None,
270
+ ):
271
+ raise ValueError(f"Field '{name}': object type requires object_spec (got object_spec=None).")
272
+ # Object array type missing object_spec
273
+ case FieldSpec(
274
+ name=name,
275
+ type="object_array",
276
+ enum_spec=enum_spec,
277
+ object_spec=None,
278
+ ):
279
+ raise ValueError(f"Field '{name}': object_array type requires object_spec (got object_spec=None).")
280
+ # Object/object_array provided but invalid name pattern
281
+ case FieldSpec(
282
+ name=name,
283
+ type="object" | "object_array",
284
+ enum_spec=enum_spec,
285
+ object_spec=object_spec,
286
+ ) if object_spec is not None and not upper_camel_pattern.match(object_spec.name):
287
+ raise ValueError(
288
+ (
289
+ f"Field '{name}': object_spec.name '{object_spec.name}' must be UpperCamelCase "
290
+ "(regex ^[A-Z][A-Za-z0-9]*$) and contain only letters and digits."
291
+ )
292
+ )
293
+ # Object/object_array types must not provide enum_spec
294
+ case FieldSpec(
295
+ name=name,
296
+ type="object" | "object_array",
297
+ enum_spec=enum_spec,
298
+ object_spec=object_spec,
299
+ ) if enum_spec is not None:
300
+ raise ValueError(
301
+ f"Field '{name}': {field.type} must not define enum_spec (got enum_spec={enum_spec!r})."
302
+ )
303
+ # Primitive / simple array types must not have enum_spec
304
+ case FieldSpec(
305
+ name=name,
306
+ type="string"
307
+ | "integer"
308
+ | "float"
309
+ | "boolean"
310
+ | "string_array"
311
+ | "integer_array"
312
+ | "float_array"
313
+ | "boolean_array",
314
+ enum_spec=enum_spec,
315
+ object_spec=object_spec,
316
+ ) if enum_spec is not None:
317
+ raise ValueError(
318
+ (f"Field '{name}': type '{field.type}' must not define enum_spec (got enum_spec={enum_spec!r}).")
319
+ )
320
+ # Primitive / simple array types must not have object_spec
321
+ case FieldSpec(
322
+ name=name,
323
+ type="string"
324
+ | "integer"
325
+ | "float"
326
+ | "boolean"
327
+ | "string_array"
328
+ | "integer_array"
329
+ | "float_array"
330
+ | "boolean_array",
331
+ enum_spec=None,
332
+ object_spec=object_spec,
333
+ ) if object_spec is not None:
334
+ raise ValueError(
335
+ (
336
+ f"Field '{name}': type '{field.type}' must not define object_spec "
337
+ f"(got object_spec={object_spec!r})."
338
+ )
339
+ )
340
+ # Any other unmatched combination
341
+ case FieldSpec() as f:
342
+ raise ValueError(
343
+ (
344
+ "Field configuration invalid / unrecognized combination: "
345
+ f"name={f.name!r}, type={f.type!r}, enum_spec={'set' if f.enum_spec else None}, "
346
+ f"object_spec={'set' if f.object_spec else None}."
347
+ )
348
+ )
349
+
350
+ return create_model(object_spec.name, **output_fields)