openaivec 0.14.12__py3-none-any.whl → 0.14.13__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- openaivec/_embeddings.py +17 -4
- openaivec/_model.py +7 -12
- openaivec/_prompt.py +3 -6
- openaivec/_responses.py +39 -117
- openaivec/_schema.py +27 -23
- openaivec/pandas_ext.py +355 -343
- openaivec/spark.py +32 -39
- openaivec/task/__init__.py +1 -1
- openaivec/task/customer_support/customer_sentiment.py +4 -9
- openaivec/task/customer_support/inquiry_classification.py +5 -8
- openaivec/task/customer_support/inquiry_summary.py +5 -6
- openaivec/task/customer_support/intent_analysis.py +5 -7
- openaivec/task/customer_support/response_suggestion.py +5 -8
- openaivec/task/customer_support/urgency_analysis.py +5 -8
- openaivec/task/nlp/dependency_parsing.py +1 -2
- openaivec/task/nlp/keyword_extraction.py +1 -2
- openaivec/task/nlp/morphological_analysis.py +1 -2
- openaivec/task/nlp/named_entity_recognition.py +1 -2
- openaivec/task/nlp/sentiment_analysis.py +1 -2
- openaivec/task/nlp/translation.py +1 -1
- openaivec/task/table/fillna.py +8 -3
- {openaivec-0.14.12.dist-info → openaivec-0.14.13.dist-info}/METADATA +1 -1
- openaivec-0.14.13.dist-info/RECORD +37 -0
- openaivec-0.14.12.dist-info/RECORD +0 -37
- {openaivec-0.14.12.dist-info → openaivec-0.14.13.dist-info}/WHEEL +0 -0
- {openaivec-0.14.12.dist-info → openaivec-0.14.13.dist-info}/licenses/LICENSE +0 -0
openaivec/_schema.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Internal schema inference & dynamic model materialization utilities.
|
|
2
2
|
|
|
3
3
|
This (non-public) module converts a small *representative* sample of free‑text
|
|
4
|
-
examples plus
|
|
4
|
+
examples plus an *instructions* statement into:
|
|
5
5
|
|
|
6
6
|
1. A vetted hierarchical object specification (``ObjectSpec``) whose recursively
|
|
7
7
|
defined ``fields`` (``FieldSpec``) capture reliably extractable signals.
|
|
@@ -45,7 +45,7 @@ Example (conceptual):
|
|
|
45
45
|
schema = inferer.infer_schema(
|
|
46
46
|
SchemaInferenceInput(
|
|
47
47
|
examples=["Order #123 delayed due to weather", "Order #456 delivered"],
|
|
48
|
-
|
|
48
|
+
instructions="Extract operational status signals for logistics analytics",
|
|
49
49
|
)
|
|
50
50
|
)
|
|
51
51
|
Model = schema.model # dynamic Pydantic model
|
|
@@ -71,16 +71,16 @@ __all__: list[str] = []
|
|
|
71
71
|
class InferredSchema(BaseModel):
|
|
72
72
|
"""Result of a schema inference round.
|
|
73
73
|
|
|
74
|
-
Contains the normalized *
|
|
74
|
+
Contains the normalized *instructions*, objective *examples_summary*, the root
|
|
75
75
|
hierarchical ``object_spec`` contract, and the canonical reusable
|
|
76
76
|
``inference_prompt``. The prompt MUST be fully derivable from the other
|
|
77
77
|
components (no new unstated facts) to preserve traceability.
|
|
78
78
|
|
|
79
79
|
Attributes:
|
|
80
|
-
|
|
80
|
+
instructions: Unambiguous restatement of the user's objective.
|
|
81
81
|
examples_summary: Neutral description of structural / semantic patterns
|
|
82
82
|
observed in the examples.
|
|
83
|
-
|
|
83
|
+
examples_instructions_alignment: Mapping from instructions facets to concrete
|
|
84
84
|
recurring evidence (or explicit gaps) anchoring extraction scope.
|
|
85
85
|
object_spec: Root ``ObjectSpec`` (UpperCamelCase name) whose ``fields``
|
|
86
86
|
recursively define the extraction schema.
|
|
@@ -88,7 +88,7 @@ class InferredSchema(BaseModel):
|
|
|
88
88
|
hierarchy, and types (no additions/removals/renames).
|
|
89
89
|
"""
|
|
90
90
|
|
|
91
|
-
|
|
91
|
+
instructions: str = Field(
|
|
92
92
|
description=(
|
|
93
93
|
"Normalized, unambiguous restatement of the user objective with redundant, vague, or "
|
|
94
94
|
"conflicting phrasing removed."
|
|
@@ -100,24 +100,25 @@ class InferredSchema(BaseModel):
|
|
|
100
100
|
"patterns, and notable constraints."
|
|
101
101
|
)
|
|
102
102
|
)
|
|
103
|
-
|
|
103
|
+
examples_instructions_alignment: str = Field(
|
|
104
104
|
description=(
|
|
105
105
|
"Explanation of how observable recurring patterns in the examples substantiate and bound the stated "
|
|
106
|
-
"
|
|
107
|
-
"reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream
|
|
106
|
+
"instructions. Should reference instructions facets and cite supporting example evidence (or note any "
|
|
107
|
+
"gaps) to reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream "
|
|
108
|
+
"extraction."
|
|
108
109
|
)
|
|
109
110
|
)
|
|
110
111
|
object_spec: ObjectSpec = Field(
|
|
111
112
|
description=(
|
|
112
113
|
"Root ObjectSpec (recursive). Each contained object's field list is unique-name ordered and derived "
|
|
113
|
-
"strictly from observable, repeatable signals aligned with the
|
|
114
|
+
"strictly from observable, repeatable signals aligned with the instructions."
|
|
114
115
|
)
|
|
115
116
|
)
|
|
116
117
|
inference_prompt: str = Field(
|
|
117
118
|
description=(
|
|
118
|
-
"Canonical, reusable extraction prompt. Must be derivable from
|
|
119
|
-
"exact hierarchical field set (names, order per object, types) forbidding additions, removals,
|
|
120
|
-
"subjective language. Self-contained (no TODOs, external refs, or placeholders)."
|
|
119
|
+
"Canonical, reusable extraction prompt. Must be derivable from instructions + summaries + object_spec. "
|
|
120
|
+
"Enforces exact hierarchical field set (names, order per object, types) forbidding additions, removals, "
|
|
121
|
+
"renames, or subjective language. Self-contained (no TODOs, external refs, or placeholders)."
|
|
121
122
|
)
|
|
122
123
|
)
|
|
123
124
|
|
|
@@ -153,7 +154,9 @@ class InferredSchema(BaseModel):
|
|
|
153
154
|
PreparedTask: Ready for batched structured extraction calls.
|
|
154
155
|
"""
|
|
155
156
|
return PreparedTask(
|
|
156
|
-
instructions=self.inference_prompt,
|
|
157
|
+
instructions=self.inference_prompt,
|
|
158
|
+
response_format=self.model,
|
|
159
|
+
api_kwargs={"top_p": None, "temperature": None},
|
|
157
160
|
)
|
|
158
161
|
|
|
159
162
|
def build_model(self) -> type[BaseModel]:
|
|
@@ -176,7 +179,7 @@ class SchemaInferenceInput(BaseModel):
|
|
|
176
179
|
examples: Representative sample texts restricted to the in‑scope
|
|
177
180
|
distribution (exclude outliers / noise). Size should be *minimal*
|
|
178
181
|
yet sufficient to surface recurring patterns.
|
|
179
|
-
|
|
182
|
+
instructions: Plain language description of downstream usage (analytics,
|
|
180
183
|
filtering, enrichment, feature engineering, etc.). Guides field
|
|
181
184
|
relevance & exclusion of outcome labels.
|
|
182
185
|
"""
|
|
@@ -187,7 +190,7 @@ class SchemaInferenceInput(BaseModel):
|
|
|
187
190
|
"exclude outliers not in scope."
|
|
188
191
|
)
|
|
189
192
|
)
|
|
190
|
-
|
|
193
|
+
instructions: str = Field(
|
|
191
194
|
description=(
|
|
192
195
|
"Plain language statement describing the downstream use of the extracted structured data (e.g. "
|
|
193
196
|
"analytics, filtering, enrichment)."
|
|
@@ -199,15 +202,16 @@ _INFER_INSTRUCTIONS = """
|
|
|
199
202
|
You are a schema inference engine.
|
|
200
203
|
|
|
201
204
|
Task:
|
|
202
|
-
1. Normalize the user's
|
|
205
|
+
1. Normalize the user's instructions (eliminate ambiguity, redundancy, contradictions).
|
|
203
206
|
2. Objectively summarize observable patterns in the example texts.
|
|
204
|
-
3. Produce an "
|
|
207
|
+
3. Produce an "examples_instructions_alignment" explanation mapping instructions facets to concrete recurring
|
|
208
|
+
evidence (or gaps).
|
|
205
209
|
4. Propose a minimal hierarchical schema (root ObjectSpec) comprised of reliably extractable fields. Use nesting ONLY
|
|
206
210
|
when a group of fields forms a cohesive sub-entity repeated in the data; otherwise keep flat.
|
|
207
211
|
5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
|
|
208
212
|
6. Provide enum_spec ONLY when a small stable closed categorical set (1–{_MAX_ENUM_VALUES} raw tokens) is clearly
|
|
209
213
|
evidenced; never invent unseen categories.
|
|
210
|
-
7. If the
|
|
214
|
+
7. If the instructions indicate prediction (predict / probability / likelihood),
|
|
211
215
|
output only explanatory features (no target restatement).
|
|
212
216
|
|
|
213
217
|
Rules:
|
|
@@ -229,9 +233,9 @@ Rules:
|
|
|
229
233
|
|
|
230
234
|
Output contract:
|
|
231
235
|
Return exactly an InferredSchema JSON object with keys:
|
|
232
|
-
-
|
|
236
|
+
- instructions (string)
|
|
233
237
|
- examples_summary (string)
|
|
234
|
-
-
|
|
238
|
+
- examples_instructions_alignment (string)
|
|
235
239
|
- object_spec (ObjectSpec: name, fields[list[FieldSpec]])
|
|
236
240
|
- inference_prompt (string)
|
|
237
241
|
Where each FieldSpec includes: name, type, description, optional enum_spec (for
|
|
@@ -272,14 +276,14 @@ class SchemaInferer:
|
|
|
272
276
|
3. Retry (up to ``max_retries``) on validation failure.
|
|
273
277
|
|
|
274
278
|
Args:
|
|
275
|
-
data (SchemaInferenceInput): Representative examples +
|
|
279
|
+
data (SchemaInferenceInput): Representative examples + instructions.
|
|
276
280
|
*args: Positional passthrough to ``client.responses.parse``.
|
|
277
281
|
max_retries (int, optional): Attempts before surfacing the last validation error
|
|
278
282
|
(must be >= 1). Defaults to 3.
|
|
279
283
|
**kwargs: Keyword passthrough to ``client.responses.parse``.
|
|
280
284
|
|
|
281
285
|
Returns:
|
|
282
|
-
InferredSchema: Fully validated schema (
|
|
286
|
+
InferredSchema: Fully validated schema (instructions, examples summary,
|
|
283
287
|
ordered fields, extraction prompt).
|
|
284
288
|
|
|
285
289
|
Raises:
|