openaivec 0.14.6__py3-none-any.whl → 0.14.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_di.py +10 -9
- openaivec/_embeddings.py +12 -13
- openaivec/_log.py +1 -1
- openaivec/_model.py +3 -3
- openaivec/_optimize.py +3 -4
- openaivec/_prompt.py +4 -5
- openaivec/_proxy.py +34 -35
- openaivec/_responses.py +29 -29
- openaivec/_schema.py +80 -20
- openaivec/_serialize.py +19 -15
- openaivec/_util.py +9 -8
- openaivec/pandas_ext.py +20 -19
- openaivec/spark.py +11 -10
- openaivec/task/customer_support/customer_sentiment.py +2 -2
- openaivec/task/customer_support/inquiry_classification.py +8 -8
- openaivec/task/customer_support/inquiry_summary.py +4 -4
- openaivec/task/customer_support/intent_analysis.py +5 -5
- openaivec/task/customer_support/response_suggestion.py +4 -4
- openaivec/task/customer_support/urgency_analysis.py +9 -9
- openaivec/task/nlp/dependency_parsing.py +2 -4
- openaivec/task/nlp/keyword_extraction.py +3 -5
- openaivec/task/nlp/morphological_analysis.py +4 -6
- openaivec/task/nlp/named_entity_recognition.py +7 -9
- openaivec/task/nlp/sentiment_analysis.py +3 -3
- openaivec/task/nlp/translation.py +1 -2
- openaivec/task/table/fillna.py +2 -3
- {openaivec-0.14.6.dist-info → openaivec-0.14.8.dist-info}/METADATA +1 -1
- openaivec-0.14.8.dist-info/RECORD +36 -0
- openaivec-0.14.6.dist-info/RECORD +0 -36
- {openaivec-0.14.6.dist-info → openaivec-0.14.8.dist-info}/WHEEL +0 -0
- {openaivec-0.14.6.dist-info → openaivec-0.14.8.dist-info}/licenses/LICENSE +0 -0
openaivec/_responses.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import warnings
 from dataclasses import dataclass, field
 from logging import Logger, getLogger
-from typing import Any, Generic,
+from typing import Any, Generic, cast
 
 from openai import AsyncOpenAI, BadRequestError, InternalServerError, OpenAI, RateLimitError
 from openai.types.responses import ParsedResponse
@@ -120,11 +120,11 @@ class Message(BaseModel, Generic[ResponseFormat]):
 
 
 class Request(BaseModel):
-    user_messages:
+    user_messages: list[Message[str]]
 
 
 class Response(BaseModel, Generic[ResponseFormat]):
-    assistant_messages:
+    assistant_messages: list[Message[ResponseFormat]]
 
 
 @dataclass(frozen=True)
@@ -150,7 +150,7 @@ class BatchResponses(Generic[ResponseFormat]):
         system_message (str): System prompt prepended to every request.
         temperature (float): Sampling temperature.
         top_p (float): Nucleus‑sampling parameter.
-        response_format (
+        response_format (type[ResponseFormat]): Expected Pydantic model class or ``str`` for each assistant message.
         cache (BatchingMapProxy[str, ResponseFormat]): Order‑preserving batching proxy with de‑duplication and caching.
 
     Notes:
@@ -165,7 +165,7 @@ class BatchResponses(Generic[ResponseFormat]):
     system_message: str
     temperature: float | None = None
     top_p: float = 1.0
-    response_format:
+    response_format: type[ResponseFormat] = str  # type: ignore[assignment]
     cache: BatchingMapProxy[str, ResponseFormat] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))
     _vectorized_system_message: str = field(init=False)
     _model_json_schema: dict = field(init=False)
@@ -178,7 +178,7 @@ class BatchResponses(Generic[ResponseFormat]):
         system_message: str,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
-        response_format:
+        response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
     ) -> "BatchResponses":
         """Factory constructor.
@@ -189,7 +189,7 @@ class BatchResponses(Generic[ResponseFormat]):
             system_message (str): System prompt for the model.
             temperature (float, optional): Sampling temperature. Defaults to 0.0.
             top_p (float, optional): Nucleus sampling parameter. Defaults to 1.0.
-            response_format (
+            response_format (type[ResponseFormat], optional): Expected output type. Defaults to ``str``.
             batch_size (int | None, optional): Max unique prompts per API call. Defaults to None
                 (automatic batch size optimization). Set to a positive integer for fixed batch size.
@@ -242,12 +242,12 @@ class BatchResponses(Generic[ResponseFormat]):
     @observe(_LOGGER)
     @backoff(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
     def _request_llm(
-        self, user_messages:
+        self, user_messages: list[Message[str]], **extra_api_params: Any
     ) -> ParsedResponse[Response[ResponseFormat]]:
         """Make a single call to the OpenAI JSON‑mode endpoint.
 
         Args:
-            user_messages (
+            user_messages (list[Message[str]]): Sequence of ``Message[str]`` representing the
                 prompts for this minibatch. Each message carries a unique `id`
                 so we can restore ordering later.
@@ -265,7 +265,7 @@ class BatchResponses(Generic[ResponseFormat]):
             body: response_format  # type: ignore
 
         class ResponseT(BaseModel):
-            assistant_messages:
+            assistant_messages: list[MessageT]
 
         # Build base API parameters (cannot be overridden by caller)
         api_params: dict[str, Any] = {
@@ -300,7 +300,7 @@ class BatchResponses(Generic[ResponseFormat]):
         return cast(ParsedResponse[Response[ResponseFormat]], completion)
 
     @observe(_LOGGER)
-    def _predict_chunk(self, user_messages:
+    def _predict_chunk(self, user_messages: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Helper executed for every unique minibatch.
 
         This method:
@@ -316,11 +316,11 @@ class BatchResponses(Generic[ResponseFormat]):
         if not responses.output_parsed:
             return [None] * len(messages)
         response_dict = {message.id: message.body for message in responses.output_parsed.assistant_messages}
-        sorted_responses:
+        sorted_responses: list[ResponseFormat | None] = [response_dict.get(m.id, None) for m in messages]
         return sorted_responses
 
     @observe(_LOGGER)
-    def parse(self, inputs:
+    def parse(self, inputs: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Batched predict.
 
         Accepts arbitrary keyword arguments that are forwarded to the underlying
@@ -329,16 +329,16 @@ class BatchResponses(Generic[ResponseFormat]):
         configured values but can be overridden explicitly.
 
         Args:
-            inputs (
+            inputs (list[str]): Prompts that require responses. Duplicates are de‑duplicated.
             **api_kwargs: Extra keyword args forwarded to the OpenAI Responses API.
 
         Returns:
-
+            list[ResponseFormat | None]: Assistant responses aligned to ``inputs``.
         """
         if not api_kwargs:
             return self.cache.map(inputs, self._predict_chunk)  # type: ignore[return-value]
 
-        def _predict_with(xs:
+        def _predict_with(xs: list[str]) -> list[ResponseFormat | None]:
             return self._predict_chunk(xs, **api_kwargs)
 
         return self.cache.map(inputs, _predict_with)  # type: ignore[return-value]
@@ -385,7 +385,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         system_message (str): System prompt prepended to every request.
         temperature (float): Sampling temperature.
         top_p (float): Nucleus‑sampling parameter.
-        response_format (
+        response_format (type[ResponseFormat]): Expected Pydantic model class or ``str`` for each assistant message.
         cache (AsyncBatchingMapProxy[str, ResponseFormat]): Async batching proxy with de‑duplication
             and concurrency control.
     """
@@ -395,7 +395,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
     system_message: str
     temperature: float | None = 0.0
     top_p: float = 1.0
-    response_format:
+    response_format: type[ResponseFormat] = str  # type: ignore[assignment]
     cache: AsyncBatchingMapProxy[str, ResponseFormat] = field(
         default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
     )
@@ -410,7 +410,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         system_message: str,
         temperature: float | None = None,
         top_p: float = 1.0,
-        response_format:
+        response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
         max_concurrency: int = 8,
     ) -> "AsyncBatchResponses":
@@ -422,7 +422,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             system_message (str): System prompt.
             temperature (float, optional): Sampling temperature. Defaults to 0.0.
             top_p (float, optional): Nucleus sampling parameter. Defaults to 1.0.
-            response_format (
+            response_format (type[ResponseFormat], optional): Expected output type. Defaults to ``str``.
             batch_size (int | None, optional): Max unique prompts per API call. Defaults to None
                 (automatic batch size optimization). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
@@ -482,12 +482,12 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
     @backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
     @observe(_LOGGER)
     async def _request_llm(
-        self, user_messages:
+        self, user_messages: list[Message[str]], **extra_api_params: Any
     ) -> ParsedResponse[Response[ResponseFormat]]:
         """Make a single async call to the OpenAI JSON‑mode endpoint.
 
         Args:
-            user_messages (
+            user_messages (list[Message[str]]): Sequence of ``Message[str]`` representing the minibatch prompts.
 
         Returns:
             ParsedResponse[Response[ResponseFormat]]: Parsed response with assistant messages (arbitrary order).
@@ -502,7 +502,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             body: response_format  # type: ignore
 
         class ResponseT(BaseModel):
-            assistant_messages:
+            assistant_messages: list[MessageT]
 
         # Build base API parameters (cannot be overridden by caller)
         api_params: dict[str, Any] = {
@@ -537,7 +537,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         return cast(ParsedResponse[Response[ResponseFormat]], completion)
 
     @observe(_LOGGER)
-    async def _predict_chunk(self, user_messages:
+    async def _predict_chunk(self, user_messages: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Async helper executed for every unique minibatch.
 
         This method:
@@ -553,11 +553,11 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             return [None] * len(messages)
         response_dict = {message.id: message.body for message in responses.output_parsed.assistant_messages}
         # Ensure proper handling for missing IDs - this shouldn't happen in normal operation
-        sorted_responses:
+        sorted_responses: list[ResponseFormat | None] = [response_dict.get(m.id, None) for m in messages]
         return sorted_responses
 
     @observe(_LOGGER)
-    async def parse(self, inputs:
+    async def parse(self, inputs: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Batched predict (async).
 
         Accepts arbitrary keyword arguments forwarded to ``AsyncOpenAI.responses.parse``.
@@ -566,16 +566,16 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         changing the public surface again.
 
         Args:
-            inputs (
+            inputs (list[str]): Prompts that require responses. Duplicates are de‑duplicated.
             **api_kwargs: Extra keyword args for the OpenAI Responses API.
 
         Returns:
-
+            list[ResponseFormat | None]: Assistant responses aligned to ``inputs``.
         """
         if not api_kwargs:
            return await self.cache.map(inputs, self._predict_chunk)  # type: ignore[return-value]
 
-        async def _predict_with(xs:
+        async def _predict_with(xs: list[str]) -> list[ResponseFormat | None]:
             return await self._predict_chunk(xs, **api_kwargs)
 
         return await self.cache.map(inputs, _predict_with)  # type: ignore[return-value]
```
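The heart of this surface is the id-keyed batching idiom: each request carries a `Message` with a unique `id` so that `_predict_chunk` can realign out-of-order (or missing) assistant messages. A minimal, self-contained sketch of that realignment, using a simplified non-generic `Message` model for illustration:

```python
# Sketch of the order-restoration idiom from _predict_chunk (simplified;
# the real Message is generic over its body type).
from pydantic import BaseModel


class Message(BaseModel):
    id: int
    body: str


requests = [Message(id=i, body=text) for i, text in enumerate(["a", "b", "c"])]
# Assistant messages may arrive in arbitrary order; here id=1 is also missing.
responses = [Message(id=2, body="C"), Message(id=0, body="A")]

response_dict = {m.id: m.body for m in responses}
# Missing ids fall back to None, keeping the output aligned with the input.
sorted_responses: list[str | None] = [response_dict.get(m.id) for m in requests]
assert sorted_responses == ["A", None, "C"]
```

This is why `parse` can promise `list[ResponseFormat | None]` aligned to `inputs` even though the API returns assistant messages in arbitrary order.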
openaivec/_schema.py
CHANGED
```diff
@@ -25,8 +25,11 @@ This module is intentionally **internal** (``__all__ = []``). Public users
 should interact through higher‑level batch APIs once a schema has been inferred.
 
 Design constraints:
-* Flat schema only (no
-
+* Flat schema only (no nested objects). Top-level arrays permitted ONLY as homogeneous arrays of primitives
+  (e.g. array of strings) – represented via specialized primitive array type names
+  (string_array, integer_array, float_array, boolean_array).
+* Primitive scalar types limited to {string, integer, float, boolean}; optional array variants
+  {string_array, integer_array, float_array, boolean_array}.
 * Optional enumerations for *closed*, *observed* categorical sets only.
 * Validation retries ensure a structurally coherent suggestion before returning.
 
@@ -49,7 +52,7 @@ authoritative contract is the ordered list of ``FieldSpec`` instances.
 
 from dataclasses import dataclass
 from enum import Enum
-from typing import
+from typing import Literal
 
 from openai import OpenAI
 from openai.types.responses import ParsedResponse
@@ -87,24 +90,44 @@ class FieldSpec(BaseModel):
     name: str = Field(
         description=(
             "Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
-            "express the semantic meaning succinctly (no adjectives like 'best', 'great')."
+            "express the semantic meaning succinctly (no adjectives like 'best', 'great'). For numeric (integer|float) "
+            "fields the name MUST include an explicit unit or measure suffix (e.g. _count, _total_count, "
+            "_duration_seconds, _ms, _price_usd, _ratio, _score) to eliminate ambiguity. Avoid bare numeric nouns like "
+            "'duration' or 'value' without unit/scale. Boolean field names MUST begin with 'is_' followed by a "
+            "descriptive predicate (e.g. is_active, is_delayed). Use positive forms (is_active) rather than "
+            "negated forms (is_not_active)."
         )
     )
-    type: Literal[
+    type: Literal[
+        "string",
+        "integer",
+        "float",
+        "boolean",
+        "string_array",
+        "integer_array",
+        "float_array",
+        "boolean_array",
+    ] = Field(
         description=(
             "Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
             "Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
             "explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
-            "
+            "Array variants (string_array, integer_array, float_array, boolean_array) are ONLY allowed when the value "
+            "is a repeatable homogeneous collection whose individual elements would otherwise stand as valid scalar "
+            "extractions (e.g. keywords, error_codes, tag_ids). Do not encode objects or mixed-type arrays; flatten or "
+            "choose the most informative level."
         )
     )
     description: str = Field(
         description=(
             "Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
-            "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction."
+            "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction. "
+            "Do NOT simply restate an original JSON/key name if the examples are already structured; only include a "
+            "raw key verbatim when it is already the minimal, irreducible analytic unit. For derived fields, clearly "
+            "state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
         )
     )
-    enum_values:
+    enum_values: list[str] | None = Field(
         default=None,
         description=(
             "Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
@@ -160,7 +183,7 @@ class InferredSchema(BaseModel):
             "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
         )
     )
-    fields:
+    fields: list[FieldSpec] = Field(
         description=(
             "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
             "examples and aligned with the purpose."
@@ -191,13 +214,13 @@ class InferredSchema(BaseModel):
             return cls.model_validate_json(f.read())
 
     @property
-    def model(self) ->
+    def model(self) -> type[BaseModel]:
         """Dynamically materialized Pydantic model for the inferred schema.
 
         Equivalent to calling :meth:`build_model` each access (not cached).
 
         Returns:
-
+            type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
         """
         return self.build_model()
 
@@ -212,7 +235,7 @@ class InferredSchema(BaseModel):
             instructions=self.inference_prompt, response_format=self.model, top_p=None, temperature=None
         )
 
-    def build_model(self) ->
+    def build_model(self) -> type[BaseModel]:
         """Create a new dynamic ``BaseModel`` class adhering to this schema.
 
         Implementation details:
@@ -223,9 +246,14 @@ class InferredSchema(BaseModel):
             introduced later by modifying this logic if needed.
 
         Returns:
-
+            type[BaseModel]: New (not cached) model type; order matches ``fields``.
         """
-        type_map: dict[str, type] = {
+        type_map: dict[str, type] = {
+            "string": str,
+            "integer": int,
+            "float": float,
+            "boolean": bool,
+        }
         fields: dict[str, tuple[type, object]] = {}
 
         for spec in self.fields:
@@ -246,7 +274,11 @@ class InferredSchema(BaseModel):
                 enum_cls = Enum(enum_class_name, members)  # type: ignore[arg-type]
                 py_type = enum_cls
             else:
-
+                if spec.type.endswith("_array"):
+                    base = spec.type.rsplit("_", 1)[0]
+                    py_type = list[type_map[base]]  # type: ignore[index]
+                else:
+                    py_type = type_map[spec.type]
             fields[spec.name] = (py_type, Field(description=spec.description))
 
         model = create_model("InferredSchema", **fields)  # type: ignore[call-arg]
@@ -274,7 +306,7 @@ class SchemaInferenceInput(BaseModel):
     relevance & exclusion of outcome labels.
     """
 
-    examples:
+    examples: list[str] = Field(
         description=(
             "Representative sample texts (strings). Provide only data the schema should generalize over; "
             "exclude outliers not in scope."
@@ -298,7 +330,8 @@ Task:
    to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
    sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
    This MUST NOT introduce new domain facts beyond the examples & purpose.
-4. Propose a minimal flat set of scalar fields (
+4. Propose a minimal flat set of scalar fields (and ONLY when justified,
+   homogeneous primitive arrays) that are reliably extractable.
 5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
    is clearly evidenced; never invent.
@@ -312,11 +345,29 @@ Rules:
   * float = any decimals / ratios
   * boolean = explicit binary
   * else use string
-
+- Numeric (integer|float) field names MUST encode an explicit unit / scale / measure suffix
+  (e.g. *_count, *_seconds, *_ms, *_usd, *_ratio, *_score). Avoid ambiguous bare numeric names.
+- Boolean field names MUST start with 'is_' followed by a positive predicate (e.g. is_active,
+  is_delayed). Avoid negated forms.
+- No nested objects or mixed-type arrays. Homogeneous primitive arrays are allowed ONLY if each element is an atomic
+  scalar signal (use *_array types: string_array, integer_array, float_array, boolean_array). The array is expected to
+  contain 0..N such elements per record.
+- Array field names MUST end with '_array' (e.g. keywords_array, tag_ids_array). Do not use plural-only forms
+  (e.g. keywords) for arrays; the suffix makes container semantics explicit.
 - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
 - enum_values only for string fields with stable closed vocab; omit otherwise.
 - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
   in predictive / feature engineering contexts.
+- When examples already appear as serialized JSON / key-value records, DO NOT merely relist the
+  raw original keys unless each is already an atomic, irreducible analytic signal. Prefer high-signal
+  derived / normalized / aggregated features (e.g. sentiment, category, language_code, boolean flags,
+  normalized_date, count metrics).
+- Superficial renames (adding trivial prefixes/suffixes like _value, _field, new_) are forbidden; a new
+  field name must reflect a semantic transformation.
+- Keep field count focused (typically <= 12) prioritizing reusable analytical / ML features over low-signal
+  restatements.
+- If you retain an original raw key unchanged, its description must justify why it is minimal and cannot
+  be further decomposed without losing analytical value.
 
 Output contract:
 Return exactly an InferredSchema object with JSON keys:
@@ -443,12 +494,21 @@ def _basic_field_list_validation(parsed: InferredSchema) -> None:
         raise ValueError("no fields suggested")
     if len(names) != len(set(names)):
         raise ValueError("duplicate field names detected")
-    allowed = {
+    allowed = {
+        "string",
+        "integer",
+        "float",
+        "boolean",
+        "string_array",
+        "integer_array",
+        "float_array",
+        "boolean_array",
+    }
     for f in parsed.fields:
         if f.type not in allowed:
             raise ValueError(f"unsupported field type: {f.type}")
         if f.enum_values is not None:
             if f.type != "string":
-                raise ValueError(f"enum_values only allowed for string field: {f.name}")
+                raise ValueError(f"enum_values only allowed for plain string field: {f.name}")
             if not (2 <= len(f.enum_values) <= 24):
                 raise ValueError(f"enum_values length out of bounds for field {f.name}")
```
openaivec/_serialize.py
CHANGED
```diff
@@ -4,19 +4,19 @@ This module provides utilities for converting Pydantic BaseModel classes
 to and from JSON schema representations with simplified, maintainable code.
 """
 
-from typing import Any,
+from typing import Any, Literal
 
 from pydantic import BaseModel, Field, create_model
 
 __all__ = []
 
 
-def serialize_base_model(obj:
+def serialize_base_model(obj: type[BaseModel]) -> dict[str, Any]:
     """Serialize a Pydantic BaseModel to JSON schema."""
     return obj.model_json_schema()
 
 
-def dereference_json_schema(json_schema:
+def dereference_json_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
     """Dereference JSON schema by resolving $ref pointers with circular reference protection."""
     model_map = json_schema.get("$defs", {})
 
@@ -61,7 +61,7 @@ def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
 # ============================================================================
 
 
-def _resolve_union_type(union_options:
+def _resolve_union_type(union_options: list[dict[str, Any]]) -> type:
     """Resolve anyOf/oneOf to Union type."""
     union_types = []
     for option in union_options:
@@ -75,12 +75,14 @@ def _resolve_union_type(union_options: List[Dict[str, Any]]) -> Type:
     elif len(union_types) == 2 and type(None) in union_types:
         # Optional type: T | None
         non_none_type = next(t for t in union_types if t is not type(None))
-        return
+        return non_none_type | None  # type: ignore[return-value]
     else:
+        from typing import Union
+
         return Union[tuple(union_types)]  # type: ignore[return-value]
 
 
-def _resolve_basic_type(type_name: str, field_def:
+def _resolve_basic_type(type_name: str, field_def: dict[str, Any]) -> type:
     """Resolve basic JSON schema types to Python types."""
     type_mapping = {
         "string": str,
@@ -101,14 +103,14 @@ def _resolve_basic_type(type_name: str, field_def: Dict[str, Any]) -> Type:
     elif type_name == "array":
         if "items" in field_def:
             inner_type = parse_field(field_def["items"])
-            return
+            return list[inner_type]
         else:
-            return
+            return list[Any]
     else:
         raise ValueError(f"Unsupported type: {type_name}")
 
 
-def parse_field(field_def:
+def parse_field(field_def: dict[str, Any]) -> type:
     """Parse a JSON schema field definition to a Python type.
 
     Simplified version with clear separation of concerns.
@@ -141,17 +143,19 @@ def _create_field_info(description: str | None, default_value: Any, is_required:
     return Field(default=default_value, description=description) if description else Field(default=default_value)
 
 
-def _make_optional_if_needed(field_type:
+def _make_optional_if_needed(field_type: type, is_required: bool, has_default: bool) -> type:
     """Make field type optional if needed."""
     if is_required or has_default:
         return field_type
 
     # Check if already nullable
+    from typing import Union
+
     if hasattr(field_type, "__origin__") and field_type.__origin__ is Union and type(None) in field_type.__args__:
         return field_type
 
     # Make optional
-    return
+    return field_type | None  # type: ignore[return-value]
 
 
 # ============================================================================
@@ -159,7 +163,7 @@ def _make_optional_if_needed(field_type: Type, is_required: bool, has_default: b
 # ============================================================================
 
 
-def _process_enum_field(field_name: str, field_def:
+def _process_enum_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]:  # type: ignore[type-arg]
     """Process enum field with Literal type."""
     enum_values = field_def["enum"]
 
@@ -175,14 +179,14 @@ def _process_enum_field(field_name: str, field_def: Dict[str, Any], is_required:
     has_default = default_value is not None
 
     if not is_required and not has_default:
-        literal_type =
+        literal_type = literal_type | None  # type: ignore[assignment]
         default_value = None
 
     field_info = _create_field_info(description, default_value, is_required)
     return literal_type, field_info  # type: ignore[return-value]
 
 
-def _process_regular_field(field_name: str, field_def:
+def _process_regular_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]:  # type: ignore[type-arg]
     """Process regular (non-enum) field."""
     field_type = parse_field(field_def)
     description = field_def.get("description")
@@ -204,7 +208,7 @@ def _process_regular_field(field_name: str, field_def: Dict[str, Any], is_requir
 # ============================================================================
 
 
-def deserialize_base_model(json_schema:
+def deserialize_base_model(json_schema: dict[str, Any]) -> type[BaseModel]:
     """Deserialize a JSON schema to a Pydantic BaseModel class.
 
     Refactored version with clear separation of concerns and simplified logic.
```
openaivec/_util.py
CHANGED
```diff
@@ -2,8 +2,9 @@ import asyncio
 import functools
 import re
 import time
+from collections.abc import Awaitable, Callable
 from dataclasses import dataclass
-from typing import
+from typing import TypeVar
 
 import numpy as np
 import tiktoken
@@ -36,14 +37,14 @@ def get_exponential_with_cutoff(scale: float) -> float:
 
 
 def backoff(
-    exceptions:
+    exceptions: list[type[Exception]],
     scale: int | None = None,
     max_retries: int | None = None,
 ) -> Callable[..., V]:
     """Decorator implementing exponential back‑off retry logic.
 
     Args:
-        exceptions (
+        exceptions (list[type[Exception]]): List of exception types that trigger a retry.
         scale (int | None): Initial scale parameter for the exponential jitter.
             This scale is used as the mean for the first delay's exponential
             distribution and doubles with each subsequent retry. If ``None``,
@@ -88,14 +89,14 @@ def backoff(
 
 
 def backoff_async(
-    exceptions:
+    exceptions: list[type[Exception]],
     scale: int | None = None,
     max_retries: int | None = None,
 ) -> Callable[..., Awaitable[V]]:
     """Asynchronous version of the backoff decorator.
 
     Args:
-        exceptions (
+        exceptions (list[type[Exception]]): List of exception types that trigger a retry.
         scale (int | None): Initial scale parameter for the exponential jitter.
             This scale is used as the mean for the first delay's exponential
             distribution and doubles with each subsequent retry. If ``None``,
@@ -145,7 +146,7 @@ class TextChunker:
 
     enc: tiktoken.Encoding
 
-    def split(self, original: str, max_tokens: int, sep:
+    def split(self, original: str, max_tokens: int, sep: list[str]) -> list[str]:
         """Token‑aware sentence segmentation.
 
         The text is first split by the given separators, then greedily packed
@@ -154,11 +155,11 @@ class TextChunker:
         Args:
             original (str): Original text to split.
             max_tokens (int): Maximum number of tokens allowed per chunk.
-            sep (
+            sep (list[str]): List of separator patterns used by
                 :pyfunc:`re.split`.
 
         Returns:
-
+            list[str]: List of text chunks respecting the ``max_tokens`` limit.
         """
         sentences = re.split(f"({'|'.join(sep)})", original)
         sentences = [s.strip() for s in sentences if s.strip()]
```