openaivec 0.14.7__py3-none-any.whl → 0.14.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- openaivec/_di.py +10 -9
- openaivec/_embeddings.py +12 -13
- openaivec/_log.py +1 -1
- openaivec/_model.py +3 -3
- openaivec/_optimize.py +3 -4
- openaivec/_prompt.py +4 -5
- openaivec/_proxy.py +34 -35
- openaivec/_responses.py +29 -29
- openaivec/_schema.py +56 -18
- openaivec/_serialize.py +19 -15
- openaivec/_util.py +9 -8
- openaivec/pandas_ext.py +20 -19
- openaivec/spark.py +11 -10
- openaivec/task/customer_support/customer_sentiment.py +2 -2
- openaivec/task/customer_support/inquiry_classification.py +8 -8
- openaivec/task/customer_support/inquiry_summary.py +4 -4
- openaivec/task/customer_support/intent_analysis.py +5 -5
- openaivec/task/customer_support/response_suggestion.py +4 -4
- openaivec/task/customer_support/urgency_analysis.py +9 -9
- openaivec/task/nlp/dependency_parsing.py +2 -4
- openaivec/task/nlp/keyword_extraction.py +3 -5
- openaivec/task/nlp/morphological_analysis.py +4 -6
- openaivec/task/nlp/named_entity_recognition.py +7 -9
- openaivec/task/nlp/sentiment_analysis.py +3 -3
- openaivec/task/nlp/translation.py +1 -2
- openaivec/task/table/fillna.py +2 -3
- {openaivec-0.14.7.dist-info → openaivec-0.14.8.dist-info}/METADATA +1 -1
- openaivec-0.14.8.dist-info/RECORD +36 -0
- openaivec-0.14.7.dist-info/RECORD +0 -36
- {openaivec-0.14.7.dist-info → openaivec-0.14.8.dist-info}/WHEEL +0 -0
- {openaivec-0.14.7.dist-info → openaivec-0.14.8.dist-info}/licenses/LICENSE +0 -0
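Most of the per-file churn below is one mechanical change: 0.14.8 replaces `typing.List`/`Dict`/`Type`/`Optional`/`Union` annotations with PEP 585 builtin generics and PEP 604 `X | None` unions. A minimal before/after sketch (illustrative names, not code from the package):

```python
from typing import Dict, List, Optional, Type  # old spellings, still valid but legacy style


# 0.14.7 style: typing aliases
def retryable_old(exceptions: List[Type[Exception]], config: Dict[str, str]) -> Optional[str]:
    return config.get("model")


# 0.14.8 style: builtin generics (PEP 585) and union syntax (PEP 604)
def retryable_new(exceptions: list[type[Exception]], config: dict[str, str]) -> str | None:
    return config.get("model")
```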
openaivec/_responses.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import warnings
 from dataclasses import dataclass, field
 from logging import Logger, getLogger
-from typing import Any, Generic,
+from typing import Any, Generic, cast
 
 from openai import AsyncOpenAI, BadRequestError, InternalServerError, OpenAI, RateLimitError
 from openai.types.responses import ParsedResponse
@@ -120,11 +120,11 @@ class Message(BaseModel, Generic[ResponseFormat]):
 
 
 class Request(BaseModel):
-    user_messages:
+    user_messages: list[Message[str]]
 
 
 class Response(BaseModel, Generic[ResponseFormat]):
-    assistant_messages:
+    assistant_messages: list[Message[ResponseFormat]]
 
 
 @dataclass(frozen=True)
@@ -150,7 +150,7 @@ class BatchResponses(Generic[ResponseFormat]):
         system_message (str): System prompt prepended to every request.
         temperature (float): Sampling temperature.
         top_p (float): Nucleus‑sampling parameter.
-        response_format (
+        response_format (type[ResponseFormat]): Expected Pydantic model class or ``str`` for each assistant message.
         cache (BatchingMapProxy[str, ResponseFormat]): Order‑preserving batching proxy with de‑duplication and caching.
 
     Notes:
@@ -165,7 +165,7 @@ class BatchResponses(Generic[ResponseFormat]):
     system_message: str
     temperature: float | None = None
     top_p: float = 1.0
-    response_format:
+    response_format: type[ResponseFormat] = str  # type: ignore[assignment]
     cache: BatchingMapProxy[str, ResponseFormat] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))
     _vectorized_system_message: str = field(init=False)
     _model_json_schema: dict = field(init=False)
@@ -178,7 +178,7 @@ class BatchResponses(Generic[ResponseFormat]):
         system_message: str,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
-        response_format:
+        response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
     ) -> "BatchResponses":
         """Factory constructor.
@@ -189,7 +189,7 @@ class BatchResponses(Generic[ResponseFormat]):
             system_message (str): System prompt for the model.
             temperature (float, optional): Sampling temperature. Defaults to 0.0.
             top_p (float, optional): Nucleus sampling parameter. Defaults to 1.0.
-            response_format (
+            response_format (type[ResponseFormat], optional): Expected output type. Defaults to ``str``.
             batch_size (int | None, optional): Max unique prompts per API call. Defaults to None
                 (automatic batch size optimization). Set to a positive integer for fixed batch size.
 
@@ -242,12 +242,12 @@ class BatchResponses(Generic[ResponseFormat]):
     @observe(_LOGGER)
     @backoff(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
     def _request_llm(
-        self, user_messages:
+        self, user_messages: list[Message[str]], **extra_api_params: Any
     ) -> ParsedResponse[Response[ResponseFormat]]:
         """Make a single call to the OpenAI JSON‑mode endpoint.
 
         Args:
-            user_messages (
+            user_messages (list[Message[str]]): Sequence of ``Message[str]`` representing the
                 prompts for this minibatch. Each message carries a unique `id`
                 so we can restore ordering later.
 
@@ -265,7 +265,7 @@ class BatchResponses(Generic[ResponseFormat]):
             body: response_format  # type: ignore
 
         class ResponseT(BaseModel):
-            assistant_messages:
+            assistant_messages: list[MessageT]
 
         # Build base API parameters (cannot be overridden by caller)
         api_params: dict[str, Any] = {
@@ -300,7 +300,7 @@ class BatchResponses(Generic[ResponseFormat]):
         return cast(ParsedResponse[Response[ResponseFormat]], completion)
 
     @observe(_LOGGER)
-    def _predict_chunk(self, user_messages:
+    def _predict_chunk(self, user_messages: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Helper executed for every unique minibatch.
 
         This method:
@@ -316,11 +316,11 @@ class BatchResponses(Generic[ResponseFormat]):
         if not responses.output_parsed:
             return [None] * len(messages)
         response_dict = {message.id: message.body for message in responses.output_parsed.assistant_messages}
-        sorted_responses:
+        sorted_responses: list[ResponseFormat | None] = [response_dict.get(m.id, None) for m in messages]
         return sorted_responses
 
     @observe(_LOGGER)
-    def parse(self, inputs:
+    def parse(self, inputs: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Batched predict.
 
         Accepts arbitrary keyword arguments that are forwarded to the underlying
@@ -329,16 +329,16 @@ class BatchResponses(Generic[ResponseFormat]):
         configured values but can be overridden explicitly.
 
         Args:
-            inputs (
+            inputs (list[str]): Prompts that require responses. Duplicates are de‑duplicated.
             **api_kwargs: Extra keyword args forwarded to the OpenAI Responses API.
 
         Returns:
-
+            list[ResponseFormat | None]: Assistant responses aligned to ``inputs``.
         """
         if not api_kwargs:
             return self.cache.map(inputs, self._predict_chunk)  # type: ignore[return-value]
 
-        def _predict_with(xs:
+        def _predict_with(xs: list[str]) -> list[ResponseFormat | None]:
             return self._predict_chunk(xs, **api_kwargs)
 
         return self.cache.map(inputs, _predict_with)  # type: ignore[return-value]
@@ -385,7 +385,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         system_message (str): System prompt prepended to every request.
         temperature (float): Sampling temperature.
         top_p (float): Nucleus‑sampling parameter.
-        response_format (
+        response_format (type[ResponseFormat]): Expected Pydantic model class or ``str`` for each assistant message.
         cache (AsyncBatchingMapProxy[str, ResponseFormat]): Async batching proxy with de‑duplication
             and concurrency control.
     """
@@ -395,7 +395,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
     system_message: str
     temperature: float | None = 0.0
     top_p: float = 1.0
-    response_format:
+    response_format: type[ResponseFormat] = str  # type: ignore[assignment]
     cache: AsyncBatchingMapProxy[str, ResponseFormat] = field(
         default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
     )
@@ -410,7 +410,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         system_message: str,
         temperature: float | None = None,
         top_p: float = 1.0,
-        response_format:
+        response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
         max_concurrency: int = 8,
     ) -> "AsyncBatchResponses":
@@ -422,7 +422,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             system_message (str): System prompt.
             temperature (float, optional): Sampling temperature. Defaults to 0.0.
             top_p (float, optional): Nucleus sampling parameter. Defaults to 1.0.
-            response_format (
+            response_format (type[ResponseFormat], optional): Expected output type. Defaults to ``str``.
             batch_size (int | None, optional): Max unique prompts per API call. Defaults to None
                 (automatic batch size optimization). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
@@ -482,12 +482,12 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
     @backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
     @observe(_LOGGER)
     async def _request_llm(
-        self, user_messages:
+        self, user_messages: list[Message[str]], **extra_api_params: Any
     ) -> ParsedResponse[Response[ResponseFormat]]:
         """Make a single async call to the OpenAI JSON‑mode endpoint.
 
         Args:
-            user_messages (
+            user_messages (list[Message[str]]): Sequence of ``Message[str]`` representing the minibatch prompts.
 
         Returns:
             ParsedResponse[Response[ResponseFormat]]: Parsed response with assistant messages (arbitrary order).
@@ -502,7 +502,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             body: response_format  # type: ignore
 
         class ResponseT(BaseModel):
-            assistant_messages:
+            assistant_messages: list[MessageT]
 
         # Build base API parameters (cannot be overridden by caller)
         api_params: dict[str, Any] = {
@@ -537,7 +537,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         return cast(ParsedResponse[Response[ResponseFormat]], completion)
 
     @observe(_LOGGER)
-    async def _predict_chunk(self, user_messages:
+    async def _predict_chunk(self, user_messages: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Async helper executed for every unique minibatch.
 
         This method:
@@ -553,11 +553,11 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             return [None] * len(messages)
         response_dict = {message.id: message.body for message in responses.output_parsed.assistant_messages}
         # Ensure proper handling for missing IDs - this shouldn't happen in normal operation
-        sorted_responses:
+        sorted_responses: list[ResponseFormat | None] = [response_dict.get(m.id, None) for m in messages]
         return sorted_responses
 
     @observe(_LOGGER)
-    async def parse(self, inputs:
+    async def parse(self, inputs: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Batched predict (async).
 
         Accepts arbitrary keyword arguments forwarded to ``AsyncOpenAI.responses.parse``.
@@ -566,16 +566,16 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         changing the public surface again.
 
         Args:
-            inputs (
+            inputs (list[str]): Prompts that require responses. Duplicates are de‑duplicated.
             **api_kwargs: Extra keyword args for the OpenAI Responses API.
 
         Returns:
-
+            list[ResponseFormat | None]: Assistant responses aligned to ``inputs``.
         """
         if not api_kwargs:
            return await self.cache.map(inputs, self._predict_chunk)  # type: ignore[return-value]
 
-        async def _predict_with(xs:
+        async def _predict_with(xs: list[str]) -> list[ResponseFormat | None]:
             return await self._predict_chunk(xs, **api_kwargs)
 
         return await self.cache.map(inputs, _predict_with)  # type: ignore[return-value]
```
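For context, a sketch of how the re-annotated surface is used. Only `system_message`, `temperature`, `top_p`, `response_format`, and `batch_size` appear in the `of(...)` hunks above; the `client` and `model_name` parameter names here are assumptions:

```python
from openai import OpenAI
from pydantic import BaseModel

from openaivec._responses import BatchResponses  # internal module shown in this diff


class Sentiment(BaseModel):
    label: str
    score: float


# `client` and `model_name` are assumed parameter names; the remaining
# keywords match the `of(...)` signature in the hunks above.
batch = BatchResponses.of(
    client=OpenAI(),
    model_name="gpt-4.1-mini",
    system_message="Classify the sentiment of each input.",
    response_format=Sentiment,  # defaults to ``str`` when omitted
    batch_size=None,  # None -> automatic batch size optimization
)

# parse() de-duplicates inputs, batches the unique prompts, and returns
# results aligned to input order; unmatched message ids come back as None.
results: list[Sentiment | None] = batch.parse(["I love it", "Meh", "I love it"])
```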
openaivec/_schema.py
CHANGED
```diff
@@ -25,8 +25,11 @@ This module is intentionally **internal** (``__all__ = []``). Public users
 should interact through higher‑level batch APIs once a schema has been inferred.
 
 Design constraints:
-* Flat schema only (no
-
+* Flat schema only (no nested objects). Top-level arrays permitted ONLY as homogeneous arrays of primitives
+  (e.g. array of strings) – represented via specialized primitive array type names
+  (string_array, integer_array, float_array, boolean_array).
+* Primitive scalar types limited to {string, integer, float, boolean}; optional array variants
+  {string_array, integer_array, float_array, boolean_array}.
 * Optional enumerations for *closed*, *observed* categorical sets only.
 * Validation retries ensure a structurally coherent suggestion before returning.
 
@@ -49,7 +52,7 @@ authoritative contract is the ordered list of ``FieldSpec`` instances.
 
 from dataclasses import dataclass
 from enum import Enum
-from typing import
+from typing import Literal
 
 from openai import OpenAI
 from openai.types.responses import ParsedResponse
@@ -95,12 +98,24 @@ class FieldSpec(BaseModel):
             "negated forms (is_not_active)."
         )
     )
-    type: Literal[
+    type: Literal[
+        "string",
+        "integer",
+        "float",
+        "boolean",
+        "string_array",
+        "integer_array",
+        "float_array",
+        "boolean_array",
+    ] = Field(
         description=(
             "Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
             "Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
             "explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
-            "
+            "Array variants (string_array, integer_array, float_array, boolean_array) are ONLY allowed when the value "
+            "is a repeatable homogeneous collection whose individual elements would otherwise stand as valid scalar "
+            "extractions (e.g. keywords, error_codes, tag_ids). Do not encode objects or mixed-type arrays; flatten or "
+            "choose the most informative level."
         )
     )
     description: str = Field(
@@ -112,7 +127,7 @@ class FieldSpec(BaseModel):
             "state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
         )
     )
-    enum_values:
+    enum_values: list[str] | None = Field(
         default=None,
         description=(
             "Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
@@ -168,7 +183,7 @@ class InferredSchema(BaseModel):
             "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
         )
     )
-    fields:
+    fields: list[FieldSpec] = Field(
         description=(
             "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
             "examples and aligned with the purpose."
@@ -199,13 +214,13 @@ class InferredSchema(BaseModel):
             return cls.model_validate_json(f.read())
 
     @property
-    def model(self) ->
+    def model(self) -> type[BaseModel]:
         """Dynamically materialized Pydantic model for the inferred schema.
 
         Equivalent to calling :meth:`build_model` each access (not cached).
 
         Returns:
-
+            type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
         """
         return self.build_model()
 
@@ -220,7 +235,7 @@ class InferredSchema(BaseModel):
             instructions=self.inference_prompt, response_format=self.model, top_p=None, temperature=None
         )
 
-    def build_model(self) ->
+    def build_model(self) -> type[BaseModel]:
         """Create a new dynamic ``BaseModel`` class adhering to this schema.
 
         Implementation details:
@@ -231,9 +246,14 @@ class InferredSchema(BaseModel):
             introduced later by modifying this logic if needed.
 
         Returns:
-
+            type[BaseModel]: New (not cached) model type; order matches ``fields``.
         """
-        type_map: dict[str, type] = {
+        type_map: dict[str, type] = {
+            "string": str,
+            "integer": int,
+            "float": float,
+            "boolean": bool,
+        }
         fields: dict[str, tuple[type, object]] = {}
 
         for spec in self.fields:
@@ -254,7 +274,11 @@ class InferredSchema(BaseModel):
                 enum_cls = Enum(enum_class_name, members)  # type: ignore[arg-type]
                 py_type = enum_cls
             else:
-
+                if spec.type.endswith("_array"):
+                    base = spec.type.rsplit("_", 1)[0]
+                    py_type = list[type_map[base]]  # type: ignore[index]
+                else:
+                    py_type = type_map[spec.type]
             fields[spec.name] = (py_type, Field(description=spec.description))
 
         model = create_model("InferredSchema", **fields)  # type: ignore[call-arg]
@@ -282,7 +306,7 @@ class SchemaInferenceInput(BaseModel):
     relevance & exclusion of outcome labels.
     """
 
-    examples:
+    examples: list[str] = Field(
         description=(
             "Representative sample texts (strings). Provide only data the schema should generalize over; "
             "exclude outliers not in scope."
@@ -306,7 +330,8 @@ Task:
    to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
    sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
    This MUST NOT introduce new domain facts beyond the examples & purpose.
-4. Propose a minimal flat set of scalar fields (
+4. Propose a minimal flat set of scalar fields (and ONLY when justified,
+   homogeneous primitive arrays) that are reliably extractable.
 5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
    is clearly evidenced; never invent.
@@ -324,7 +349,11 @@ Rules:
   (e.g. *_count, *_seconds, *_ms, *_usd, *_ratio, *_score). Avoid ambiguous bare numeric names.
 - Boolean field names MUST start with 'is_' followed by a positive predicate (e.g. is_active,
   is_delayed). Avoid negated forms.
-- No
+- No nested objects or mixed-type arrays. Homogeneous primitive arrays are allowed ONLY if each element is an atomic
+  scalar signal (use *_array types: string_array, integer_array, float_array, boolean_array). The array is expected to
+  contain 0..N such elements per record.
+- Array field names MUST end with '_array' (e.g. keywords_array, tag_ids_array). Do not use plural-only forms
+  (e.g. keywords) for arrays; the suffix makes container semantics explicit.
 - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
 - enum_values only for string fields with stable closed vocab; omit otherwise.
 - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
@@ -465,12 +494,21 @@ def _basic_field_list_validation(parsed: InferredSchema) -> None:
         raise ValueError("no fields suggested")
     if len(names) != len(set(names)):
         raise ValueError("duplicate field names detected")
-    allowed = {
+    allowed = {
+        "string",
+        "integer",
+        "float",
+        "boolean",
+        "string_array",
+        "integer_array",
+        "float_array",
+        "boolean_array",
+    }
     for f in parsed.fields:
         if f.type not in allowed:
             raise ValueError(f"unsupported field type: {f.type}")
         if f.enum_values is not None:
             if f.type != "string":
-                raise ValueError(f"enum_values only allowed for string field: {f.name}")
+                raise ValueError(f"enum_values only allowed for plain string field: {f.name}")
             if not (2 <= len(f.enum_values) <= 24):
                 raise ValueError(f"enum_values length out of bounds for field {f.name}")
```
openaivec/_serialize.py
CHANGED
```diff
@@ -4,19 +4,19 @@ This module provides utilities for converting Pydantic BaseModel classes
 to and from JSON schema representations with simplified, maintainable code.
 """
 
-from typing import Any,
+from typing import Any, Literal
 
 from pydantic import BaseModel, Field, create_model
 
 __all__ = []
 
 
-def serialize_base_model(obj:
+def serialize_base_model(obj: type[BaseModel]) -> dict[str, Any]:
     """Serialize a Pydantic BaseModel to JSON schema."""
     return obj.model_json_schema()
 
 
-def dereference_json_schema(json_schema:
+def dereference_json_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
     """Dereference JSON schema by resolving $ref pointers with circular reference protection."""
     model_map = json_schema.get("$defs", {})
 
@@ -61,7 +61,7 @@ def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
 # ============================================================================
 
 
-def _resolve_union_type(union_options:
+def _resolve_union_type(union_options: list[dict[str, Any]]) -> type:
     """Resolve anyOf/oneOf to Union type."""
     union_types = []
     for option in union_options:
@@ -75,12 +75,14 @@ def _resolve_union_type(union_options: List[Dict[str, Any]]) -> Type:
     elif len(union_types) == 2 and type(None) in union_types:
         # Optional type: T | None
         non_none_type = next(t for t in union_types if t is not type(None))
-        return
+        return non_none_type | None  # type: ignore[return-value]
     else:
+        from typing import Union
+
         return Union[tuple(union_types)]  # type: ignore[return-value]
 
 
-def _resolve_basic_type(type_name: str, field_def:
+def _resolve_basic_type(type_name: str, field_def: dict[str, Any]) -> type:
     """Resolve basic JSON schema types to Python types."""
     type_mapping = {
         "string": str,
@@ -101,14 +103,14 @@ def _resolve_basic_type(type_name: str, field_def: Dict[str, Any]) -> Type:
     elif type_name == "array":
         if "items" in field_def:
             inner_type = parse_field(field_def["items"])
-            return
+            return list[inner_type]
         else:
-            return
+            return list[Any]
     else:
         raise ValueError(f"Unsupported type: {type_name}")
 
 
-def parse_field(field_def:
+def parse_field(field_def: dict[str, Any]) -> type:
     """Parse a JSON schema field definition to a Python type.
 
     Simplified version with clear separation of concerns.
@@ -141,17 +143,19 @@ def _create_field_info(description: str | None, default_value: Any, is_required:
     return Field(default=default_value, description=description) if description else Field(default=default_value)
 
 
-def _make_optional_if_needed(field_type:
+def _make_optional_if_needed(field_type: type, is_required: bool, has_default: bool) -> type:
     """Make field type optional if needed."""
     if is_required or has_default:
         return field_type
 
     # Check if already nullable
+    from typing import Union
+
     if hasattr(field_type, "__origin__") and field_type.__origin__ is Union and type(None) in field_type.__args__:
         return field_type
 
     # Make optional
-    return
+    return field_type | None  # type: ignore[return-value]
 
 
 # ============================================================================
@@ -159,7 +163,7 @@ def _make_optional_if_needed(field_type: Type, is_required: bool, has_default: b
 # ============================================================================
 
 
-def _process_enum_field(field_name: str, field_def:
+def _process_enum_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]:  # type: ignore[type-arg]
     """Process enum field with Literal type."""
     enum_values = field_def["enum"]
 
@@ -175,14 +179,14 @@ def _process_enum_field(field_name: str, field_def: Dict[str, Any], is_required:
     has_default = default_value is not None
 
     if not is_required and not has_default:
-        literal_type =
+        literal_type = literal_type | None  # type: ignore[assignment]
         default_value = None
 
     field_info = _create_field_info(description, default_value, is_required)
     return literal_type, field_info  # type: ignore[return-value]
 
 
-def _process_regular_field(field_name: str, field_def:
+def _process_regular_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]:  # type: ignore[type-arg]
     """Process regular (non-enum) field."""
     field_type = parse_field(field_def)
     description = field_def.get("description")
@@ -204,7 +208,7 @@ def _process_regular_field(field_name: str, field_def: Dict[str, Any], is_requir
 # ============================================================================
 
 
-def deserialize_base_model(json_schema:
+def deserialize_base_model(json_schema: dict[str, Any]) -> type[BaseModel]:
     """Deserialize a JSON schema to a Pydantic BaseModel class.
 
     Refactored version with clear separation of concerns and simplified logic.
```
openaivec/_util.py
CHANGED
```diff
@@ -2,8 +2,9 @@ import asyncio
 import functools
 import re
 import time
+from collections.abc import Awaitable, Callable
 from dataclasses import dataclass
-from typing import
+from typing import TypeVar
 
 import numpy as np
 import tiktoken
@@ -36,14 +37,14 @@ def get_exponential_with_cutoff(scale: float) -> float:
 
 
 def backoff(
-    exceptions:
+    exceptions: list[type[Exception]],
     scale: int | None = None,
     max_retries: int | None = None,
 ) -> Callable[..., V]:
     """Decorator implementing exponential back‑off retry logic.
 
     Args:
-        exceptions (
+        exceptions (list[type[Exception]]): List of exception types that trigger a retry.
         scale (int | None): Initial scale parameter for the exponential jitter.
             This scale is used as the mean for the first delay's exponential
             distribution and doubles with each subsequent retry. If ``None``,
@@ -88,14 +89,14 @@ def backoff(
 
 
 def backoff_async(
-    exceptions:
+    exceptions: list[type[Exception]],
     scale: int | None = None,
     max_retries: int | None = None,
 ) -> Callable[..., Awaitable[V]]:
     """Asynchronous version of the backoff decorator.
 
     Args:
-        exceptions (
+        exceptions (list[type[Exception]]): List of exception types that trigger a retry.
         scale (int | None): Initial scale parameter for the exponential jitter.
             This scale is used as the mean for the first delay's exponential
             distribution and doubles with each subsequent retry. If ``None``,
@@ -145,7 +146,7 @@ class TextChunker:
 
     enc: tiktoken.Encoding
 
-    def split(self, original: str, max_tokens: int, sep:
+    def split(self, original: str, max_tokens: int, sep: list[str]) -> list[str]:
         """Token‑aware sentence segmentation.
 
         The text is first split by the given separators, then greedily packed
@@ -154,11 +155,11 @@ class TextChunker:
         Args:
             original (str): Original text to split.
             max_tokens (int): Maximum number of tokens allowed per chunk.
-            sep (
+            sep (list[str]): List of separator patterns used by
                 :pyfunc:`re.split`.
 
         Returns:
-
+            list[str]: List of text chunks respecting the ``max_tokens`` limit.
         """
         sentences = re.split(f"({'|'.join(sep)})", original)
         sentences = [s.strip() for s in sentences if s.strip()]
```