openaivec 0.14.6__py3-none-any.whl → 0.14.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_di.py +10 -9
- openaivec/_embeddings.py +12 -13
- openaivec/_log.py +1 -1
- openaivec/_model.py +3 -3
- openaivec/_optimize.py +3 -4
- openaivec/_prompt.py +4 -5
- openaivec/_proxy.py +34 -35
- openaivec/_responses.py +29 -29
- openaivec/_schema.py +80 -20
- openaivec/_serialize.py +19 -15
- openaivec/_util.py +9 -8
- openaivec/pandas_ext.py +20 -19
- openaivec/spark.py +11 -10
- openaivec/task/customer_support/customer_sentiment.py +2 -2
- openaivec/task/customer_support/inquiry_classification.py +8 -8
- openaivec/task/customer_support/inquiry_summary.py +4 -4
- openaivec/task/customer_support/intent_analysis.py +5 -5
- openaivec/task/customer_support/response_suggestion.py +4 -4
- openaivec/task/customer_support/urgency_analysis.py +9 -9
- openaivec/task/nlp/dependency_parsing.py +2 -4
- openaivec/task/nlp/keyword_extraction.py +3 -5
- openaivec/task/nlp/morphological_analysis.py +4 -6
- openaivec/task/nlp/named_entity_recognition.py +7 -9
- openaivec/task/nlp/sentiment_analysis.py +3 -3
- openaivec/task/nlp/translation.py +1 -2
- openaivec/task/table/fillna.py +2 -3
- {openaivec-0.14.6.dist-info → openaivec-0.14.8.dist-info}/METADATA +1 -1
- openaivec-0.14.8.dist-info/RECORD +36 -0
- openaivec-0.14.6.dist-info/RECORD +0 -36
- {openaivec-0.14.6.dist-info → openaivec-0.14.8.dist-info}/WHEEL +0 -0
- {openaivec-0.14.6.dist-info → openaivec-0.14.8.dist-info}/licenses/LICENSE +0 -0
openaivec/_responses.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import warnings
 from dataclasses import dataclass, field
 from logging import Logger, getLogger
-from typing import Any, Generic,
+from typing import Any, Generic, cast
 
 from openai import AsyncOpenAI, BadRequestError, InternalServerError, OpenAI, RateLimitError
 from openai.types.responses import ParsedResponse
@@ -120,11 +120,11 @@ class Message(BaseModel, Generic[ResponseFormat]):
 
 
 class Request(BaseModel):
-    user_messages:
+    user_messages: list[Message[str]]
 
 
 class Response(BaseModel, Generic[ResponseFormat]):
-    assistant_messages:
+    assistant_messages: list[Message[ResponseFormat]]
 
 
 @dataclass(frozen=True)
@@ -150,7 +150,7 @@ class BatchResponses(Generic[ResponseFormat]):
         system_message (str): System prompt prepended to every request.
         temperature (float): Sampling temperature.
         top_p (float): Nucleus‑sampling parameter.
-        response_format (
+        response_format (type[ResponseFormat]): Expected Pydantic model class or ``str`` for each assistant message.
         cache (BatchingMapProxy[str, ResponseFormat]): Order‑preserving batching proxy with de‑duplication and caching.
 
     Notes:
@@ -165,7 +165,7 @@ class BatchResponses(Generic[ResponseFormat]):
     system_message: str
     temperature: float | None = None
     top_p: float = 1.0
-    response_format:
+    response_format: type[ResponseFormat] = str  # type: ignore[assignment]
     cache: BatchingMapProxy[str, ResponseFormat] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))
     _vectorized_system_message: str = field(init=False)
     _model_json_schema: dict = field(init=False)
@@ -178,7 +178,7 @@ class BatchResponses(Generic[ResponseFormat]):
         system_message: str,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
-        response_format:
+        response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
     ) -> "BatchResponses":
         """Factory constructor.
@@ -189,7 +189,7 @@ class BatchResponses(Generic[ResponseFormat]):
             system_message (str): System prompt for the model.
             temperature (float, optional): Sampling temperature. Defaults to 0.0.
             top_p (float, optional): Nucleus sampling parameter. Defaults to 1.0.
-            response_format (
+            response_format (type[ResponseFormat], optional): Expected output type. Defaults to ``str``.
             batch_size (int | None, optional): Max unique prompts per API call. Defaults to None
                 (automatic batch size optimization). Set to a positive integer for fixed batch size.
@@ -242,12 +242,12 @@ class BatchResponses(Generic[ResponseFormat]):
     @observe(_LOGGER)
     @backoff(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
     def _request_llm(
-        self, user_messages:
+        self, user_messages: list[Message[str]], **extra_api_params: Any
     ) -> ParsedResponse[Response[ResponseFormat]]:
         """Make a single call to the OpenAI JSON‑mode endpoint.
 
         Args:
-            user_messages (
+            user_messages (list[Message[str]]): Sequence of ``Message[str]`` representing the
                 prompts for this minibatch. Each message carries a unique `id`
                 so we can restore ordering later.
@@ -265,7 +265,7 @@ class BatchResponses(Generic[ResponseFormat]):
             body: response_format  # type: ignore
 
         class ResponseT(BaseModel):
-            assistant_messages:
+            assistant_messages: list[MessageT]
 
         # Build base API parameters (cannot be overridden by caller)
         api_params: dict[str, Any] = {
@@ -300,7 +300,7 @@ class BatchResponses(Generic[ResponseFormat]):
         return cast(ParsedResponse[Response[ResponseFormat]], completion)
 
     @observe(_LOGGER)
-    def _predict_chunk(self, user_messages:
+    def _predict_chunk(self, user_messages: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Helper executed for every unique minibatch.
 
         This method:
@@ -316,11 +316,11 @@ class BatchResponses(Generic[ResponseFormat]):
         if not responses.output_parsed:
             return [None] * len(messages)
         response_dict = {message.id: message.body for message in responses.output_parsed.assistant_messages}
-        sorted_responses:
+        sorted_responses: list[ResponseFormat | None] = [response_dict.get(m.id, None) for m in messages]
         return sorted_responses
 
     @observe(_LOGGER)
-    def parse(self, inputs:
+    def parse(self, inputs: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Batched predict.
 
         Accepts arbitrary keyword arguments that are forwarded to the underlying
@@ -329,16 +329,16 @@ class BatchResponses(Generic[ResponseFormat]):
         configured values but can be overridden explicitly.
 
         Args:
-            inputs (
+            inputs (list[str]): Prompts that require responses. Duplicates are de‑duplicated.
             **api_kwargs: Extra keyword args forwarded to the OpenAI Responses API.
 
         Returns:
-
+            list[ResponseFormat | None]: Assistant responses aligned to ``inputs``.
         """
         if not api_kwargs:
             return self.cache.map(inputs, self._predict_chunk)  # type: ignore[return-value]
 
-        def _predict_with(xs:
+        def _predict_with(xs: list[str]) -> list[ResponseFormat | None]:
             return self._predict_chunk(xs, **api_kwargs)
 
         return self.cache.map(inputs, _predict_with)  # type: ignore[return-value]
@@ -385,7 +385,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         system_message (str): System prompt prepended to every request.
         temperature (float): Sampling temperature.
         top_p (float): Nucleus‑sampling parameter.
-        response_format (
+        response_format (type[ResponseFormat]): Expected Pydantic model class or ``str`` for each assistant message.
         cache (AsyncBatchingMapProxy[str, ResponseFormat]): Async batching proxy with de‑duplication
             and concurrency control.
     """
@@ -395,7 +395,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
     system_message: str
     temperature: float | None = 0.0
     top_p: float = 1.0
-    response_format:
+    response_format: type[ResponseFormat] = str  # type: ignore[assignment]
     cache: AsyncBatchingMapProxy[str, ResponseFormat] = field(
         default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
     )
@@ -410,7 +410,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         system_message: str,
         temperature: float | None = None,
         top_p: float = 1.0,
-        response_format:
+        response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
         max_concurrency: int = 8,
     ) -> "AsyncBatchResponses":
@@ -422,7 +422,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             system_message (str): System prompt.
             temperature (float, optional): Sampling temperature. Defaults to 0.0.
             top_p (float, optional): Nucleus sampling parameter. Defaults to 1.0.
-            response_format (
+            response_format (type[ResponseFormat], optional): Expected output type. Defaults to ``str``.
             batch_size (int | None, optional): Max unique prompts per API call. Defaults to None
                 (automatic batch size optimization). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
@@ -482,12 +482,12 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
     @backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
     @observe(_LOGGER)
     async def _request_llm(
-        self, user_messages:
+        self, user_messages: list[Message[str]], **extra_api_params: Any
     ) -> ParsedResponse[Response[ResponseFormat]]:
         """Make a single async call to the OpenAI JSON‑mode endpoint.
 
         Args:
-            user_messages (
+            user_messages (list[Message[str]]): Sequence of ``Message[str]`` representing the minibatch prompts.
 
         Returns:
             ParsedResponse[Response[ResponseFormat]]: Parsed response with assistant messages (arbitrary order).
@@ -502,7 +502,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             body: response_format  # type: ignore
 
         class ResponseT(BaseModel):
-            assistant_messages:
+            assistant_messages: list[MessageT]
 
         # Build base API parameters (cannot be overridden by caller)
         api_params: dict[str, Any] = {
@@ -537,7 +537,7 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         return cast(ParsedResponse[Response[ResponseFormat]], completion)
 
     @observe(_LOGGER)
-    async def _predict_chunk(self, user_messages:
+    async def _predict_chunk(self, user_messages: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Async helper executed for every unique minibatch.
 
         This method:
@@ -553,11 +553,11 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
             return [None] * len(messages)
         response_dict = {message.id: message.body for message in responses.output_parsed.assistant_messages}
         # Ensure proper handling for missing IDs - this shouldn't happen in normal operation
-        sorted_responses:
+        sorted_responses: list[ResponseFormat | None] = [response_dict.get(m.id, None) for m in messages]
         return sorted_responses
 
     @observe(_LOGGER)
-    async def parse(self, inputs:
+    async def parse(self, inputs: list[str], **api_kwargs: Any) -> list[ResponseFormat | None]:
         """Batched predict (async).
 
         Accepts arbitrary keyword arguments forwarded to ``AsyncOpenAI.responses.parse``.
@@ -566,16 +566,16 @@ class AsyncBatchResponses(Generic[ResponseFormat]):
         changing the public surface again.
 
         Args:
-            inputs (
+            inputs (list[str]): Prompts that require responses. Duplicates are de‑duplicated.
             **api_kwargs: Extra keyword args for the OpenAI Responses API.
 
         Returns:
-
+            list[ResponseFormat | None]: Assistant responses aligned to ``inputs``.
         """
         if not api_kwargs:
            return await self.cache.map(inputs, self._predict_chunk)  # type: ignore[return-value]
 
-        async def _predict_with(xs:
+        async def _predict_with(xs: list[str]) -> list[ResponseFormat | None]:
             return await self._predict_chunk(xs, **api_kwargs)
 
         return await self.cache.map(inputs, _predict_with)  # type: ignore[return-value]
```
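The heart of this surface is the id-keyed batching idiom: each request carries a `Message` with a unique `id` so that `_predict_chunk` can realign out-of-order (or missing) assistant messages. A minimal, self-contained sketch of that realignment, using a simplified non-generic `Message` model for illustration:

```python
# Sketch of the order-restoration idiom from _predict_chunk (simplified;
# the real Message is generic over its body type).
from pydantic import BaseModel


class Message(BaseModel):
    id: int
    body: str


requests = [Message(id=i, body=text) for i, text in enumerate(["a", "b", "c"])]
# Assistant messages may arrive in arbitrary order; here id=1 is also missing.
responses = [Message(id=2, body="C"), Message(id=0, body="A")]

response_dict = {m.id: m.body for m in responses}
# Missing ids fall back to None, keeping the output aligned with the input.
sorted_responses: list[str | None] = [response_dict.get(m.id) for m in requests]
assert sorted_responses == ["A", None, "C"]
```

This is why `parse` can promise `list[ResponseFormat | None]` aligned to `inputs` even though the API returns assistant messages in arbitrary order.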
openaivec/_schema.py
CHANGED
```diff
@@ -25,8 +25,11 @@ This module is intentionally **internal** (``__all__ = []``). Public users
 should interact through higher‑level batch APIs once a schema has been inferred.
 
 Design constraints:
-* Flat schema only (no
-
+* Flat schema only (no nested objects). Top-level arrays permitted ONLY as homogeneous arrays of primitives
+  (e.g. array of strings) – represented via specialized primitive array type names
+  (string_array, integer_array, float_array, boolean_array).
+* Primitive scalar types limited to {string, integer, float, boolean}; optional array variants
+  {string_array, integer_array, float_array, boolean_array}.
 * Optional enumerations for *closed*, *observed* categorical sets only.
 * Validation retries ensure a structurally coherent suggestion before returning.
 
@@ -49,7 +52,7 @@ authoritative contract is the ordered list of ``FieldSpec`` instances.
 
 from dataclasses import dataclass
 from enum import Enum
-from typing import
+from typing import Literal
 
 from openai import OpenAI
 from openai.types.responses import ParsedResponse
@@ -87,24 +90,44 @@ class FieldSpec(BaseModel):
     name: str = Field(
         description=(
             "Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
-            "express the semantic meaning succinctly (no adjectives like 'best', 'great')."
+            "express the semantic meaning succinctly (no adjectives like 'best', 'great'). For numeric (integer|float) "
+            "fields the name MUST include an explicit unit or measure suffix (e.g. _count, _total_count, "
+            "_duration_seconds, _ms, _price_usd, _ratio, _score) to eliminate ambiguity. Avoid bare numeric nouns like "
+            "'duration' or 'value' without unit/scale. Boolean field names MUST begin with 'is_' followed by a "
+            "descriptive predicate (e.g. is_active, is_delayed). Use positive forms (is_active) rather than "
+            "negated forms (is_not_active)."
         )
     )
-    type: Literal[
+    type: Literal[
+        "string",
+        "integer",
+        "float",
+        "boolean",
+        "string_array",
+        "integer_array",
+        "float_array",
+        "boolean_array",
+    ] = Field(
         description=(
             "Primitive type. Use 'integer' only if all observed numeric values are whole numbers. "
             "Use 'float' if any value can contain a decimal or represents a ratio/score. Use 'boolean' only for "
             "explicit binary states (yes/no, true/false, present/absent) consistently encoded. Use 'string' otherwise. "
-            "
+            "Array variants (string_array, integer_array, float_array, boolean_array) are ONLY allowed when the value "
+            "is a repeatable homogeneous collection whose individual elements would otherwise stand as valid scalar "
+            "extractions (e.g. keywords, error_codes, tag_ids). Do not encode objects or mixed-type arrays; flatten or "
+            "choose the most informative level."
         )
     )
     description: str = Field(
         description=(
             "Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
-            "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction."
+            "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction. "
+            "Do NOT simply restate an original JSON/key name if the examples are already structured; only include a "
+            "raw key verbatim when it is already the minimal, irreducible analytic unit. For derived fields, clearly "
+            "state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
         )
     )
-    enum_values:
+    enum_values: list[str] | None = Field(
         default=None,
         description=(
             "Optional finite categorical label set (classification) for a string field. Provide ONLY when a closed, "
@@ -160,7 +183,7 @@ class InferredSchema(BaseModel):
             "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
         )
     )
-    fields:
+    fields: list[FieldSpec] = Field(
         description=(
             "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
             "examples and aligned with the purpose."
@@ -191,13 +214,13 @@ class InferredSchema(BaseModel):
             return cls.model_validate_json(f.read())
 
     @property
-    def model(self) ->
+    def model(self) -> type[BaseModel]:
         """Dynamically materialized Pydantic model for the inferred schema.
 
         Equivalent to calling :meth:`build_model` each access (not cached).
 
         Returns:
-
+            type[BaseModel]: Fresh model type reflecting ``fields`` ordering.
         """
         return self.build_model()
 
@@ -212,7 +235,7 @@ class InferredSchema(BaseModel):
             instructions=self.inference_prompt, response_format=self.model, top_p=None, temperature=None
         )
 
-    def build_model(self) ->
+    def build_model(self) -> type[BaseModel]:
         """Create a new dynamic ``BaseModel`` class adhering to this schema.
 
         Implementation details:
@@ -223,9 +246,14 @@ class InferredSchema(BaseModel):
             introduced later by modifying this logic if needed.
 
         Returns:
-
+            type[BaseModel]: New (not cached) model type; order matches ``fields``.
         """
-        type_map: dict[str, type] = {
+        type_map: dict[str, type] = {
+            "string": str,
+            "integer": int,
+            "float": float,
+            "boolean": bool,
+        }
         fields: dict[str, tuple[type, object]] = {}
 
         for spec in self.fields:
@@ -246,7 +274,11 @@ class InferredSchema(BaseModel):
                 enum_cls = Enum(enum_class_name, members)  # type: ignore[arg-type]
                 py_type = enum_cls
             else:
-
+                if spec.type.endswith("_array"):
+                    base = spec.type.rsplit("_", 1)[0]
+                    py_type = list[type_map[base]]  # type: ignore[index]
+                else:
+                    py_type = type_map[spec.type]
             fields[spec.name] = (py_type, Field(description=spec.description))
 
         model = create_model("InferredSchema", **fields)  # type: ignore[call-arg]
@@ -274,7 +306,7 @@ class SchemaInferenceInput(BaseModel):
     relevance & exclusion of outcome labels.
     """
 
-    examples:
+    examples: list[str] = Field(
         description=(
             "Representative sample texts (strings). Provide only data the schema should generalize over; "
             "exclude outliers not in scope."
@@ -298,7 +330,8 @@ Task:
    to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
    sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
    This MUST NOT introduce new domain facts beyond the examples & purpose.
-4. Propose a minimal flat set of scalar fields (
+4. Propose a minimal flat set of scalar fields (and ONLY when justified,
+   homogeneous primitive arrays) that are reliably extractable.
 5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
    is clearly evidenced; never invent.
@@ -312,11 +345,29 @@ Rules:
   * float = any decimals / ratios
   * boolean = explicit binary
   * else use string
-
+- Numeric (integer|float) field names MUST encode an explicit unit / scale / measure suffix
+  (e.g. *_count, *_seconds, *_ms, *_usd, *_ratio, *_score). Avoid ambiguous bare numeric names.
+- Boolean field names MUST start with 'is_' followed by a positive predicate (e.g. is_active,
+  is_delayed). Avoid negated forms.
+- No nested objects or mixed-type arrays. Homogeneous primitive arrays are allowed ONLY if each element is an atomic
+  scalar signal (use *_array types: string_array, integer_array, float_array, boolean_array). The array is expected to
+  contain 0..N such elements per record.
+- Array field names MUST end with '_array' (e.g. keywords_array, tag_ids_array). Do not use plural-only forms
+  (e.g. keywords) for arrays; the suffix makes container semantics explicit.
 - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
 - enum_values only for string fields with stable closed vocab; omit otherwise.
 - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
   in predictive / feature engineering contexts.
+- When examples already appear as serialized JSON / key-value records, DO NOT merely relist the
+  raw original keys unless each is already an atomic, irreducible analytic signal. Prefer high-signal
+  derived / normalized / aggregated features (e.g. sentiment, category, language_code, boolean flags,
+  normalized_date, count metrics).
+- Superficial renames (adding trivial prefixes/suffixes like _value, _field, new_) are forbidden; a new
+  field name must reflect a semantic transformation.
+- Keep field count focused (typically <= 12) prioritizing reusable analytical / ML features over low-signal
+  restatements.
+- If you retain an original raw key unchanged, its description must justify why it is minimal and cannot
+  be further decomposed without losing analytical value.
 
 Output contract:
 Return exactly an InferredSchema object with JSON keys:
@@ -443,12 +494,21 @@ def _basic_field_list_validation(parsed: InferredSchema) -> None:
         raise ValueError("no fields suggested")
     if len(names) != len(set(names)):
         raise ValueError("duplicate field names detected")
-    allowed = {
+    allowed = {
+        "string",
+        "integer",
+        "float",
+        "boolean",
+        "string_array",
+        "integer_array",
+        "float_array",
+        "boolean_array",
+    }
     for f in parsed.fields:
         if f.type not in allowed:
             raise ValueError(f"unsupported field type: {f.type}")
         if f.enum_values is not None:
             if f.type != "string":
-                raise ValueError(f"enum_values only allowed for string field: {f.name}")
+                raise ValueError(f"enum_values only allowed for plain string field: {f.name}")
             if not (2 <= len(f.enum_values) <= 24):
                 raise ValueError(f"enum_values length out of bounds for field {f.name}")
```
openaivec/_serialize.py
CHANGED
```diff
@@ -4,19 +4,19 @@ This module provides utilities for converting Pydantic BaseModel classes
 to and from JSON schema representations with simplified, maintainable code.
 """
 
-from typing import Any,
+from typing import Any, Literal
 
 from pydantic import BaseModel, Field, create_model
 
 __all__ = []
 
 
-def serialize_base_model(obj:
+def serialize_base_model(obj: type[BaseModel]) -> dict[str, Any]:
     """Serialize a Pydantic BaseModel to JSON schema."""
     return obj.model_json_schema()
 
 
-def dereference_json_schema(json_schema:
+def dereference_json_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
     """Dereference JSON schema by resolving $ref pointers with circular reference protection."""
     model_map = json_schema.get("$defs", {})
 
@@ -61,7 +61,7 @@ def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
 # ============================================================================
 
 
-def _resolve_union_type(union_options:
+def _resolve_union_type(union_options: list[dict[str, Any]]) -> type:
     """Resolve anyOf/oneOf to Union type."""
     union_types = []
     for option in union_options:
@@ -75,12 +75,14 @@ def _resolve_union_type(union_options: List[Dict[str, Any]]) -> Type:
     elif len(union_types) == 2 and type(None) in union_types:
         # Optional type: T | None
         non_none_type = next(t for t in union_types if t is not type(None))
-        return
+        return non_none_type | None  # type: ignore[return-value]
     else:
+        from typing import Union
+
         return Union[tuple(union_types)]  # type: ignore[return-value]
 
 
-def _resolve_basic_type(type_name: str, field_def:
+def _resolve_basic_type(type_name: str, field_def: dict[str, Any]) -> type:
     """Resolve basic JSON schema types to Python types."""
     type_mapping = {
         "string": str,
@@ -101,14 +103,14 @@ def _resolve_basic_type(type_name: str, field_def: Dict[str, Any]) -> Type:
     elif type_name == "array":
         if "items" in field_def:
             inner_type = parse_field(field_def["items"])
-            return
+            return list[inner_type]
         else:
-            return
+            return list[Any]
     else:
         raise ValueError(f"Unsupported type: {type_name}")
 
 
-def parse_field(field_def:
+def parse_field(field_def: dict[str, Any]) -> type:
     """Parse a JSON schema field definition to a Python type.
 
     Simplified version with clear separation of concerns.
@@ -141,17 +143,19 @@ def _create_field_info(description: str | None, default_value: Any, is_required:
     return Field(default=default_value, description=description) if description else Field(default=default_value)
 
 
-def _make_optional_if_needed(field_type:
+def _make_optional_if_needed(field_type: type, is_required: bool, has_default: bool) -> type:
     """Make field type optional if needed."""
     if is_required or has_default:
         return field_type
 
     # Check if already nullable
+    from typing import Union
+
     if hasattr(field_type, "__origin__") and field_type.__origin__ is Union and type(None) in field_type.__args__:
         return field_type
 
     # Make optional
-    return
+    return field_type | None  # type: ignore[return-value]
 
 
 # ============================================================================
@@ -159,7 +163,7 @@ def _make_optional_if_needed(field_type: Type, is_required: bool, has_default: b
 # ============================================================================
 
 
-def _process_enum_field(field_name: str, field_def:
+def _process_enum_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]:  # type: ignore[type-arg]
     """Process enum field with Literal type."""
     enum_values = field_def["enum"]
 
@@ -175,14 +179,14 @@ def _process_enum_field(field_name: str, field_def: Dict[str, Any], is_required:
     has_default = default_value is not None
 
     if not is_required and not has_default:
-        literal_type =
+        literal_type = literal_type | None  # type: ignore[assignment]
         default_value = None
 
     field_info = _create_field_info(description, default_value, is_required)
     return literal_type, field_info  # type: ignore[return-value]
 
 
-def _process_regular_field(field_name: str, field_def:
+def _process_regular_field(field_name: str, field_def: dict[str, Any], is_required: bool) -> tuple[type, Field]:  # type: ignore[type-arg]
     """Process regular (non-enum) field."""
     field_type = parse_field(field_def)
     description = field_def.get("description")
@@ -204,7 +208,7 @@ def _process_regular_field(field_name: str, field_def: Dict[str, Any], is_requir
 # ============================================================================
 
 
-def deserialize_base_model(json_schema:
+def deserialize_base_model(json_schema: dict[str, Any]) -> type[BaseModel]:
     """Deserialize a JSON schema to a Pydantic BaseModel class.
 
     Refactored version with clear separation of concerns and simplified logic.
```
openaivec/_util.py
CHANGED
```diff
@@ -2,8 +2,9 @@ import asyncio
 import functools
 import re
 import time
+from collections.abc import Awaitable, Callable
 from dataclasses import dataclass
-from typing import
+from typing import TypeVar
 
 import numpy as np
 import tiktoken
@@ -36,14 +37,14 @@ def get_exponential_with_cutoff(scale: float) -> float:
 
 
 def backoff(
-    exceptions:
+    exceptions: list[type[Exception]],
     scale: int | None = None,
     max_retries: int | None = None,
 ) -> Callable[..., V]:
     """Decorator implementing exponential back‑off retry logic.
 
     Args:
-        exceptions (
+        exceptions (list[type[Exception]]): List of exception types that trigger a retry.
         scale (int | None): Initial scale parameter for the exponential jitter.
             This scale is used as the mean for the first delay's exponential
             distribution and doubles with each subsequent retry. If ``None``,
@@ -88,14 +89,14 @@ def backoff(
 
 
 def backoff_async(
-    exceptions:
+    exceptions: list[type[Exception]],
     scale: int | None = None,
     max_retries: int | None = None,
 ) -> Callable[..., Awaitable[V]]:
     """Asynchronous version of the backoff decorator.
 
     Args:
-        exceptions (
+        exceptions (list[type[Exception]]): List of exception types that trigger a retry.
         scale (int | None): Initial scale parameter for the exponential jitter.
             This scale is used as the mean for the first delay's exponential
             distribution and doubles with each subsequent retry. If ``None``,
@@ -145,7 +146,7 @@ class TextChunker:
 
     enc: tiktoken.Encoding
 
-    def split(self, original: str, max_tokens: int, sep:
+    def split(self, original: str, max_tokens: int, sep: list[str]) -> list[str]:
         """Token‑aware sentence segmentation.
 
         The text is first split by the given separators, then greedily packed
@@ -154,11 +155,11 @@ class TextChunker:
         Args:
             original (str): Original text to split.
             max_tokens (int): Maximum number of tokens allowed per chunk.
-            sep (
+            sep (list[str]): List of separator patterns used by
                 :pyfunc:`re.split`.
 
         Returns:
-
+            list[str]: List of text chunks respecting the ``max_tokens`` limit.
         """
         sentences = re.split(f"({'|'.join(sep)})", original)
         sentences = [s.strip() for s in sentences if s.strip()]
```