openaivec 0.12.5__py3-none-any.whl → 1.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. openaivec/__init__.py +13 -4
  2. openaivec/_cache/__init__.py +12 -0
  3. openaivec/_cache/optimize.py +109 -0
  4. openaivec/_cache/proxy.py +806 -0
  5. openaivec/{di.py → _di.py} +36 -12
  6. openaivec/_embeddings.py +203 -0
  7. openaivec/{log.py → _log.py} +2 -2
  8. openaivec/_model.py +113 -0
  9. openaivec/{prompt.py → _prompt.py} +95 -28
  10. openaivec/_provider.py +207 -0
  11. openaivec/_responses.py +511 -0
  12. openaivec/_schema/__init__.py +9 -0
  13. openaivec/_schema/infer.py +340 -0
  14. openaivec/_schema/spec.py +350 -0
  15. openaivec/_serialize.py +234 -0
  16. openaivec/{util.py → _util.py} +25 -85
  17. openaivec/pandas_ext.py +1496 -318
  18. openaivec/spark.py +485 -183
  19. openaivec/task/__init__.py +9 -7
  20. openaivec/task/customer_support/__init__.py +9 -15
  21. openaivec/task/customer_support/customer_sentiment.py +17 -15
  22. openaivec/task/customer_support/inquiry_classification.py +23 -22
  23. openaivec/task/customer_support/inquiry_summary.py +14 -13
  24. openaivec/task/customer_support/intent_analysis.py +21 -19
  25. openaivec/task/customer_support/response_suggestion.py +16 -16
  26. openaivec/task/customer_support/urgency_analysis.py +24 -25
  27. openaivec/task/nlp/__init__.py +4 -4
  28. openaivec/task/nlp/dependency_parsing.py +10 -12
  29. openaivec/task/nlp/keyword_extraction.py +11 -14
  30. openaivec/task/nlp/morphological_analysis.py +12 -14
  31. openaivec/task/nlp/named_entity_recognition.py +16 -18
  32. openaivec/task/nlp/sentiment_analysis.py +14 -11
  33. openaivec/task/nlp/translation.py +6 -9
  34. openaivec/task/table/__init__.py +2 -2
  35. openaivec/task/table/fillna.py +11 -11
  36. openaivec-1.0.10.dist-info/METADATA +399 -0
  37. openaivec-1.0.10.dist-info/RECORD +39 -0
  38. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
  39. openaivec/embeddings.py +0 -172
  40. openaivec/model.py +0 -67
  41. openaivec/provider.py +0 -45
  42. openaivec/responses.py +0 -393
  43. openaivec/serialize.py +0 -225
  44. openaivec-0.12.5.dist-info/METADATA +0 -696
  45. openaivec-0.12.5.dist-info/RECORD +0 -33
  46. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
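Note on the restructuring visible in the file list: the former top-level modules (`embeddings.py`, `model.py`, `provider.py`, `responses.py`, `serialize.py`, plus the renamed `di.py`, `log.py`, `prompt.py`, `util.py`) are removed or made private (underscore-prefixed) in 1.0.10. A minimal sketch of the import impact follows, assuming code that imported those modules directly under 0.12.5; the old import paths are taken from the 0.12.5 `spark.py` shown below, and the public entry points from the 1.0.10 side of the same diff:

```python
# 0.12.5 -- these imports resolved against public top-level modules:
# from openaivec.model import PreparedTask   # openaivec/model.py, removed in 1.0.10
# from openaivec.util import TextChunker     # openaivec/util.py, now openaivec/_util.py (private)

# 1.0.10 -- import through the public surface instead; _model/_serialize/_util
# and friends are implementation details:
from openaivec.spark import setup, responses_udf, embeddings_udf, parse_udf
from openaivec import pandas_ext  # pandas accessor entry point, unchanged
```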
openaivec/spark.py CHANGED
@@ -1,37 +1,51 @@
  """Asynchronous Spark UDFs for the OpenAI and Azure OpenAI APIs.
 
- This module provides functions (`responses_udf`, `task_udf`, `embeddings_udf`)
+ This module provides functions (`responses_udf`, `task_udf`, `embeddings_udf`,
+ `count_tokens_udf`, `split_to_chunks_udf`, `similarity_udf`, `parse_udf`)
  for creating asynchronous Spark UDFs that communicate with either the public
  OpenAI API or Azure OpenAI using the `openaivec.spark` subpackage.
- It supports UDFs for generating responses and creating embeddings asynchronously.
- The UDFs operate on Spark DataFrames and leverage asyncio for potentially
- improved performance in I/O-bound operations.
+ It supports UDFs for generating responses, creating embeddings, parsing text,
+ and computing similarities asynchronously. The UDFs operate on Spark DataFrames
+ and leverage asyncio for improved performance in I/O-bound operations.
+
+ **Performance Optimization**: All AI-powered UDFs (`responses_udf`, `task_udf`, `embeddings_udf`, `parse_udf`)
+ automatically cache duplicate inputs within each partition, significantly reducing
+ API calls and costs when processing datasets with overlapping content.
+
 
  ## Setup
 
  First, obtain a Spark session and configure authentication:
 
  ```python
- import os
  from pyspark.sql import SparkSession
+ from openaivec.spark import setup, setup_azure
 
  spark = SparkSession.builder.getOrCreate()
- sc = spark.sparkContext
 
- # Configure authentication via SparkContext environment variables
  # Option 1: Using OpenAI
- sc.environment["OPENAI_API_KEY"] = "your-openai-api-key"
+ setup(
+     spark,
+     api_key="your-openai-api-key",
+     responses_model_name="gpt-4.1-mini",  # Optional: set default model
+     embeddings_model_name="text-embedding-3-small"  # Optional: set default model
+ )
 
  # Option 2: Using Azure OpenAI
- # sc.environment["AZURE_OPENAI_API_KEY"] = "your-azure-openai-api-key"
- # sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "your-azure-openai-endpoint"
- # sc.environment["AZURE_OPENAI_API_VERSION"] = "your-azure-openai-api-version"
+ # setup_azure(
+ #     spark,
+ #     api_key="your-azure-openai-api-key",
+ #     base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+ #     api_version="preview",
+ #     responses_model_name="my-gpt4-deployment",  # Optional: set default deployment
+ #     embeddings_model_name="my-embedding-deployment"  # Optional: set default deployment
+ # )
  ```
 
  Next, create UDFs and register them:
 
  ```python
- from openaivec.spark import responses_udf, task_udf, embeddings_udf
+ from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf, split_to_chunks_udf
  from pydantic import BaseModel
 
  # Define a Pydantic model for structured responses (optional)
@@ -46,7 +60,7 @@ spark.udf.register(
      responses_udf(
          instructions="Translate the text to multiple languages.",
          response_format=Translation,
-         model_name="gpt-4.1-mini",  # Optional, defaults to gpt-4.1-mini
+         model_name="gpt-4.1-mini",  # For Azure: deployment name, for OpenAI: model name
          batch_size=64,  # Rows per API request within partition
          max_concurrency=8  # Concurrent requests PER EXECUTOR
      ),
@@ -63,11 +77,16 @@ spark.udf.register(
  spark.udf.register(
      "embed_async",
      embeddings_udf(
-         model_name="text-embedding-3-small",  # Optional, defaults to text-embedding-3-small
+         model_name="text-embedding-3-small",  # For Azure: deployment name, for OpenAI: model name
          batch_size=128,  # Larger batches for embeddings
          max_concurrency=8  # Concurrent requests PER EXECUTOR
      ),
  )
+
+ # Register token counting, text chunking, and similarity UDFs
+ spark.udf.register("count_tokens", count_tokens_udf())
+ spark.udf.register("split_chunks", split_to_chunks_udf(max_tokens=512, sep=[".", "!", "?"]))
+ spark.udf.register("compute_similarity", similarity_udf())
  ```
 
  You can now invoke the UDFs from Spark SQL:
@@ -77,7 +96,10 @@ SELECT
      text,
      translate_async(text) AS translation,
      sentiment_async(text) AS sentiment,
-     embed_async(text) AS embedding
+     embed_async(text) AS embedding,
+     count_tokens(text) AS token_count,
+     split_chunks(text) AS chunks,
+     compute_similarity(embed_async(text1), embed_async(text2)) AS similarity
  FROM your_table;
  ```
 
@@ -103,26 +125,38 @@ Note: This module provides asynchronous support through the pandas extensions.
 
  import asyncio
  import logging
+ import os
+ from collections.abc import Iterator
  from enum import Enum
- from typing import Dict, Iterator, List, Optional, Type, Union, get_args, get_origin
+ from typing import Union, get_args, get_origin
 
+ import numpy as np
  import pandas as pd
  import tiktoken
  from pydantic import BaseModel
+ from pyspark import SparkContext
+ from pyspark.sql import SparkSession
  from pyspark.sql.pandas.functions import pandas_udf
  from pyspark.sql.types import ArrayType, BooleanType, FloatType, IntegerType, StringType, StructField, StructType
  from pyspark.sql.udf import UserDefinedFunction
  from typing_extensions import Literal
 
- from . import pandas_ext
- from .model import PreparedTask, ResponseFormat
- from .serialize import deserialize_base_model, serialize_base_model
- from .util import TextChunker
+ from openaivec import pandas_ext
+ from openaivec._cache import AsyncBatchingMapProxy
+ from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+ from openaivec._provider import CONTAINER
+ from openaivec._schema import SchemaInferenceInput, SchemaInferenceOutput, SchemaInferer
+ from openaivec._serialize import deserialize_base_model, serialize_base_model
+ from openaivec._util import TextChunker
 
  __all__ = [
+     "setup",
+     "setup_azure",
      "responses_udf",
      "task_udf",
      "embeddings_udf",
+     "infer_schema",
+     "parse_udf",
      "split_to_chunks_udf",
      "count_tokens_udf",
      "similarity_udf",
@@ -130,21 +164,126 @@ __all__ = [
 
 
  _LOGGER: logging.Logger = logging.getLogger(__name__)
- _TIKTOKEN_ENC: tiktoken.Encoding | None = None
 
 
+ def setup(
+     spark: SparkSession, api_key: str, responses_model_name: str | None = None, embeddings_model_name: str | None = None
+ ):
+     """Setup OpenAI authentication and default model names in Spark environment.
+     1. Configures OpenAI API key in SparkContext environment.
+     2. Configures OpenAI API key in local process environment.
+     3. Optionally registers default model names for responses and embeddings in the DI container.
+
+     Args:
+         spark (SparkSession): The Spark session to configure.
+         api_key (str): OpenAI API key for authentication.
+         responses_model_name (str | None): Default model name for response generation.
+             If provided, registers `ResponsesModelName` in the DI container.
+         embeddings_model_name (str | None): Default model name for embeddings.
+             If provided, registers `EmbeddingsModelName` in the DI container.
+
+     Example:
+         ```python
+         from pyspark.sql import SparkSession
+         from openaivec.spark import setup
+
+         spark = SparkSession.builder.getOrCreate()
+         setup(
+             spark,
+             api_key="sk-***",
+             responses_model_name="gpt-4.1-mini",
+             embeddings_model_name="text-embedding-3-small",
+         )
+         ```
+     """
+
+     CONTAINER.register(SparkSession, lambda: spark)
+     CONTAINER.register(SparkContext, lambda: CONTAINER.resolve(SparkSession).sparkContext)
+
+     sc = CONTAINER.resolve(SparkContext)
+     sc.environment["OPENAI_API_KEY"] = api_key
+
+     os.environ["OPENAI_API_KEY"] = api_key
+
+     if responses_model_name:
+         CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(responses_model_name))
+
+     if embeddings_model_name:
+         CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(embeddings_model_name))
+
+     CONTAINER.clear_singletons()
+
+
+ def setup_azure(
+     spark: SparkSession,
+     api_key: str,
+     base_url: str,
+     api_version: str = "preview",
+     responses_model_name: str | None = None,
+     embeddings_model_name: str | None = None,
+ ):
+     """Setup Azure OpenAI authentication and default model names in Spark environment.
+     1. Configures Azure OpenAI API key, base URL, and API version in SparkContext environment.
+     2. Configures Azure OpenAI API key, base URL, and API version in local process environment.
+     3. Optionally registers default model names for responses and embeddings in the DI container.
+     Args:
+         spark (SparkSession): The Spark session to configure.
+         api_key (str): Azure OpenAI API key for authentication.
+         base_url (str): Base URL for the Azure OpenAI resource.
+         api_version (str): API version to use. Defaults to "preview".
+         responses_model_name (str | None): Default model name for response generation.
+             If provided, registers `ResponsesModelName` in the DI container.
+         embeddings_model_name (str | None): Default model name for embeddings.
+             If provided, registers `EmbeddingsModelName` in the DI container.
+
+     Example:
+         ```python
+         from pyspark.sql import SparkSession
+         from openaivec.spark import setup_azure
+
+         spark = SparkSession.builder.getOrCreate()
+         setup_azure(
+             spark,
+             api_key="azure-key",
+             base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+             api_version="preview",
+             responses_model_name="gpt4-deployment",
+             embeddings_model_name="embedding-deployment",
+         )
+         ```
+     """
+
+     CONTAINER.register(SparkSession, lambda: spark)
+     CONTAINER.register(SparkContext, lambda: CONTAINER.resolve(SparkSession).sparkContext)
+
+     sc = CONTAINER.resolve(SparkContext)
+     sc.environment["AZURE_OPENAI_API_KEY"] = api_key
+     sc.environment["AZURE_OPENAI_BASE_URL"] = base_url
+     sc.environment["AZURE_OPENAI_API_VERSION"] = api_version
+
+     os.environ["AZURE_OPENAI_API_KEY"] = api_key
+     os.environ["AZURE_OPENAI_BASE_URL"] = base_url
+     os.environ["AZURE_OPENAI_API_VERSION"] = api_version
+
+     if responses_model_name:
+         CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(responses_model_name))
+
+     if embeddings_model_name:
+         CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(embeddings_model_name))
+
+     CONTAINER.clear_singletons()
 
 
  def _python_type_to_spark(python_type):
      origin = get_origin(python_type)
 
-     # For list types (e.g., List[int])
-     if origin is list or origin is List:
+     # For list types (e.g., list[int])
+     if origin is list:
          # Retrieve the inner type and recursively convert it
          inner_type = get_args(python_type)[0]
          return ArrayType(_python_type_to_spark(inner_type))
 
-     # For Optional types (Union[..., None])
+     # For Optional types (T | None via Union internally)
      elif origin is Union:
          non_none_args = [arg for arg in get_args(python_type) if arg is not type(None)]
          if len(non_none_args) == 1:
@@ -177,7 +316,7 @@ def _python_type_to_spark(python_type):
      raise ValueError(f"Unsupported type: {python_type}")
 
 
- def _pydantic_to_spark_schema(model: Type[BaseModel]) -> StructType:
+ def _pydantic_to_spark_schema(model: type[BaseModel]) -> StructType:
      fields = []
      for field_name, field in model.model_fields.items():
          field_type = field.annotation
@@ -188,7 +327,7 @@ def _pydantic_to_spark_schema(model: Type[BaseModel]) -> StructType:
      return StructType(fields)
 
 
- def _safe_cast_str(x: Optional[str]) -> Optional[str]:
+ def _safe_cast_str(x: str | None) -> str | None:
      try:
          if x is None:
              return None
@@ -199,7 +338,7 @@ def _safe_cast_str(x: Optional[str]) -> Optional[str]:
          return None
 
 
- def _safe_dump(x: Optional[BaseModel]) -> Dict:
+ def _safe_dump(x: BaseModel | None) -> dict:
      try:
          if x is None:
              return {}
@@ -212,45 +351,52 @@ def _safe_dump(x: Optional[BaseModel]) -> Dict:
 
  def responses_udf(
      instructions: str,
-     response_format: Type[ResponseFormat] = str,
-     model_name: str = "gpt-4.1-mini",
-     batch_size: int = 128,
-     temperature: float = 0.0,
-     top_p: float = 1.0,
+     response_format: type[ResponseFormat] = str,
+     model_name: str | None = None,
+     batch_size: int | None = None,
      max_concurrency: int = 8,
+     **api_kwargs,
  ) -> UserDefinedFunction:
      """Create an asynchronous Spark pandas UDF for generating responses.
 
-     Configures and builds UDFs that leverage `pandas_ext.aio.responses`
+     Configures and builds UDFs that leverage `pandas_ext.aio.responses_with_cache`
      to generate text or structured responses from OpenAI models asynchronously.
+     Each partition maintains its own cache to eliminate duplicate API calls within
+     the partition, significantly reducing API usage and costs when processing
+     datasets with overlapping content.
 
      Note:
          Authentication must be configured via SparkContext environment variables.
          Set the appropriate environment variables on the SparkContext:
-
+
          For OpenAI:
              sc.environment["OPENAI_API_KEY"] = "your-openai-api-key"
-
+
          For Azure OpenAI:
              sc.environment["AZURE_OPENAI_API_KEY"] = "your-azure-openai-api-key"
-             sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "your-azure-openai-endpoint"
-             sc.environment["AZURE_OPENAI_API_VERSION"] = "your-azure-openai-api-version"
+             sc.environment["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
+             sc.environment["AZURE_OPENAI_API_VERSION"] = "preview"
 
      Args:
          instructions (str): The system prompt or instructions for the model.
-         response_format (Type[ResponseFormat]): The desired output format. Either `str` for plain text
+         response_format (type[ResponseFormat]): The desired output format. Either `str` for plain text
              or a Pydantic `BaseModel` for structured JSON output. Defaults to `str`.
-         model_name (str): Deployment name (Azure) or model name (OpenAI) for responses.
-             Defaults to "gpt-4.1-mini".
-         batch_size (int): Number of rows per async batch request within each partition.
+         model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+             For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+             via ResponsesModelName if not provided.
+         batch_size (int | None): Number of rows per async batch request within each partition.
              Larger values reduce API call overhead but increase memory usage.
-             Recommended: 32-128 depending on data complexity. Defaults to 128.
-         temperature (float): Sampling temperature (0.0 to 2.0). Defaults to 0.0.
-         top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+             Defaults to None (automatic batch size optimization that dynamically
+             adjusts based on execution time, targeting 30-60 seconds per batch).
+             Set to a positive integer (e.g., 32-128) for fixed batch size.
          max_concurrency (int): Maximum number of concurrent API requests **PER EXECUTOR**.
              Total cluster concurrency = max_concurrency × number_of_executors.
              Higher values increase throughput but may hit OpenAI rate limits.
              Recommended: 4-12 per executor. Defaults to 8.
+         **api_kwargs: Additional OpenAI API parameters (e.g. ``temperature``, ``top_p``,
+             ``frequency_penalty``, ``presence_penalty``, ``seed``, ``max_output_tokens``, etc.)
+             forwarded verbatim to the underlying API calls. These parameters are applied to
+             all API requests made by the UDF.
 
      Returns:
          UserDefinedFunction: A Spark pandas UDF configured to generate responses asynchronously.
@@ -259,89 +405,130 @@ def responses_udf(
      Raises:
          ValueError: If `response_format` is not `str` or a Pydantic `BaseModel`.
 
+     Example:
+         ```python
+         from pyspark.sql import SparkSession
+         from openaivec.spark import responses_udf, setup
+
+         spark = SparkSession.builder.getOrCreate()
+         setup(spark, api_key="sk-***", responses_model_name="gpt-4.1-mini")
+         udf = responses_udf("Reply with one word.")
+         spark.udf.register("short_answer", udf)
+         df = spark.createDataFrame([("hello",), ("bye",)], ["text"])
+         df.selectExpr("short_answer(text) as reply").show()
+         ```
+
      Note:
          For optimal performance in distributed environments:
+         - **Automatic Caching**: Duplicate inputs within each partition are cached,
+           reducing API calls and costs significantly on datasets with repeated content
          - Monitor OpenAI API rate limits when scaling executor count
          - Consider your OpenAI tier limits: total_requests = max_concurrency × executors
          - Use Spark UI to optimize partition sizes relative to batch_size
      """
+     _model_name = model_name or CONTAINER.resolve(ResponsesModelName).value
+
      if issubclass(response_format, BaseModel):
          spark_schema = _pydantic_to_spark_schema(response_format)
          json_schema_string = serialize_base_model(response_format)
 
-         @pandas_udf(returnType=spark_schema)
+         @pandas_udf(returnType=spark_schema)  # type: ignore[call-overload]
          def structure_udf(col: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
-             pandas_ext.responses_model(model_name)
+             pandas_ext.set_responses_model(_model_name)
+             response_format = deserialize_base_model(json_schema_string)
+             cache = AsyncBatchingMapProxy[str, response_format](
+                 batch_size=batch_size,
+                 max_concurrency=max_concurrency,
+             )
 
-             for part in col:
-                 predictions: pd.Series = asyncio.run(
-                     part.aio.responses(
-                         instructions=instructions,
-                         response_format=deserialize_base_model(json_schema_string),
-                         batch_size=batch_size,
-                         temperature=temperature,
-                         top_p=top_p,
-                         max_concurrency=max_concurrency,
+             try:
+                 for part in col:
+                     predictions: pd.Series = asyncio.run(
+                         part.aio.responses_with_cache(
+                             instructions=instructions,
+                             response_format=response_format,
+                             cache=cache,
+                             **api_kwargs,
+                         )
                      )
-                 )
-                 yield pd.DataFrame(predictions.map(_safe_dump).tolist())
+                     yield pd.DataFrame(predictions.map(_safe_dump).tolist())
+             finally:
+                 asyncio.run(cache.clear())
 
-         return structure_udf
+         return structure_udf  # type: ignore[return-value]
 
      elif issubclass(response_format, str):
 
-         @pandas_udf(returnType=StringType())
+         @pandas_udf(returnType=StringType())  # type: ignore[call-overload]
          def string_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-             pandas_ext.responses_model(model_name)
+             pandas_ext.set_responses_model(_model_name)
+             cache = AsyncBatchingMapProxy[str, str](
+                 batch_size=batch_size,
+                 max_concurrency=max_concurrency,
+             )
 
-             for part in col:
-                 predictions: pd.Series = asyncio.run(
-                     part.aio.responses(
-                         instructions=instructions,
-                         response_format=str,
-                         batch_size=batch_size,
-                         temperature=temperature,
-                         top_p=top_p,
-                         max_concurrency=max_concurrency,
+             try:
+                 for part in col:
+                     predictions: pd.Series = asyncio.run(
+                         part.aio.responses_with_cache(
+                             instructions=instructions,
+                             response_format=str,
+                             cache=cache,
+                             **api_kwargs,
+                         )
                      )
-                 )
-                 yield predictions.map(_safe_cast_str)
+                     yield predictions.map(_safe_cast_str)
+             finally:
+                 asyncio.run(cache.clear())
 
-         return string_udf
+         return string_udf  # type: ignore[return-value]
 
      else:
          raise ValueError(f"Unsupported response_format: {response_format}")
 
 
-
  def task_udf(
-     task: PreparedTask,
-     model_name: str = "gpt-4.1-mini",
-     batch_size: int = 128,
+     task: PreparedTask[ResponseFormat],
+     model_name: str | None = None,
+     batch_size: int | None = None,
      max_concurrency: int = 8,
+     **api_kwargs,
  ) -> UserDefinedFunction:
      """Create an asynchronous Spark pandas UDF from a predefined task.
 
      This function allows users to create UDFs from predefined tasks such as sentiment analysis,
      translation, or other common NLP operations defined in the openaivec.task module.
+     Each partition maintains its own cache to eliminate duplicate API calls within
+     the partition, significantly reducing API usage and costs when processing
+     datasets with overlapping content.
 
      Args:
-         task (PreparedTask): A predefined task configuration containing instructions,
-             response format, temperature, and top_p settings.
-         model_name (str): Deployment name (Azure) or model name (OpenAI) for responses.
-             Defaults to "gpt-4.1-mini".
-         batch_size (int): Number of rows per async batch request within each partition.
+         task (PreparedTask): A predefined task configuration containing instructions
+             and response format.
+         model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+             For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+             via ResponsesModelName if not provided.
+         batch_size (int | None): Number of rows per async batch request within each partition.
              Larger values reduce API call overhead but increase memory usage.
-             Recommended: 32-128 depending on task complexity. Defaults to 128.
+             Defaults to None (automatic batch size optimization that dynamically
+             adjusts based on execution time, targeting 30-60 seconds per batch).
+             Set to a positive integer (e.g., 32-128) for fixed batch size.
          max_concurrency (int): Maximum number of concurrent API requests **PER EXECUTOR**.
              Total cluster concurrency = max_concurrency × number_of_executors.
             Higher values increase throughput but may hit OpenAI rate limits.
             Recommended: 4-12 per executor. Defaults to 8.
 
+     Additional Keyword Args:
+         Arbitrary OpenAI Responses API parameters (e.g. ``temperature``, ``top_p``,
+         ``frequency_penalty``, ``presence_penalty``, ``seed``, ``max_output_tokens``, etc.)
+         are forwarded verbatim to the underlying API calls. These parameters are applied to
+         all API requests made by the UDF.
+
      Returns:
          UserDefinedFunction: A Spark pandas UDF configured to execute the specified task
-             asynchronously. Output schema is StringType for str response format or
-             a struct derived from the task's response format for BaseModel.
+             asynchronously with automatic caching for duplicate inputs within each partition.
+             Output schema is StringType for str response format or a struct derived from
+             the task's response format for BaseModel.
 
      Example:
          ```python
@@ -351,186 +538,301 @@ def task_udf(
 
          spark.udf.register("analyze_sentiment", sentiment_udf)
          ```
+
+     Note:
+         **Automatic Caching**: Duplicate inputs within each partition are cached,
+         reducing API calls and costs significantly on datasets with repeated content.
      """
-     # Serialize task parameters for Spark serialization compatibility
-     task_instructions = task.instructions
-     task_temperature = task.temperature
-     task_top_p = task.top_p
-
-     if issubclass(task.response_format, BaseModel):
-         task_response_format_json = serialize_base_model(task.response_format)
-
-         # Deserialize the response format from JSON
-         response_format = deserialize_base_model(task_response_format_json)
-         spark_schema = _pydantic_to_spark_schema(response_format)
+     return responses_udf(
+         instructions=task.instructions,
+         response_format=task.response_format,
+         model_name=model_name,
+         batch_size=batch_size,
+         max_concurrency=max_concurrency,
+         **api_kwargs,
+     )
 
-         @pandas_udf(returnType=spark_schema)
-         def task_udf(col: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
-             pandas_ext.responses_model(model_name)
 
-             for part in col:
-                 predictions: pd.Series = asyncio.run(
-                     part.aio.responses(
-                         instructions=task_instructions,
-                         response_format=response_format,
-                         batch_size=batch_size,
-                         temperature=task_temperature,
-                         top_p=task_top_p,
-                         max_concurrency=max_concurrency,
-                     )
-                 )
-                 yield pd.DataFrame(predictions.map(_safe_dump).tolist())
+ def infer_schema(
+     instructions: str,
+     example_table_name: str,
+     example_field_name: str,
+     max_examples: int = 100,
+ ) -> SchemaInferenceOutput:
+     """Infer the schema for a response format based on example data.
 
-         return task_udf
-
-     elif issubclass(task.response_format, str):
-
-         @pandas_udf(returnType=StringType())
-         def task_string_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-             pandas_ext.responses_model(model_name)
+     This function retrieves examples from a Spark table and infers the schema
+     for the response format using the provided instructions. It is useful when
+     you want to dynamically generate a schema based on existing data.
 
-             for part in col:
-                 predictions: pd.Series = asyncio.run(
-                     part.aio.responses(
-                         instructions=task_instructions,
-                         response_format=str,
-                         batch_size=batch_size,
-                         temperature=task_temperature,
-                         top_p=task_top_p,
-                         max_concurrency=max_concurrency,
-                     )
-                 )
-                 yield predictions.map(_safe_cast_str)
+     Args:
+         instructions (str): Instructions for the model to infer the schema.
+         example_table_name (str | None): Name of the Spark table containing example data.
+         example_field_name (str | None): Name of the field in the table to use as examples.
+         max_examples (int): Maximum number of examples to retrieve for schema inference.
 
-         return task_string_udf
-
-     else:
-         raise ValueError(f"Unsupported response_format in task: {task.response_format}")
+     Returns:
+         InferredSchema: An object containing the inferred schema and response format.
 
+     Example:
+         ```python
+         from pyspark.sql import SparkSession
+
+         spark = SparkSession.builder.getOrCreate()
+         spark.createDataFrame([("great product",), ("bad service",)], ["text"]).createOrReplaceTempView("examples")
+         infer_schema(
+             instructions="Classify sentiment as positive or negative.",
+             example_table_name="examples",
+             example_field_name="text",
+             max_examples=2,
+         )
+         ```
+     """
 
+     spark = CONTAINER.resolve(SparkSession)
+     examples: list[str] = (
+         spark.table(example_table_name).rdd.map(lambda row: row[example_field_name]).takeSample(False, max_examples)
+     )
 
+     input = SchemaInferenceInput(
+         instructions=instructions,
+         examples=examples,
+     )
+     inferer = CONTAINER.resolve(SchemaInferer)
+     return inferer.infer_schema(input)
+
+
+ def parse_udf(
+     instructions: str,
+     response_format: type[ResponseFormat] | None = None,
+     example_table_name: str | None = None,
+     example_field_name: str | None = None,
+     max_examples: int = 100,
+     model_name: str | None = None,
+     batch_size: int | None = None,
+     max_concurrency: int = 8,
+     **api_kwargs,
+ ) -> UserDefinedFunction:
+     """Create an asynchronous Spark pandas UDF for parsing responses.
+     This function allows users to create UDFs that parse responses based on
+     provided instructions and either a predefined response format or example data.
+     It supports both structured responses using Pydantic models and plain text responses.
+     Each partition maintains its own cache to eliminate duplicate API calls within
+     the partition, significantly reducing API usage and costs when processing
+     datasets with overlapping content.
 
- def embeddings_udf(model_name: str = "text-embedding-3-small", batch_size: int = 128, max_concurrency: int = 8) -> UserDefinedFunction:
+     Args:
+         instructions (str): The system prompt or instructions for the model.
+         response_format (type[ResponseFormat] | None): The desired output format.
+             Either `str` for plain text or a Pydantic `BaseModel` for structured JSON output.
+             If not provided, the schema will be inferred from example data.
+         example_table_name (str | None): Name of the Spark table containing example data.
+             If provided, `example_field_name` must also be specified.
+         example_field_name (str | None): Name of the field in the table to use as examples.
+             If provided, `example_table_name` must also be specified.
+         max_examples (int): Maximum number of examples to retrieve for schema inference.
+             Defaults to 100.
+         model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+             For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+             via ResponsesModelName if not provided.
+         batch_size (int | None): Number of rows per async batch request within each partition.
+             Larger values reduce API call overhead but increase memory usage.
+             Defaults to None (automatic batch size optimization that dynamically
+             adjusts based on execution time, targeting 30-60 seconds per batch).
+             Set to a positive integer (e.g., 32-128) for fixed batch size
+         max_concurrency (int): Maximum number of concurrent API requests **PER EXECUTOR**.
+             Total cluster concurrency = max_concurrency × number_of_executors.
+             Higher values increase throughput but may hit OpenAI rate limits.
+             Recommended: 4-12 per executor. Defaults to 8.
+         **api_kwargs: Additional OpenAI API parameters (e.g. ``temperature``, ``top_p``,
+             ``frequency_penalty``, ``presence_penalty``, ``seed``, ``max_output_tokens``, etc.)
+             forwarded verbatim to the underlying API calls. These parameters are applied to
+             all API requests made by the UDF and override any parameters set in the
+             response_format or example data.
+     Example:
+         ```python
+         from pyspark.sql import SparkSession
+
+         spark = SparkSession.builder.getOrCreate()
+         spark.createDataFrame(
+             [("Order #123 delivered",), ("Order #456 delayed",)],
+             ["body"],
+         ).createOrReplaceTempView("messages")
+         udf = parse_udf(
+             instructions="Extract order id as `order_id` and status as `status`.",
+             example_table_name="messages",
+             example_field_name="body",
+         )
+         spark.udf.register("parse_ticket", udf)
+         spark.sql("SELECT parse_ticket(body) AS parsed FROM messages").show()
+         ```
+     Returns:
+         UserDefinedFunction: A Spark pandas UDF configured to parse responses asynchronously.
+             Output schema is `StringType` for str response format or a struct derived from
+             the response_format for BaseModel.
+     Raises:
+         ValueError: If neither `response_format` nor `example_table_name` and `example_field_name` are provided.
+     """
+
+     if not response_format and not (example_field_name and example_table_name):
+         raise ValueError("Either response_format or example_table_name and example_field_name must be provided.")
+
+     schema: SchemaInferenceOutput | None = None
+
+     if not response_format:
+         schema = infer_schema(
+             instructions=instructions,
+             example_table_name=example_table_name,
+             example_field_name=example_field_name,
+             max_examples=max_examples,
+         )
+
+     return responses_udf(
+         instructions=schema.inference_prompt if schema else instructions,
+         response_format=schema.model if schema else response_format,
+         model_name=model_name,
+         batch_size=batch_size,
+         max_concurrency=max_concurrency,
+         **api_kwargs,
+     )
+
+
+ def embeddings_udf(
+     model_name: str | None = None,
+     batch_size: int | None = None,
+     max_concurrency: int = 8,
+     **api_kwargs,
+ ) -> UserDefinedFunction:
      """Create an asynchronous Spark pandas UDF for generating embeddings.
 
-     Configures and builds UDFs that leverage `pandas_ext.aio.embeddings`
+     Configures and builds UDFs that leverage `pandas_ext.aio.embeddings_with_cache`
      to generate vector embeddings from OpenAI models asynchronously.
+     Each partition maintains its own cache to eliminate duplicate API calls within
+     the partition, significantly reducing API usage and costs when processing
+     datasets with overlapping content.
 
      Note:
          Authentication must be configured via SparkContext environment variables.
         Set the appropriate environment variables on the SparkContext:
-
+
          For OpenAI:
              sc.environment["OPENAI_API_KEY"] = "your-openai-api-key"
-
+
          For Azure OpenAI:
              sc.environment["AZURE_OPENAI_API_KEY"] = "your-azure-openai-api-key"
-             sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "your-azure-openai-endpoint"
-             sc.environment["AZURE_OPENAI_API_VERSION"] = "your-azure-openai-api-version"
+             sc.environment["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
+             sc.environment["AZURE_OPENAI_API_VERSION"] = "preview"
 
      Args:
-         model_name (str): Deployment name (Azure) or model name (OpenAI) for embeddings.
-             Defaults to "text-embedding-3-small".
-         batch_size (int): Number of rows per async batch request within each partition.
+         model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-embedding-deployment").
+             For OpenAI, use the model name (e.g., "text-embedding-3-small").
+             Defaults to configured model in DI container via EmbeddingsModelName if not provided.
+         batch_size (int | None): Number of rows per async batch request within each partition.
              Larger values reduce API call overhead but increase memory usage.
+             Defaults to None (automatic batch size optimization that dynamically
+             adjusts based on execution time, targeting 30-60 seconds per batch).
+             Set to a positive integer (e.g., 64-256) for fixed batch size.
              Embeddings typically handle larger batches efficiently.
-             Recommended: 64-256 depending on text length. Defaults to 128.
          max_concurrency (int): Maximum number of concurrent API requests **PER EXECUTOR**.
             Total cluster concurrency = max_concurrency × number_of_executors.
             Higher values increase throughput but may hit OpenAI rate limits.
             Recommended: 4-12 per executor. Defaults to 8.
+         **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
      Returns:
-         UserDefinedFunction: A Spark pandas UDF configured to generate embeddings asynchronously,
+         UserDefinedFunction: A Spark pandas UDF configured to generate embeddings asynchronously
+             with automatic caching for duplicate inputs within each partition,
             returning an `ArrayType(FloatType())` column.
 
      Note:
         For optimal performance in distributed environments:
+         - **Automatic Caching**: Duplicate inputs within each partition are cached,
+           reducing API calls and costs significantly on datasets with repeated content
         - Monitor OpenAI API rate limits when scaling executor count
         - Consider your OpenAI tier limits: total_requests = max_concurrency × executors
         - Embeddings API typically has higher throughput than chat completions
         - Use larger batch_size for embeddings compared to response generation
      """
-     @pandas_udf(returnType=ArrayType(FloatType()))
+
+     _model_name = model_name or CONTAINER.resolve(EmbeddingsModelName).value
+
+     @pandas_udf(returnType=ArrayType(FloatType()))  # type: ignore[call-overload,misc]
      def _embeddings_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-         pandas_ext.embeddings_model(model_name)
+         pandas_ext.set_embeddings_model(_model_name)
+         cache = AsyncBatchingMapProxy[str, np.ndarray](
+             batch_size=batch_size,
+             max_concurrency=max_concurrency,
+         )
 
-         for part in col:
-             embeddings: pd.Series = asyncio.run(
-                 part.aio.embeddings(batch_size=batch_size, max_concurrency=max_concurrency)
-             )
-             yield embeddings.map(lambda x: x.tolist())
+         try:
+             for part in col:
+                 embeddings: pd.Series = asyncio.run(part.aio.embeddings_with_cache(cache=cache, **api_kwargs))
+                 yield embeddings.map(lambda x: x.tolist())
+         finally:
+             asyncio.run(cache.clear())
 
-     return _embeddings_udf
+     return _embeddings_udf  # type: ignore[return-value]
 
 
- def split_to_chunks_udf(model_name: str, max_tokens: int, sep: List[str]) -> UserDefinedFunction:
+ def split_to_chunks_udf(max_tokens: int, sep: list[str]) -> UserDefinedFunction:
      """Create a pandas‑UDF that splits text into token‑bounded chunks.
 
      Args:
-         model_name (str): Model identifier passed to *tiktoken*.
          max_tokens (int): Maximum tokens allowed per chunk.
-         sep (List[str]): Ordered list of separator strings used by ``TextChunker``.
+         sep (list[str]): Ordered list of separator strings used by ``TextChunker``.
 
      Returns:
          A pandas UDF producing an ``ArrayType(StringType())`` column whose
         values are lists of chunks respecting the ``max_tokens`` limit.
      """
 
-     @pandas_udf(ArrayType(StringType()))
+     @pandas_udf(ArrayType(StringType()))  # type: ignore[call-overload,misc]
      def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-         global _TIKTOKEN_ENC
-         if _TIKTOKEN_ENC is None:
-             _TIKTOKEN_ENC = tiktoken.encoding_for_model(model_name)
-
-         chunker = TextChunker(_TIKTOKEN_ENC)
+         encoding = tiktoken.get_encoding("o200k_base")
+         chunker = TextChunker(encoding)
 
         for part in col:
             yield part.map(lambda x: chunker.split(x, max_tokens=max_tokens, sep=sep) if isinstance(x, str) else [])
 
-     return fn
+     return fn  # type: ignore[return-value]
 
 
- def count_tokens_udf(model_name: str = "gpt-4o") -> UserDefinedFunction:
+ def count_tokens_udf() -> UserDefinedFunction:
      """Create a pandas‑UDF that counts tokens for every string cell.
 
      The UDF uses *tiktoken* to approximate tokenisation and caches the
     resulting ``Encoding`` object per executor.
 
-     Args:
-         model_name (str): Model identifier understood by ``tiktoken``.
-
      Returns:
          A pandas UDF producing an ``IntegerType`` column with token counts.
      """
 
-     @pandas_udf(IntegerType())
+     @pandas_udf(IntegerType())  # type: ignore[call-overload]
      def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-         global _TIKTOKEN_ENC
-         if _TIKTOKEN_ENC is None:
-             _TIKTOKEN_ENC = tiktoken.encoding_for_model(model_name)
+         encoding = tiktoken.get_encoding("o200k_base")
 
         for part in col:
-             yield part.map(lambda x: len(_TIKTOKEN_ENC.encode(x)) if isinstance(x, str) else 0)
+             yield part.map(lambda x: len(encoding.encode(x)) if isinstance(x, str) else 0)
 
-     return fn
+     return fn  # type: ignore[return-value]
 
 
  def similarity_udf() -> UserDefinedFunction:
-     @pandas_udf(FloatType())
-     def fn(a: pd.Series, b: pd.Series) -> pd.Series:
-         """Compute cosine similarity between two vectors.
+     """Create a pandas-UDF that computes cosine similarity between embedding vectors.
 
-         Args:
-             a: First vector.
-             b: Second vector.
+     Returns:
+         UserDefinedFunction: A Spark pandas UDF that takes two embedding vector columns
+             and returns their cosine similarity as a FloatType column.
+     """
 
-         Returns:
-             Cosine similarity between the two vectors.
-         """
+     @pandas_udf(FloatType())  # type: ignore[call-overload]
+     def fn(a: pd.Series, b: pd.Series) -> pd.Series:
         # Import pandas_ext to ensure .ai accessor is available in Spark workers
-         from . import pandas_ext  # noqa: F401
+         from openaivec import pandas_ext
+
+         # Explicitly reference pandas_ext to satisfy linters
+         assert pandas_ext is not None
 
         return pd.DataFrame({"a": a, "b": b}).ai.similarity("a", "b")
 
-     return fn
+     return fn  # type: ignore[return-value]
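
To close, a hedged sketch of the `parse_udf` path the diff adds but does not exemplify: when an explicit Pydantic `response_format` is passed, `parse_udf` skips schema inference and forwards directly to `responses_udf` (per its body above), so `example_table_name`/`example_field_name` are not needed. The `Order` model, the `parse_order` registration name, and the API key/model values below are placeholders, not part of the package:

```python
from pydantic import BaseModel
from pyspark.sql import SparkSession

from openaivec.spark import parse_udf, setup


class Order(BaseModel):  # hypothetical response format for illustration
    order_id: str
    status: str


spark = SparkSession.builder.getOrCreate()
setup(spark, api_key="sk-***", responses_model_name="gpt-4.1-mini")  # placeholder key/model

# With an explicit response_format, no example table is read and no schema is inferred.
spark.udf.register(
    "parse_order",
    parse_udf(
        instructions="Extract order id as `order_id` and status as `status`.",
        response_format=Order,
    ),
)
spark.sql("SELECT parse_order('Order #123 delivered') AS parsed").show()
```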