PyPI - openaivec - Versions diffs - 0.14.10__py3-none-any.whl → 0.14.12__py3-none-any.whl - Mend

openaivec 0.14.10__py3-none-any.whl → 0.14.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

openaivec/_di.py +21 -0
openaivec/_provider.py +8 -29
openaivec/pandas_ext.py +1 -0
openaivec/spark.py +241 -96
{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/METADATA +40 -16
{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/RECORD +8 -8
{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/WHEEL +0 -0
{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/licenses/LICENSE +0 -0

openaivec/_di.py CHANGED Viewed

@@ -303,3 +303,24 @@ class Container:
             self._providers.clear()
             self._instances.clear()
             self._resolving.clear()
+    def clear_singletons(self) -> None:
+        """Clear all cached singleton instances from the container.
+        Removes all cached singleton instances while keeping the registered
+        providers intact. After calling this method, the next resolve call
+        for any service will create a new instance using the provider function.
+        Example:
+            ```python
+            container = Container()
+            container.register(str, lambda: "Hello")
+            instance1 = container.resolve(str)
+            container.clear_singletons()
+            instance2 = container.resolve(str)
+            print(instance1 is instance2)
+            # False - different instances after clearing singletons
+            ```
+        """
+        with self._lock:
+            self._instances.clear()

openaivec/_provider.py CHANGED Viewed

@@ -130,35 +130,9 @@ def provide_async_openai_client() -> AsyncOpenAI:
     )
-CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName("gpt-4.1-mini"))
-CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName("text-embedding-3-small"))
-CONTAINER.register(OpenAIAPIKey, lambda: OpenAIAPIKey(os.getenv("OPENAI_API_KEY")))
-CONTAINER.register(AzureOpenAIAPIKey, lambda: AzureOpenAIAPIKey(os.getenv("AZURE_OPENAI_API_KEY")))
-CONTAINER.register(AzureOpenAIBaseURL, lambda: AzureOpenAIBaseURL(os.getenv("AZURE_OPENAI_BASE_URL")))
-CONTAINER.register(
-    cls=AzureOpenAIAPIVersion,
-    provider=lambda: AzureOpenAIAPIVersion(os.getenv("AZURE_OPENAI_API_VERSION", "preview")),
-)
-CONTAINER.register(OpenAI, provide_openai_client)
-CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
-CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"))
-CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
-CONTAINER.register(
-    SchemaInferer,
-    lambda: SchemaInferer(
-        client=CONTAINER.resolve(OpenAI),
-        model_name=CONTAINER.resolve(ResponsesModelName).value,
-    ),
-)
-def reset_environment_registrations():
-    """Reset environment variable related registrations in the container.
-    This function re-registers environment variable dependent services to pick up
-    current environment variable values. Useful for testing when environment
-    variables are changed after initial container setup.
-    """
+def set_default_registrations():
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName("gpt-4.1-mini"))
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName("text-embedding-3-small"))
     CONTAINER.register(OpenAIAPIKey, lambda: OpenAIAPIKey(os.getenv("OPENAI_API_KEY")))
     CONTAINER.register(AzureOpenAIAPIKey, lambda: AzureOpenAIAPIKey(os.getenv("AZURE_OPENAI_API_KEY")))
     CONTAINER.register(AzureOpenAIBaseURL, lambda: AzureOpenAIBaseURL(os.getenv("AZURE_OPENAI_BASE_URL")))
@@ -168,6 +142,8 @@ def reset_environment_registrations():
     )
     CONTAINER.register(OpenAI, provide_openai_client)
     CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
+    CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"))
+    CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
     CONTAINER.register(
         SchemaInferer,
         lambda: SchemaInferer(
@@ -175,3 +151,6 @@ def reset_environment_registrations():
             model_name=CONTAINER.resolve(ResponsesModelName).value,
         ),
     )
+set_default_registrations()

openaivec/pandas_ext.py CHANGED Viewed

@@ -454,6 +454,7 @@ class OpenAIVecSeriesAccessor:
         """Parse Series values using an LLM with a provided cache.
         This method allows you to parse the Series content into structured data
         using an LLM, optionally inferring a schema based on the provided purpose.
         Args:
             instructions (str): System prompt for the LLM.
             cache (BatchingMapProxy[str, BaseModel]): Explicit cache instance for

openaivec/spark.py CHANGED Viewed

@@ -1,45 +1,45 @@
 """Asynchronous Spark UDFs for the OpenAI and Azure OpenAI APIs.
 This module provides functions (`responses_udf`, `task_udf`, `embeddings_udf`,
-`count_tokens_udf`, `split_to_chunks_udf`)
+`count_tokens_udf`, `split_to_chunks_udf`, `similarity_udf`, `parse_udf`)
 for creating asynchronous Spark UDFs that communicate with either the public
 OpenAI API or Azure OpenAI using the `openaivec.spark` subpackage.
-It supports UDFs for generating responses and creating embeddings asynchronously.
-The UDFs operate on Spark DataFrames and leverage asyncio for potentially
-improved performance in I/O-bound operations.
+It supports UDFs for generating responses, creating embeddings, parsing text,
+and computing similarities asynchronously. The UDFs operate on Spark DataFrames
+and leverage asyncio for improved performance in I/O-bound operations.
-**Performance Optimization**: All AI-powered UDFs (`responses_udf`, `task_udf`, `embeddings_udf`)
+**Performance Optimization**: All AI-powered UDFs (`responses_udf`, `task_udf`, `embeddings_udf`, `parse_udf`)
 automatically cache duplicate inputs within each partition, significantly reducing
 API calls and costs when processing datasets with overlapping content.
-__all__ = [
-    "count_tokens_udf",
-    "embeddings_udf",
-    "responses_udf",
-    "similarity_udf",
-    "split_to_chunks_udf",
-    "task_udf",
-]
 ## Setup
 First, obtain a Spark session and configure authentication:
 ```python
-import os
 from pyspark.sql import SparkSession
+from openaivec.spark import setup, setup_azure
 spark = SparkSession.builder.getOrCreate()
-sc = spark.sparkContext
-# Configure authentication via SparkContext environment variables
 # Option 1: Using OpenAI
-sc.environment["OPENAI_API_KEY"] = "your-openai-api-key"
+setup(
+    spark,
+    api_key="your-openai-api-key",
+    responses_model_name="gpt-4.1-mini",  # Optional: set default model
+    embeddings_model_name="text-embedding-3-small"  # Optional: set default model
+)
 # Option 2: Using Azure OpenAI
-# sc.environment["AZURE_OPENAI_API_KEY"] = "your-azure-openai-api-key"
-# sc.environment["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
-# sc.environment["AZURE_OPENAI_API_VERSION"] = "preview"
+# setup_azure(
+#     spark,
+#     api_key="your-azure-openai-api-key",
+#     base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+#     api_version="preview",
+#     responses_model_name="my-gpt4-deployment",  # Optional: set default deployment
+#     embeddings_model_name="my-embedding-deployment"  # Optional: set default deployment
+# )
 ```
 Next, create UDFs and register them:
@@ -83,9 +83,10 @@ spark.udf.register(
     ),
 )
-# Register token counting and text chunking UDFs
+# Register token counting, text chunking, and similarity UDFs
 spark.udf.register("count_tokens", count_tokens_udf())
 spark.udf.register("split_chunks", split_to_chunks_udf(max_tokens=512, sep=[".", "!", "?"]))
+spark.udf.register("compute_similarity", similarity_udf())
 ```
 You can now invoke the UDFs from Spark SQL:
@@ -97,7 +98,8 @@ SELECT
     sentiment_async(text) AS sentiment,
     embed_async(text) AS embedding,
     count_tokens(text) AS token_count,
-    split_chunks(text) AS chunks
+    split_chunks(text) AS chunks,
+    compute_similarity(embed_async(text1), embed_async(text2)) AS similarity
 FROM your_table;
 ```
@@ -123,6 +125,7 @@ Note: This module provides asynchronous support through the pandas extensions.
 import asyncio
 import logging
+import os
 from collections.abc import Iterator
 from enum import Enum
 from typing import Union, get_args, get_origin
@@ -131,14 +134,17 @@ import numpy as np
 import pandas as pd
 import tiktoken
 from pydantic import BaseModel
+from pyspark.sql import SparkSession
 from pyspark.sql.pandas.functions import pandas_udf
 from pyspark.sql.types import ArrayType, BooleanType, FloatType, IntegerType, StringType, StructField, StructType
 from pyspark.sql.udf import UserDefinedFunction
 from typing_extensions import Literal
 from openaivec import pandas_ext
-from openaivec._model import PreparedTask, ResponseFormat
+from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+from openaivec._provider import CONTAINER
 from openaivec._proxy import AsyncBatchingMapProxy
+from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
 from openaivec._serialize import deserialize_base_model, serialize_base_model
 from openaivec._util import TextChunker
@@ -146,6 +152,8 @@ __all__ = [
     "responses_udf",
     "task_udf",
     "embeddings_udf",
+    "infer_schema",
+    "parse_udf",
     "split_to_chunks_udf",
     "count_tokens_udf",
     "similarity_udf",
@@ -155,6 +163,80 @@ __all__ = [
 _LOGGER: logging.Logger = logging.getLogger(__name__)
+def setup(
+    spark: SparkSession, api_key: str, responses_model_name: str | None = None, embeddings_model_name: str | None = None
+):
+    """Setup OpenAI authentication and default model names in Spark environment.
+    1. Configures OpenAI API key in SparkContext environment.
+    2. Configures OpenAI API key in local process environment.
+    3. Optionally registers default model names for responses and embeddings in the DI container.
+    Args:
+        spark (SparkSession): The Spark session to configure.
+        api_key (str): OpenAI API key for authentication.
+        responses_model_name (str | None): Default model name for response generation.
+            If provided, registers `ResponsesModelName` in the DI container.
+        embeddings_model_name (str | None): Default model name for embeddings.
+            If provided, registers `EmbeddingsModelName` in the DI container.
+    """
+    sc = spark.sparkContext
+    sc.environment["OPENAI_API_KEY"] = api_key
+    os.environ["OPENAI_API_KEY"] = api_key
+    if responses_model_name:
+        CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(responses_model_name))
+    if embeddings_model_name:
+        from openaivec._model import EmbeddingsModelName
+        CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(embeddings_model_name))
+    CONTAINER.clear_singletons()
+def setup_azure(
+    spark: SparkSession,
+    api_key: str,
+    base_url: str,
+    api_version: str = "preview",
+    responses_model_name: str | None = None,
+    embeddings_model_name: str | None = None,
+):
+    """Setup Azure OpenAI authentication and default model names in Spark environment.
+    1. Configures Azure OpenAI API key, base URL, and API version in SparkContext environment.
+    2. Configures Azure OpenAI API key, base URL, and API version in local process environment.
+    3. Optionally registers default model names for responses and embeddings in the DI container.
+    Args:
+        spark (SparkSession): The Spark session to configure.
+        api_key (str): Azure OpenAI API key for authentication.
+        base_url (str): Base URL for the Azure OpenAI resource.
+        api_version (str): API version to use. Defaults to "preview".
+        responses_model_name (str | None): Default model name for response generation.
+            If provided, registers `ResponsesModelName` in the DI container.
+        embeddings_model_name (str | None): Default model name for embeddings.
+            If provided, registers `EmbeddingsModelName` in the DI container.
+    """
+    sc = spark.sparkContext
+    sc.environment["AZURE_OPENAI_API_KEY"] = api_key
+    sc.environment["AZURE_OPENAI_BASE_URL"] = base_url
+    sc.environment["AZURE_OPENAI_API_VERSION"] = api_version
+    os.environ["AZURE_OPENAI_API_KEY"] = api_key
+    os.environ["AZURE_OPENAI_BASE_URL"] = base_url
+    os.environ["AZURE_OPENAI_API_VERSION"] = api_version
+    if responses_model_name:
+        CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(responses_model_name))
+    if embeddings_model_name:
+        CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(embeddings_model_name))
+    CONTAINER.clear_singletons()
 def _python_type_to_spark(python_type):
     origin = get_origin(python_type)
@@ -233,7 +315,7 @@ def _safe_dump(x: BaseModel | None) -> dict:
 def responses_udf(
     instructions: str,
     response_format: type[ResponseFormat] = str,
-    model_name: str = "gpt-4.1-mini",
+    model_name: str = CONTAINER.resolve(ResponsesModelName).value,
     batch_size: int | None = None,
     temperature: float | None = 0.0,
     top_p: float = 1.0,
@@ -265,7 +347,7 @@ def responses_udf(
         response_format (type[ResponseFormat]): The desired output format. Either `str` for plain text
             or a Pydantic `BaseModel` for structured JSON output. Defaults to `str`.
         model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
-            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to "gpt-4.1-mini".
+            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
@@ -363,7 +445,7 @@ def responses_udf(
 def task_udf(
     task: PreparedTask[ResponseFormat],
-    model_name: str = "gpt-4.1-mini",
+    model_name: str = CONTAINER.resolve(ResponsesModelName).value,
     batch_size: int | None = None,
     max_concurrency: int = 8,
     **api_kwargs,
@@ -380,7 +462,7 @@ def task_udf(
         task (PreparedTask): A predefined task configuration containing instructions,
             response format, temperature, and top_p settings.
         model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
-            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to "gpt-4.1-mini".
+            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
@@ -416,78 +498,142 @@ def task_udf(
         **Automatic Caching**: Duplicate inputs within each partition are cached,
         reducing API calls and costs significantly on datasets with repeated content.
     """
-    # Serialize task parameters for Spark serialization compatibility
-    task_instructions = task.instructions
-    task_temperature = task.temperature
-    task_top_p = task.top_p
+    return responses_udf(
+        instructions=task.instructions,
+        response_format=task.response_format,
+        model_name=model_name,
+        batch_size=batch_size,
+        temperature=task.temperature,
+        top_p=task.top_p,
+        max_concurrency=max_concurrency,
+        **api_kwargs,
+    )
+def infer_schema(
+    instructions: str,
+    example_table_name: str,
+    example_field_name: str,
+    max_examples: int = 100,
+) -> InferredSchema:
+    """Infer the schema for a response format based on example data.
-    if issubclass(task.response_format, BaseModel):
-        task_response_format_json = serialize_base_model(task.response_format)
+    This function retrieves examples from a Spark table and infers the schema
+    for the response format using the provided instructions. It is useful when
+    you want to dynamically generate a schema based on existing data.
-        # Deserialize the response format from JSON
-        response_format = deserialize_base_model(task_response_format_json)
-        spark_schema = _pydantic_to_spark_schema(response_format)
+    Args:
+        instructions (str): Instructions for the model to infer the schema.
+        example_table_name (str): Name of the Spark table containing example data.
+        example_field_name (str): Name of the field in the table to use as examples.
+        max_examples (int): Maximum number of examples to retrieve for schema inference.
-        @pandas_udf(returnType=spark_schema)  # type: ignore[call-overload]
-        def task_udf(col: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
-            pandas_ext.responses_model(model_name)
-            cache = AsyncBatchingMapProxy[str, response_format](
-                batch_size=batch_size,
-                max_concurrency=max_concurrency,
-            )
+    Returns:
+        InferredSchema: An object containing the inferred schema and response format.
+    """
-            try:
-                for part in col:
-                    predictions: pd.Series = asyncio.run(
-                        part.aio.responses_with_cache(
-                            instructions=task_instructions,
-                            response_format=response_format,
-                            temperature=task_temperature,
-                            top_p=task_top_p,
-                            cache=cache,
-                            **api_kwargs,
-                        )
-                    )
-                    yield pd.DataFrame(predictions.map(_safe_dump).tolist())
-            finally:
-                asyncio.run(cache.clear())
+    from pyspark.sql import SparkSession
-        return task_udf  # type: ignore[return-value]
+    spark = SparkSession.builder.getOrCreate()
+    examples: list[str] = (
+        spark.table(example_table_name).rdd.map(lambda row: row[example_field_name]).takeSample(False, max_examples)
+    )
-    elif issubclass(task.response_format, str):
+    input = SchemaInferenceInput(
+        purpose=instructions,
+        examples=examples,
+    )
+    inferer = CONTAINER.resolve(SchemaInferer)
+    return inferer.infer_schema(input)
-        @pandas_udf(returnType=StringType())  # type: ignore[call-overload]
-        def task_string_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-            pandas_ext.responses_model(model_name)
-            cache = AsyncBatchingMapProxy[str, str](
-                batch_size=batch_size,
-                max_concurrency=max_concurrency,
-            )
-            try:
-                for part in col:
-                    predictions: pd.Series = asyncio.run(
-                        part.aio.responses_with_cache(
-                            instructions=task_instructions,
-                            response_format=str,
-                            temperature=task_temperature,
-                            top_p=task_top_p,
-                            cache=cache,
-                            **api_kwargs,
-                        )
-                    )
-                    yield predictions.map(_safe_cast_str)
-            finally:
-                asyncio.run(cache.clear())
+def parse_udf(
+    instructions: str,
+    response_format: type[ResponseFormat] | None = None,
+    example_table_name: str | None = None,
+    example_field_name: str | None = None,
+    max_examples: int = 100,
+    model_name: str = CONTAINER.resolve(ResponsesModelName).value,
+    batch_size: int | None = None,
+    temperature: float | None = 0.0,
+    top_p: float = 1.0,
+    max_concurrency: int = 8,
+    **api_kwargs,
+) -> UserDefinedFunction:
+    """Create an asynchronous Spark pandas UDF for parsing responses.
+    This function allows users to create UDFs that parse responses based on
+    provided instructions and either a predefined response format or example data.
+    It supports both structured responses using Pydantic models and plain text responses.
+    Each partition maintains its own cache to eliminate duplicate API calls within
+    the partition, significantly reducing API usage and costs when processing
+    datasets with overlapping content.
-        return task_string_udf  # type: ignore[return-value]
+    Args:
+        instructions (str): The system prompt or instructions for the model.
+        response_format (type[ResponseFormat] | None): The desired output format.
+            Either `str` for plain text or a Pydantic `BaseModel` for structured JSON output.
+            If not provided, the schema will be inferred from example data.
+        example_table_name (str | None): Name of the Spark table containing example data.
+            If provided, `example_field_name` must also be specified.
+        example_field_name (str | None): Name of the field in the table to use as examples.
+            If provided, `example_table_name` must also be specified.
+        max_examples (int): Maximum number of examples to retrieve for schema inference.
+            Defaults to 100.
+        model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container.
+        batch_size (int | None): Number of rows per async batch request within each partition.
+            Larger values reduce API call overhead but increase memory usage.
+            Defaults to None (automatic batch size optimization that dynamically
+            adjusts based on execution time, targeting 30-60 seconds per batch).
+            Set to a positive integer (e.g., 32-128) for fixed batch size
+        temperature (float | None): Sampling temperature (0.0 to 2.0). Defaults to 0.0.
+        top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+        max_concurrency (int): Maximum number of concurrent API requests **PER EXECUTOR**.
+            Total cluster concurrency = max_concurrency × number_of_executors.
+            Higher values increase throughput but may hit OpenAI rate limits.
+            Recommended: 4-12 per executor. Defaults to 8.
+    Additional Keyword Args:
+        Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+        ``seed``, ``max_output_tokens``, etc.) are forwarded verbatim to the underlying API calls.
+        These parameters are applied to all API requests made by the UDF and override any
+        parameters set in the response_format or example data.
+    Returns:
+        UserDefinedFunction: A Spark pandas UDF configured to parse responses asynchronously.
+            Output schema is `StringType` for str response format or a struct derived from
+            the response_format for BaseModel.
+    Raises:
+        ValueError: If neither `response_format` nor `example_table_name` and `example_field_name` are provided.
+    """
-    else:
-        raise ValueError(f"Unsupported response_format in task: {task.response_format}")
+    if not response_format and not (example_field_name and example_table_name):
+        raise ValueError("Either response_format or example_table_name and example_field_name must be provided.")
+    schema: InferredSchema | None = None
+    if not response_format:
+        schema = infer_schema(
+            instructions=instructions,
+            example_table_name=example_table_name,
+            example_field_name=example_field_name,
+            max_examples=max_examples,
+        )
+    return responses_udf(
+        instructions=schema.inference_prompt if schema else instructions,
+        response_format=schema.model if schema else response_format,
+        model_name=model_name,
+        batch_size=batch_size,
+        temperature=temperature,
+        top_p=top_p,
+        max_concurrency=max_concurrency,
+        **api_kwargs,
+    )
 def embeddings_udf(
-    model_name: str = "text-embedding-3-small", batch_size: int | None = None, max_concurrency: int = 8
+    model_name: str = CONTAINER.resolve(EmbeddingsModelName).value,
+    batch_size: int | None = None,
+    max_concurrency: int = 8,
 ) -> UserDefinedFunction:
     """Create an asynchronous Spark pandas UDF for generating embeddings.
@@ -511,7 +657,8 @@ def embeddings_udf(
     Args:
         model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-embedding-deployment").
-            For OpenAI, use the model name (e.g., "text-embedding-3-small"). Defaults to "text-embedding-3-small".
+            For OpenAI, use the model name (e.g., "text-embedding-3-small").
+            Defaults to configured model in DI container.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
@@ -600,17 +747,15 @@ def count_tokens_udf() -> UserDefinedFunction:
 def similarity_udf() -> UserDefinedFunction:
-    @pandas_udf(FloatType())  # type: ignore[call-overload]
-    def fn(a: pd.Series, b: pd.Series) -> pd.Series:
-        """Compute cosine similarity between two vectors.
+    """Create a pandas-UDF that computes cosine similarity between embedding vectors.
-        Args:
-            a: First vector.
-            b: Second vector.
+    Returns:
+        UserDefinedFunction: A Spark pandas UDF that takes two embedding vector columns
+            and returns their cosine similarity as a FloatType column.
+    """
-        Returns:
-            Cosine similarity between the two vectors.
-        """
+    @pandas_udf(FloatType())  # type: ignore[call-overload]
+    def fn(a: pd.Series, b: pd.Series) -> pd.Series:
         # Import pandas_ext to ensure .ai accessor is available in Spark workers
         from openaivec import pandas_ext

{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 0.14.10
+Version: 0.14.12
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -334,26 +334,34 @@ Scale to enterprise datasets with distributed processing:
 First, obtain a Spark session and configure authentication:
 ```python
-import os
 from pyspark.sql import SparkSession
+from openaivec.spark import setup, setup_azure
 spark = SparkSession.builder.getOrCreate()
-sc = spark.sparkContext
-# Configure authentication via SparkContext environment variables
 # Option 1: Using OpenAI
-sc.environment["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY")
+setup(
+    spark,
+    api_key="your-openai-api-key",
+    responses_model_name="gpt-4.1-mini",  # Optional: set default model
+    embeddings_model_name="text-embedding-3-small"  # Optional: set default model
+)
 # Option 2: Using Azure OpenAI
-# sc.environment["AZURE_OPENAI_API_KEY"] = os.environ.get("AZURE_OPENAI_API_KEY")
-# sc.environment["AZURE_OPENAI_BASE_URL"] = os.environ.get("AZURE_OPENAI_BASE_URL")
-# sc.environment["AZURE_OPENAI_API_VERSION"] = os.environ.get("AZURE_OPENAI_API_VERSION")
+# setup_azure(
+#     spark,
+#     api_key="your-azure-openai-api-key",
+#     base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+#     api_version="preview",
+#     responses_model_name="my-gpt4-deployment",  # Optional: set default deployment
+#     embeddings_model_name="my-embedding-deployment"  # Optional: set default deployment
+# )
 ```
 Next, create and register UDFs using the provided functions:
 ```python
-from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf
+from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf, similarity_udf, parse_udf
 from pydantic import BaseModel
 # --- Register Responses UDF (String Output) ---
@@ -387,6 +395,9 @@ spark.udf.register(
 # --- Register Token Counting UDF ---
 spark.udf.register("count_tokens", count_tokens_udf())
+# --- Register Similarity UDF ---
+spark.udf.register("compute_similarity", similarity_udf())
 # --- Register UDFs with Pre-configured Tasks ---
 from openaivec.task import nlp, customer_support
@@ -414,6 +425,17 @@ spark.udf.register(
     )
 )
+# --- Register Parse UDF (Dynamic Schema Inference) ---
+spark.udf.register(
+    "parse_dynamic",
+    parse_udf(
+        instructions="Extract key entities and attributes from the text",
+        example_table_name="sample_texts",  # Infer schema from examples
+        example_field_name="text",
+        max_examples=50
+    )
+)
 ```
 You can now use these UDFs in Spark SQL:
@@ -691,17 +713,19 @@ steps:
    - In the notebook, import and use `openaivec.spark` functions as you normally would. For example:
      ```python
-     import os
-     from openaivec.spark import responses_udf, embeddings_udf
+     from openaivec.spark import setup_azure, responses_udf, embeddings_udf
      # In Microsoft Fabric, spark session is automatically available
      # spark = SparkSession.builder.getOrCreate()
-     sc = spark.sparkContext
      # Configure Azure OpenAI authentication
-     sc.environment["AZURE_OPENAI_API_KEY"] = "<your-api-key>"
-     sc.environment["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
-     sc.environment["AZURE_OPENAI_API_VERSION"] = "preview"
+     setup_azure(
+         spark,
+         api_key="<your-api-key>",
+         base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+         api_version="preview",
+         responses_model_name="my-gpt4-deployment"  # Your Azure deployment name
+     )
      # Register UDFs
      spark.udf.register(

{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/RECORD RENAMED Viewed

@@ -1,19 +1,19 @@
 openaivec/__init__.py,sha256=mXCGNNTjYbmE4CAXGvAs78soxUsoy_mxxnvaCk_CL6Y,361
-openaivec/_di.py,sha256=1MXaBzaH_ZenQnWKQzBY2z-egHwiteMvg7byoUH3ZZI,10658
+openaivec/_di.py,sha256=Cl1ZoNBlQsJL1bpzoMDl08uT9pZFVSlqOdLbS3_MwPE,11462
 openaivec/_dynamic.py,sha256=7ZaC59w2Edemnao57XeZVO4qmSOA-Kus6TchZC3Dd5o,14821
 openaivec/_embeddings.py,sha256=upCjl8m9h1CihP6t7wvIH_vivOAPSgmgooAxIhnUMUw,7449
 openaivec/_log.py,sha256=LHNs6AbJzM4weaRARZFroigxR6D148d7WSIMLk1IhbU,1439
 openaivec/_model.py,sha256=toS2oBubrJa9jrdYy-87Fb2XivjXUlk_8Zn5gKUAcFI,3345
 openaivec/_optimize.py,sha256=3nS8VehbS7iGC1tPDDQh-iAgyKHbVYmMbCRBWM77U_U,3827
 openaivec/_prompt.py,sha256=zLv13q47CKV3jnETUyWAIlnjXFSEMs70c8m0yN7_Hek,20820
-openaivec/_provider.py,sha256=YLrEcb4aWBD1fj0n6PNcJpCtEXK6jkUuRH_WxcLDCuI,7145
+openaivec/_provider.py,sha256=8z8gPYY5-Z7rzDlj_NC6hR__DUqVAH7VLHJn6LalzRg,6158
 openaivec/_proxy.py,sha256=AiGuC1MCFjZCRXCac-pHUI3Np3nf1HIpWY6nC9ZVCFY,29671
 openaivec/_responses.py,sha256=lVJRa_Uc7hQJnYJRgumqwBbu6GToZqsLFS6tIAFO1Fc,24014
 openaivec/_schema.py,sha256=RKjDPqet1TlReYibah0R0NIvCV1VWN5SZxiaBeV0gCY,15492
 openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
 openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
-openaivec/pandas_ext.py,sha256=_MdiZWokius62zI_sTp_nd-33fMNlnRHbyqso0eF_Hw,85406
-openaivec/spark.py,sha256=Dbuhlk8Z89Fwk3fbWp1Ud9uTpfNyfjZOIx8ARJMnQf0,25371
+openaivec/pandas_ext.py,sha256=fjBW_TU4zsew3j7g7x67t9ESCwZ0fIuxbh9bZdOmRA0,85407
+openaivec/spark.py,sha256=V0Gg9b9Q-2ycet33ENAN21aA-GltNj57tWoE2pCZIRQ,32601
 openaivec/task/__init__.py,sha256=lrgoc9UIox7XnxZ96dQRl88a-8QfuZRFBHshxctpMB8,6178
 openaivec/task/customer_support/__init__.py,sha256=KWfGyXPdZyfGdRH17x7hPpJJ1N2EP9PPhZx0fvBAwSI,884
 openaivec/task/customer_support/customer_sentiment.py,sha256=NHIr9nm2d2Bu1MSpxFsM3_w1UuQrQEwnHrClVbhdCUw,7612
@@ -31,7 +31,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=Np-yY0d4Kr5WEjGjq4tNFHDNarBLajJr
 openaivec/task/nlp/translation.py,sha256=VYgiXtr2TL1tbqZkBpyVAy4ahrgd8UO4ZjhIL6xMdkI,6609
 openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
 openaivec/task/table/fillna.py,sha256=g_CpLnLzK1C5rCiVq15L3X0kywJK6CtSrKRYxQFuhn8,6606
-openaivec-0.14.10.dist-info/METADATA,sha256=BXQWevriu4qabbZM1paMO1PV_i8zmFPqiodTMwzeJnQ,27567
-openaivec-0.14.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-openaivec-0.14.10.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
-openaivec-0.14.10.dist-info/RECORD,,
+openaivec-0.14.12.dist-info/METADATA,sha256=GC5evUtog4LhK1XhJXfF-jO9DeyDq7l9Ii8KN1sVIBo,28216
+openaivec-0.14.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+openaivec-0.14.12.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
+openaivec-0.14.12.dist-info/RECORD,,

{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/WHEEL RENAMED Viewed

File without changes

{openaivec-0.14.10.dist-info → openaivec-0.14.12.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

openaivec 0.14.10__py3-none-any.whl → 0.14.12__py3-none-any.whl

openaivec 0.14.10__py3-none-any.whl → 0.14.12__py3-none-any.whl