openaivec 0.14.13__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. openaivec-0.15.0/AGENTS.md +34 -0
  2. {openaivec-0.14.13 → openaivec-0.15.0}/PKG-INFO +8 -6
  3. {openaivec-0.14.13 → openaivec-0.15.0}/README.md +7 -5
  4. openaivec-0.15.0/docs/contributor-guide.md +3 -0
  5. {openaivec-0.14.13 → openaivec-0.15.0}/docs/index.md +4 -4
  6. {openaivec-0.14.13 → openaivec-0.15.0}/mkdocs.yml +1 -0
  7. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/pandas_ext.py +67 -36
  8. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/spark.py +66 -17
  9. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_pandas_ext.py +25 -12
  10. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_provider.py +3 -3
  11. openaivec-0.15.0/uv.lock +2811 -0
  12. openaivec-0.14.13/uv.lock +0 -2692
  13. {openaivec-0.14.13 → openaivec-0.15.0}/.env.example +0 -0
  14. {openaivec-0.14.13 → openaivec-0.15.0}/.github/copilot-instructions.md +0 -0
  15. {openaivec-0.14.13 → openaivec-0.15.0}/.github/workflows/python-mkdocs.yml +0 -0
  16. {openaivec-0.14.13 → openaivec-0.15.0}/.github/workflows/python-package.yml +0 -0
  17. {openaivec-0.14.13 → openaivec-0.15.0}/.github/workflows/python-test.yml +0 -0
  18. {openaivec-0.14.13 → openaivec-0.15.0}/.github/workflows/python-update.yml +0 -0
  19. {openaivec-0.14.13 → openaivec-0.15.0}/.gitignore +0 -0
  20. {openaivec-0.14.13 → openaivec-0.15.0}/CODE_OF_CONDUCT.md +0 -0
  21. {openaivec-0.14.13 → openaivec-0.15.0}/LICENSE +0 -0
  22. {openaivec-0.14.13 → openaivec-0.15.0}/SECURITY.md +0 -0
  23. {openaivec-0.14.13 → openaivec-0.15.0}/SUPPORT.md +0 -0
  24. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/main.md +0 -0
  25. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/pandas_ext.md +0 -0
  26. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/spark.md +0 -0
  27. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/task.md +0 -0
  28. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
  29. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
  30. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
  31. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
  32. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
  33. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
  34. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
  35. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
  36. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
  37. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
  38. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
  39. {openaivec-0.14.13 → openaivec-0.15.0}/docs/api/tasks/nlp/translation.md +0 -0
  40. {openaivec-0.14.13 → openaivec-0.15.0}/docs/robots.txt +0 -0
  41. {openaivec-0.14.13 → openaivec-0.15.0}/pyproject.toml +0 -0
  42. {openaivec-0.14.13 → openaivec-0.15.0}/pytest.ini +0 -0
  43. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/__init__.py +0 -0
  44. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_di.py +0 -0
  45. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_dynamic.py +0 -0
  46. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_embeddings.py +0 -0
  47. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_log.py +0 -0
  48. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_model.py +0 -0
  49. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_optimize.py +0 -0
  50. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_prompt.py +0 -0
  51. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_provider.py +0 -0
  52. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_proxy.py +0 -0
  53. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_responses.py +0 -0
  54. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_schema.py +0 -0
  55. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_serialize.py +0 -0
  56. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/_util.py +0 -0
  57. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/__init__.py +0 -0
  58. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/__init__.py +0 -0
  59. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
  60. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
  61. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
  62. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
  63. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
  64. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
  65. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/__init__.py +0 -0
  66. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
  67. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
  68. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
  69. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
  70. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
  71. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/nlp/translation.py +0 -0
  72. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/table/__init__.py +0 -0
  73. {openaivec-0.14.13 → openaivec-0.15.0}/src/openaivec/task/table/fillna.py +0 -0
  74. {openaivec-0.14.13 → openaivec-0.15.0}/tests/__init__.py +0 -0
  75. {openaivec-0.14.13 → openaivec-0.15.0}/tests/conftest.py +0 -0
  76. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_di.py +0 -0
  77. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_dynamic.py +0 -0
  78. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_embeddings.py +0 -0
  79. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_optimize.py +0 -0
  80. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_prompt.py +0 -0
  81. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_proxy.py +0 -0
  82. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_proxy_suggester.py +0 -0
  83. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_responses.py +0 -0
  84. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_schema.py +0 -0
  85. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_serialize.py +0 -0
  86. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_serialize_pydantic_v2_compliance.py +0 -0
  87. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_spark.py +0 -0
  88. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_task.py +0 -0
  89. {openaivec-0.14.13 → openaivec-0.15.0}/tests/test_util.py +0 -0
@@ -0,0 +1,34 @@
+ # Repository Guidelines
+
+ ## Project Layout
+ - `src/openaivec/`: batching core (`_proxy.py`, `_responses.py`, `_embeddings.py`), integrations (`pandas_ext.py`, `spark.py`), and tasks (`task/`); keep additions beside the APIs they extend.
+ - `tests/`: mirrors the source layout; use common pandas, Spark, and async fixtures.
+ - `docs/` holds MkDocs sources, `site/` generated pages, and `artifacts/` scratch assets kept out of releases.
+
+ ## Core Components & Contracts
+ - Remote work goes through `BatchingMapProxy`/`AsyncBatchingMapProxy`; they dedupe inputs, require same-length outputs, release waiters on failure, and show progress only when `show_progress=True` in notebooks.
+ - `_responses.py` enforces reasoning rules: o1/o3-family models must use `temperature=None`, and structured scenarios pass a Pydantic `response_format`.
+ - Reuse caches from `*_with_cache` or Spark UDF builders per operation and clear them afterward to avoid large payloads.
+
+ ## Development Workflow
+ - `uv sync --all-extras --dev` prepares extras and tooling; iterate with `uv run pytest -m "not slow and not requires_api"` before a full `uv run pytest`.
+ - `uv run ruff check . --fix` enforces style, `uv run pyright` guards API changes, and `uv build` validates the distribution.
+ - Use `uv pip install -e .` only when external tooling requires an editable install.
+
+ ## Coding Standards
+ - Target Python 3.10+, rely on absolute imports, and keep helpers private with leading underscores; public modules publish alphabetical `__all__`, internal ones set `__all__ = []`.
+ - Apply Google-style docstrings with `(type)` Args, Returns/Raises sections, double-backtick literals, and doctest-style `Example:` blocks (`>>>`) when useful.
+ - Async helpers end with `_async`; dataframe accessors use descriptive nouns (`responses`, `extract`); raise narrow exceptions (`ValueError`, `TypeError`).
+
+ ## Testing Guidelines
+ - Pytest discovers `tests/test_*.py`; parametrize to cover pandas vectorization, Spark UDFs, and async pathways.
+ - Mark network tests `@pytest.mark.requires_api`, long jobs `@pytest.mark.slow`, Spark flows `@pytest.mark.spark`; skip gracefully when credentials are missing.
+ - Add regression tests before fixes, assert on structure/length/order rather than verbatim text, and prefer shared fixtures over heavy mocking.
+
+ ## Collaboration
+ - Commits follow `type(scope): summary` (e.g., `fix(pandas): guard empty batch`) and avoid merge commits within feature branches.
+ - Pull requests explain motivation, outline the solution, link issues, list doc updates, and include the latest `uv run pytest` and `uv run ruff check . --fix` output; attach screenshots for doc or tutorial changes.
+
+ ## Environment & Secrets
+ - Export `OPENAI_API_KEY` or the Azure trio (`AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_VERSION`) before running `requires_api` tests; Azure endpoints must end with `/openai/v1/`.
+ - Keep local secrets under `artifacts/`, never commit credentials, and rely on CI-managed secrets when extending automation.
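The testing conventions in the new AGENTS.md map directly onto pytest markers. A minimal sketch of a conforming test, assuming the marker names above (the test name, body, and credential check are illustrative, not taken from the repository):

```python
import os

import pytest


# Hypothetical example of the AGENTS.md marker conventions: network-bound
# tests carry requires_api and skip gracefully when credentials are absent.
@pytest.mark.requires_api
@pytest.mark.slow
def test_responses_roundtrip_live():
    if not os.getenv("OPENAI_API_KEY"):
        pytest.skip("OPENAI_API_KEY not set")
    # Assert on structure/length/order of results, never on verbatim text.
```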
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: openaivec
- Version: 0.14.13
+ Version: 0.15.0
  Summary: Generative mutation for tabular calculation
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -26,6 +26,8 @@ Description-Content-Type: text/markdown
 
  # openaivec
 
+ [Contributor guidelines](AGENTS.md)
+
  **Transform your data analysis with AI-powered text processing at scale.**
 
  **openaivec** enables data analysts to seamlessly integrate OpenAI's language models into their pandas and Spark workflows. Process thousands of text records with natural language instructions, turning unstructured data into actionable insights with just a few lines of code.
@@ -187,13 +189,13 @@ os.environ["OPENAI_API_KEY"] = "your-api-key-here"
 
  # Authentication Option 2: Custom client (optional)
  # from openai import OpenAI, AsyncOpenAI
- # pandas_ext.use(OpenAI())
+ # pandas_ext.set_client(OpenAI())
  # For async operations:
- # pandas_ext.use_async(AsyncOpenAI())
+ # pandas_ext.set_async_client(AsyncOpenAI())
 
  # Configure model (optional - defaults to gpt-4.1-mini)
  # For Azure OpenAI: use your deployment name, for OpenAI: use model name
- pandas_ext.responses_model("gpt-4.1-mini")
+ pandas_ext.set_responses_model("gpt-4.1-mini")
 
  # Create your data
  df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
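Read together, the hunks above (and the identical README.md hunks later in this diff) are a mechanical rename of the configuration helpers. A minimal migration sketch, based solely on the renames shown here:

```python
from openai import AsyncOpenAI, OpenAI

from openaivec import pandas_ext

# 0.14.x: pandas_ext.use(OpenAI())
pandas_ext.set_client(OpenAI())

# 0.14.x: pandas_ext.use_async(AsyncOpenAI())
pandas_ext.set_async_client(AsyncOpenAI())

# 0.14.x: pandas_ext.responses_model("gpt-4.1-mini")
pandas_ext.set_responses_model("gpt-4.1-mini")

# 0.14.x: pandas_ext.embeddings_model("text-embedding-3-small")
pandas_ext.set_embeddings_model("text-embedding-3-small")
```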
@@ -220,7 +222,7 @@ When using reasoning models (o1-preview, o1-mini, o3-mini, etc.), you must set `
 
  ```python
  # For reasoning models like o1-preview, o1-mini, o3-mini
- pandas_ext.responses_model("o1-mini") # Set your reasoning model
+ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
 
  # MUST use temperature=None with reasoning models
  result = df.assign(
@@ -291,7 +293,7 @@ import pandas as pd
  from openaivec import pandas_ext
 
  # Setup (same as synchronous version)
- pandas_ext.responses_model("gpt-4.1-mini")
+ pandas_ext.set_responses_model("gpt-4.1-mini")
 
  df = pd.DataFrame({"text": [
  "This product is amazing!",
@@ -1,5 +1,7 @@
  # openaivec
 
+ [Contributor guidelines](AGENTS.md)
+
  **Transform your data analysis with AI-powered text processing at scale.**
 
  **openaivec** enables data analysts to seamlessly integrate OpenAI's language models into their pandas and Spark workflows. Process thousands of text records with natural language instructions, turning unstructured data into actionable insights with just a few lines of code.
@@ -161,13 +163,13 @@ os.environ["OPENAI_API_KEY"] = "your-api-key-here"
 
  # Authentication Option 2: Custom client (optional)
  # from openai import OpenAI, AsyncOpenAI
- # pandas_ext.use(OpenAI())
+ # pandas_ext.set_client(OpenAI())
  # For async operations:
- # pandas_ext.use_async(AsyncOpenAI())
+ # pandas_ext.set_async_client(AsyncOpenAI())
 
  # Configure model (optional - defaults to gpt-4.1-mini)
  # For Azure OpenAI: use your deployment name, for OpenAI: use model name
- pandas_ext.responses_model("gpt-4.1-mini")
+ pandas_ext.set_responses_model("gpt-4.1-mini")
 
  # Create your data
  df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
@@ -194,7 +196,7 @@ When using reasoning models (o1-preview, o1-mini, o3-mini, etc.), you must set `
 
  ```python
  # For reasoning models like o1-preview, o1-mini, o3-mini
- pandas_ext.responses_model("o1-mini") # Set your reasoning model
+ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
 
  # MUST use temperature=None with reasoning models
  result = df.assign(
@@ -265,7 +267,7 @@ import pandas as pd
  from openaivec import pandas_ext
 
  # Setup (same as synchronous version)
- pandas_ext.responses_model("gpt-4.1-mini")
+ pandas_ext.set_responses_model("gpt-4.1-mini")
 
  df = pd.DataFrame({"text": [
  "This product is amazing!",
@@ -0,0 +1,3 @@
+ # Contributor Guidelines
+
+ Refer to [AGENTS.md](https://github.com/microsoft/openaivec/blob/main/AGENTS.md) in the repository root for the authoritative contributor guide.
@@ -84,11 +84,11 @@ from openaivec import pandas_ext
  from typing import List
 
  # Set OpenAI Client (optional: this is default client if environment "OPENAI_API_KEY" is set)
- pandas_ext.use(OpenAI())
+ pandas_ext.set_client(OpenAI())
 
  # Set models for responses and embeddings(optional: these are default models)
- pandas_ext.responses_model("gpt-4.1-nano")
- pandas_ext.embeddings_model("text-embedding-3-small")
+ pandas_ext.set_responses_model("gpt-4.1-nano")
+ pandas_ext.set_embeddings_model("text-embedding-3-small")
 
 
  fruits: List[str] = ["apple", "banana", "orange", "grape", "kiwi", "mango", "peach", "pear", "pineapple", "strawberry"]
@@ -236,4 +236,4 @@ results = asyncio.run(analyze_feedback())
  ### When to Use Async vs Sync
 
  - **Use `.aio`** for: Large datasets (1000+ rows), time-sensitive processing, concurrent workflows
- - **Use `.ai`** for: Small datasets, interactive analysis, simple one-off operations
+ - **Use `.ai`** for: Small datasets, interactive analysis, simple one-off operations
@@ -52,6 +52,7 @@ nav:
  - Home: index.md
  - PyPI: https://pypi.org/project/openaivec/
  - GitHub: https://github.com/microsoft/openaivec
+ - Contributor Guidelines: contributor-guide.md
  - Examples:
  - Getting Started: examples/pandas.ipynb
  - Intelligent Fill: examples/intelligent_fill.ipynb
@@ -10,29 +10,32 @@ from openaivec import pandas_ext
  # (AZURE_OPENAI_API_KEY, AZURE_OPENAI_BASE_URL, AZURE_OPENAI_API_VERSION)
  # No explicit setup needed - clients are automatically created
 
- # Option 2: Use an existing OpenAI client instance
+ # Option 2: Register an existing OpenAI client instance
  client = OpenAI(api_key="your-api-key")
- pandas_ext.use(client)
+ pandas_ext.set_client(client)
 
- # Option 3: Use an existing Azure OpenAI client instance
+ # Option 3: Register an Azure OpenAI client instance
  azure_client = AzureOpenAI(
  api_key="your-azure-key",
  base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
  api_version="preview"
  )
- pandas_ext.use(azure_client)
+ pandas_ext.set_client(azure_client)
 
- # Option 4: Use async Azure OpenAI client instance
+ # Option 4: Register an async Azure OpenAI client instance
  async_azure_client = AsyncAzureOpenAI(
  api_key="your-azure-key",
  base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
  api_version="preview"
  )
- pandas_ext.use_async(async_azure_client)
+ pandas_ext.set_async_client(async_azure_client)
 
  # Set up model names (optional, defaults shown)
- pandas_ext.responses_model("gpt-4.1-mini")
- pandas_ext.embeddings_model("text-embedding-3-small")
+ pandas_ext.set_responses_model("gpt-4.1-mini")
+ pandas_ext.set_embeddings_model("text-embedding-3-small")
+
+ # Inspect current configuration
+ configured_model = pandas_ext.get_responses_model()
  ```
 
  This module provides `.ai` and `.aio` accessors for pandas Series and DataFrames
@@ -49,15 +52,6 @@ import numpy as np
  import pandas as pd
  import tiktoken
  from openai import AsyncOpenAI, OpenAI
-
- from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
-
- __all__ = [
- "embeddings_model",
- "responses_model",
- "use",
- "use_async",
- ]
  from pydantic import BaseModel
 
  from openaivec._embeddings import AsyncBatchEmbeddings, BatchEmbeddings
@@ -65,13 +59,18 @@ from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat,
  from openaivec._provider import CONTAINER, _check_azure_v1_api_url
  from openaivec._proxy import AsyncBatchingMapProxy, BatchingMapProxy
  from openaivec._responses import AsyncBatchResponses, BatchResponses
+ from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
  from openaivec.task.table import FillNaResponse, fillna
 
  __all__ = [
- "use",
- "use_async",
- "responses_model",
- "embeddings_model",
+ "get_async_client",
+ "get_client",
+ "get_embeddings_model",
+ "get_responses_model",
+ "set_async_client",
+ "set_client",
+ "set_embeddings_model",
+ "set_responses_model",
  ]
 
  _LOGGER = logging.getLogger(__name__)
@@ -95,37 +94,51 @@ def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
  T = TypeVar("T") # For pipe function return type
 
 
- def use(client: OpenAI) -> None:
- """Register a custom OpenAI‑compatible client.
+ def set_client(client: OpenAI) -> None:
+ """Register a custom OpenAI-compatible client for pandas helpers.
 
  Args:
- client (OpenAI): A pre‑configured `openai.OpenAI` or
- `openai.AzureOpenAI` instance.
- The same instance is reused by every helper in this module.
+ client (OpenAI): A pre-configured `openai.OpenAI` or
+ `openai.AzureOpenAI` instance reused by every helper in this module.
  """
- # Check Azure v1 API URL if using AzureOpenAI client
  if client.__class__.__name__ == "AzureOpenAI" and hasattr(client, "base_url"):
  _check_azure_v1_api_url(str(client.base_url))
 
  CONTAINER.register(OpenAI, lambda: client)
 
 
- def use_async(client: AsyncOpenAI) -> None:
- """Register a custom asynchronous OpenAI‑compatible client.
+ def get_client() -> OpenAI:
+ """Get the currently registered OpenAI-compatible client.
+
+ Returns:
+ OpenAI: The registered `openai.OpenAI` or `openai.AzureOpenAI` instance.
+ """
+ return CONTAINER.resolve(OpenAI)
+
+
+ def set_async_client(client: AsyncOpenAI) -> None:
+ """Register a custom asynchronous OpenAI-compatible client.
 
  Args:
- client (AsyncOpenAI): A pre‑configured `openai.AsyncOpenAI` or
- `openai.AsyncAzureOpenAI` instance.
- The same instance is reused by every helper in this module.
+ client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
+ `openai.AsyncAzureOpenAI` instance reused by every helper in this module.
  """
- # Check Azure v1 API URL if using AsyncAzureOpenAI client
  if client.__class__.__name__ == "AsyncAzureOpenAI" and hasattr(client, "base_url"):
  _check_azure_v1_api_url(str(client.base_url))
 
  CONTAINER.register(AsyncOpenAI, lambda: client)
 
 
- def responses_model(name: str) -> None:
+ def get_async_client() -> AsyncOpenAI:
+ """Get the currently registered asynchronous OpenAI-compatible client.
+
+ Returns:
+ AsyncOpenAI: The registered `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance.
+ """
+ return CONTAINER.resolve(AsyncOpenAI)
+
+
+ def set_responses_model(name: str) -> None:
  """Override the model used for text responses.
 
  Args:
@@ -135,7 +148,16 @@ def responses_model(name: str) -> None:
  CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
 
 
- def embeddings_model(name: str) -> None:
+ def get_responses_model() -> str:
+ """Get the currently registered model name for text responses.
+
+ Returns:
+ str: The model name (for example, ``gpt-4.1-mini``).
+ """
+ return CONTAINER.resolve(ResponsesModelName).value
+
+
+ def set_embeddings_model(name: str) -> None:
  """Override the model used for text embeddings.
 
  Args:
@@ -145,6 +167,15 @@ def embeddings_model(name: str) -> None:
  CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
 
 
+ def get_embeddings_model() -> str:
+ """Get the currently registered model name for text embeddings.
+
+ Returns:
+ str: The model name (for example, ``text-embedding-3-small``).
+ """
+ return CONTAINER.resolve(EmbeddingsModelName).value
+
+
  def _extract_value(x, series_name):
  """Return a homogeneous ``dict`` representation of any Series value.
 
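With the getters added above, the pandas_ext configuration becomes round-trippable. A short usage sketch of the new setter/getter pairs (the API key is a placeholder):

```python
from openai import OpenAI

from openaivec import pandas_ext

pandas_ext.set_client(OpenAI(api_key="your-api-key"))  # placeholder key
pandas_ext.set_responses_model("gpt-4.1-mini")
pandas_ext.set_embeddings_model("text-embedding-3-small")

# Each setter now has a matching getter backed by the same DI container.
assert pandas_ext.get_responses_model() == "gpt-4.1-mini"
assert pandas_ext.get_embeddings_model() == "text-embedding-3-small"
client = pandas_ext.get_client()  # returns the registered OpenAI instance
```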
@@ -639,7 +670,7 @@ class OpenAIVecSeriesAccessor:
  animals.ai.count_tokens()
  ```
  This method uses the `tiktoken` library to count tokens based on the
- model name set by `responses_model`.
+ model name configured via `set_responses_model`.
 
  Returns:
  pandas.Series: Token counts for each element.
@@ -193,8 +193,6 @@ def setup(
  CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(responses_model_name))
 
  if embeddings_model_name:
- from openaivec._model import EmbeddingsModelName
-
  CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(embeddings_model_name))
 
  CONTAINER.clear_singletons()
@@ -244,6 +242,50 @@ def setup_azure(
  CONTAINER.clear_singletons()
 
 
+ def set_responses_model(model_name: str):
+ """Set the default model name for response generation in the DI container.
+
+ Args:
+ model_name (str): The model name to set as default for responses.
+ """
+ CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(model_name))
+ CONTAINER.clear_singletons()
+
+
+ def get_responses_model() -> str | None:
+ """Get the default model name for response generation from the DI container.
+
+ Returns:
+ str | None: The default model name for responses, or None if not set.
+ """
+ try:
+ return CONTAINER.resolve(ResponsesModelName).value
+ except Exception:
+ return None
+
+
+ def set_embeddings_model(model_name: str):
+ """Set the default model name for embeddings in the DI container.
+
+ Args:
+ model_name (str): The model name to set as default for embeddings.
+ """
+ CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(model_name))
+ CONTAINER.clear_singletons()
+
+
+ def get_embeddings_model() -> str | None:
+ """Get the default model name for embeddings from the DI container.
+
+ Returns:
+ str | None: The default model name for embeddings, or None if not set.
+ """
+ try:
+ return CONTAINER.resolve(EmbeddingsModelName).value
+ except Exception:
+ return None
+
+
  def _python_type_to_spark(python_type):
  origin = get_origin(python_type)
 
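Note the contract difference from the pandas_ext getters earlier in this diff: the Spark variants catch resolution failures and return None instead of raising, so callers can probe the container before configuring it. A brief usage sketch:

```python
from openaivec import spark

# get_responses_model() returns None (rather than raising) when no model
# has been registered in the DI container yet.
if spark.get_responses_model() is None:
    spark.set_responses_model("gpt-4.1-mini")

# The setters also clear DI singletons, so cached objects are rebuilt with
# the new model name on next use.
```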
@@ -322,7 +364,7 @@ def _safe_dump(x: BaseModel | None) -> dict:
  def responses_udf(
  instructions: str,
  response_format: type[ResponseFormat] = str,
- model_name: str = CONTAINER.resolve(ResponsesModelName).value,
+ model_name: str | None = None,
  batch_size: int | None = None,
  max_concurrency: int = 8,
  **api_kwargs,
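This signature change fixes an eager-default pitfall: the old `CONTAINER.resolve(...)` default was evaluated once at import time, so a later `set_responses_model` call could not affect the UDF. With `None`, resolution moves into the function body (see the `_model_name = model_name or ...` line in a later hunk). A generic illustration of the pitfall, not library code:

```python
# Python evaluates default argument expressions once, at function
# definition time; the None-default idiom defers the lookup to call time.
CONFIG = {"model": "gpt-4.1-mini"}

def udf_eager(model: str = CONFIG["model"]):  # snapshot taken at import
    return model

def udf_lazy(model: str | None = None):  # resolved on every call
    return model or CONFIG["model"]

CONFIG["model"] = "o3-mini"
print(udf_eager())  # gpt-4.1-mini (stale import-time default)
print(udf_lazy())   # o3-mini (sees the updated configuration)
```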
@@ -351,8 +393,9 @@ def responses_udf(
  instructions (str): The system prompt or instructions for the model.
  response_format (type[ResponseFormat]): The desired output format. Either `str` for plain text
  or a Pydantic `BaseModel` for structured JSON output. Defaults to `str`.
- model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
- For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container.
+ model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+ For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+ via ResponsesModelName if not provided.
  batch_size (int | None): Number of rows per async batch request within each partition.
  Larger values reduce API call overhead but increase memory usage.
  Defaults to None (automatic batch size optimization that dynamically
@@ -382,13 +425,15 @@ def responses_udf(
  - Consider your OpenAI tier limits: total_requests = max_concurrency × executors
  - Use Spark UI to optimize partition sizes relative to batch_size
  """
+ _model_name = model_name or CONTAINER.resolve(ResponsesModelName).value
+
  if issubclass(response_format, BaseModel):
  spark_schema = _pydantic_to_spark_schema(response_format)
  json_schema_string = serialize_base_model(response_format)
 
  @pandas_udf(returnType=spark_schema) # type: ignore[call-overload]
  def structure_udf(col: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
- pandas_ext.responses_model(model_name)
+ pandas_ext.set_responses_model(_model_name)
  response_format = deserialize_base_model(json_schema_string)
  cache = AsyncBatchingMapProxy[str, response_format](
  batch_size=batch_size,
@@ -415,7 +460,7 @@ def responses_udf(
 
  @pandas_udf(returnType=StringType()) # type: ignore[call-overload]
  def string_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
- pandas_ext.responses_model(model_name)
+ pandas_ext.set_responses_model(_model_name)
  cache = AsyncBatchingMapProxy[str, str](
  batch_size=batch_size,
  max_concurrency=max_concurrency,
@@ -443,7 +488,7 @@ def responses_udf(
 
  def task_udf(
  task: PreparedTask[ResponseFormat],
- model_name: str = CONTAINER.resolve(ResponsesModelName).value,
+ model_name: str | None = None,
  batch_size: int | None = None,
  max_concurrency: int = 8,
  **api_kwargs,
@@ -459,8 +504,9 @@ def task_udf(
  Args:
  task (PreparedTask): A predefined task configuration containing instructions,
  response format, and API parameters.
- model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
- For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container.
+ model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+ For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+ via ResponsesModelName if not provided.
  batch_size (int | None): Number of rows per async batch request within each partition.
  Larger values reduce API call overhead but increase memory usage.
  Defaults to None (automatic batch size optimization that dynamically
@@ -550,7 +596,7 @@ def parse_udf(
  example_table_name: str | None = None,
  example_field_name: str | None = None,
  max_examples: int = 100,
- model_name: str = CONTAINER.resolve(ResponsesModelName).value,
+ model_name: str | None = None,
  batch_size: int | None = None,
  max_concurrency: int = 8,
  **api_kwargs,
@@ -574,8 +620,9 @@ def parse_udf(
  If provided, `example_table_name` must also be specified.
  max_examples (int): Maximum number of examples to retrieve for schema inference.
  Defaults to 100.
- model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
- For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container.
+ model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+ For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+ via ResponsesModelName if not provided.
  batch_size (int | None): Number of rows per async batch request within each partition.
  Larger values reduce API call overhead but increase memory usage.
  Defaults to None (automatic batch size optimization that dynamically
@@ -622,7 +669,7 @@ def parse_udf(
 
 
  def embeddings_udf(
- model_name: str = CONTAINER.resolve(EmbeddingsModelName).value,
+ model_name: str | None = None,
  batch_size: int | None = None,
  max_concurrency: int = 8,
  **api_kwargs,
@@ -648,9 +695,9 @@ def embeddings_udf(
  sc.environment["AZURE_OPENAI_API_VERSION"] = "preview"
 
  Args:
- model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-embedding-deployment").
+ model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-embedding-deployment").
  For OpenAI, use the model name (e.g., "text-embedding-3-small").
- Defaults to configured model in DI container.
+ Defaults to configured model in DI container via EmbeddingsModelName if not provided.
  batch_size (int | None): Number of rows per async batch request within each partition.
  Larger values reduce API call overhead but increase memory usage.
  Defaults to None (automatic batch size optimization that dynamically
@@ -678,9 +725,11 @@ def embeddings_udf(
  - Use larger batch_size for embeddings compared to response generation
  """
 
+ _model_name = model_name or CONTAINER.resolve(EmbeddingsModelName).value
+
  @pandas_udf(returnType=ArrayType(FloatType())) # type: ignore[call-overload,misc]
  def _embeddings_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
- pandas_ext.embeddings_model(model_name)
+ pandas_ext.set_embeddings_model(_model_name)
  cache = AsyncBatchingMapProxy[str, np.ndarray](
  batch_size=batch_size,
  max_concurrency=max_concurrency,
@@ -15,10 +15,10 @@ class TestPandasExt:
  @pytest.fixture(autouse=True)
  def setup_pandas_ext(self, openai_client, async_openai_client, responses_model_name, embeddings_model_name):
  """Setup pandas_ext with test clients and models."""
- pandas_ext.use(openai_client)
- pandas_ext.use_async(async_openai_client)
- pandas_ext.responses_model(responses_model_name)
- pandas_ext.embeddings_model(embeddings_model_name)
+ pandas_ext.set_client(openai_client)
+ pandas_ext.set_async_client(async_openai_client)
+ pandas_ext.set_responses_model(responses_model_name)
+ pandas_ext.set_embeddings_model(embeddings_model_name)
  yield
 
  # ===== BASIC SERIES METHODS =====
@@ -744,18 +744,31 @@ class TestPandasExt:
 
  # ===== CONFIGURATION & PARAMETER TESTS =====
 
- def test_configuration_methods(self):
- """Test configuration methods use, use_async, responses_model, embeddings_model."""
+ def test_configuration_methods(self, openai_client, async_openai_client):
+ """Test configuration helpers for clients and model names."""
  # Test that configuration methods exist and are callable
- assert callable(pandas_ext.use)
- assert callable(pandas_ext.use_async)
- assert callable(pandas_ext.responses_model)
- assert callable(pandas_ext.embeddings_model)
+ assert callable(pandas_ext.set_client)
+ assert callable(pandas_ext.get_client)
+ assert callable(pandas_ext.set_async_client)
+ assert callable(pandas_ext.get_async_client)
+ assert callable(pandas_ext.set_responses_model)
+ assert callable(pandas_ext.get_responses_model)
+ assert callable(pandas_ext.set_embeddings_model)
+ assert callable(pandas_ext.get_embeddings_model)
 
  # Test model configuration
  try:
- pandas_ext.responses_model("gpt-4.1-mini")
- pandas_ext.embeddings_model("text-embedding-3-small")
+ pandas_ext.set_client(openai_client)
+ assert pandas_ext.get_client() is openai_client
+
+ pandas_ext.set_async_client(async_openai_client)
+ assert pandas_ext.get_async_client() is async_openai_client
+
+ pandas_ext.set_responses_model("gpt-4.1-mini")
+ assert pandas_ext.get_responses_model() == "gpt-4.1-mini"
+
+ pandas_ext.set_embeddings_model("text-embedding-3-small")
+ assert pandas_ext.get_embeddings_model() == "text-embedding-3-small"
  except Exception as e:
  pytest.fail(f"Model configuration failed unexpectedly: {e}")
 
@@ -381,8 +381,8 @@ class TestAzureV1ApiWarning:
  else:
  assert len(w) == 0, f"Unexpected warning for URL: {legacy_url}"
 
- def test_pandas_ext_use_azure_warning(self):
- """Test that pandas_ext.use() shows warning for legacy Azure URLs."""
+ def test_pandas_ext_set_client_azure_warning(self):
+ """Test that pandas_ext.set_client() shows warning for legacy Azure URLs."""
  from openai import AzureOpenAI
 
  from openaivec import pandas_ext
@@ -394,7 +394,7 @@ class TestAzureV1ApiWarning:
 
  with warnings.catch_warnings(record=True) as w:
  warnings.simplefilter("always")
- pandas_ext.use(legacy_client)
+ pandas_ext.set_client(legacy_client)
  assert len(w) > 0, "Expected warning for legacy Azure URL"
  assert "v1 API is recommended" in str(w[0].message)