openaivec 0.14.14__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/pandas_ext.py +67 -36
- openaivec/spark.py +3 -3
- {openaivec-0.14.14.dist-info → openaivec-0.15.0.dist-info}/METADATA +8 -6
- {openaivec-0.14.14.dist-info → openaivec-0.15.0.dist-info}/RECORD +6 -6
- {openaivec-0.14.14.dist-info → openaivec-0.15.0.dist-info}/WHEEL +0 -0
- {openaivec-0.14.14.dist-info → openaivec-0.15.0.dist-info}/licenses/LICENSE +0 -0
openaivec/pandas_ext.py
CHANGED
|
@@ -10,29 +10,32 @@ from openaivec import pandas_ext
|
|
|
10
10
|
# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_BASE_URL, AZURE_OPENAI_API_VERSION)
|
|
11
11
|
# No explicit setup needed - clients are automatically created
|
|
12
12
|
|
|
13
|
-
# Option 2:
|
|
13
|
+
# Option 2: Register an existing OpenAI client instance
|
|
14
14
|
client = OpenAI(api_key="your-api-key")
|
|
15
|
-
pandas_ext.use(client)
|
|
15
|
+
pandas_ext.set_client(client)
|
|
16
16
|
|
|
17
|
-
# Option 3:
|
|
17
|
+
# Option 3: Register an Azure OpenAI client instance
|
|
18
18
|
azure_client = AzureOpenAI(
|
|
19
19
|
api_key="your-azure-key",
|
|
20
20
|
base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
21
21
|
api_version="preview"
|
|
22
22
|
)
|
|
23
|
-
pandas_ext.use(azure_client)
|
|
23
|
+
pandas_ext.set_client(azure_client)
|
|
24
24
|
|
|
25
|
-
# Option 4:
|
|
25
|
+
# Option 4: Register an async Azure OpenAI client instance
|
|
26
26
|
async_azure_client = AsyncAzureOpenAI(
|
|
27
27
|
api_key="your-azure-key",
|
|
28
28
|
base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
29
29
|
api_version="preview"
|
|
30
30
|
)
|
|
31
|
-
pandas_ext.use_async(async_azure_client)
|
|
31
|
+
pandas_ext.set_async_client(async_azure_client)
|
|
32
32
|
|
|
33
33
|
# Set up model names (optional, defaults shown)
|
|
34
|
-
pandas_ext.responses_model("gpt-4.1-mini")
|
|
35
|
-
pandas_ext.embeddings_model("text-embedding-3-small")
|
|
34
|
+
pandas_ext.set_responses_model("gpt-4.1-mini")
|
|
35
|
+
pandas_ext.set_embeddings_model("text-embedding-3-small")
|
|
36
|
+
|
|
37
|
+
# Inspect current configuration
|
|
38
|
+
configured_model = pandas_ext.get_responses_model()
|
|
36
39
|
```
|
|
37
40
|
|
|
38
41
|
This module provides `.ai` and `.aio` accessors for pandas Series and DataFrames
|
|
@@ -49,15 +52,6 @@ import numpy as np
|
|
|
49
52
|
import pandas as pd
|
|
50
53
|
import tiktoken
|
|
51
54
|
from openai import AsyncOpenAI, OpenAI
|
|
52
|
-
|
|
53
|
-
from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
|
|
54
|
-
|
|
55
|
-
__all__ = [
|
|
56
|
-
"embeddings_model",
|
|
57
|
-
"responses_model",
|
|
58
|
-
"use",
|
|
59
|
-
"use_async",
|
|
60
|
-
]
|
|
61
55
|
from pydantic import BaseModel
|
|
62
56
|
|
|
63
57
|
from openaivec._embeddings import AsyncBatchEmbeddings, BatchEmbeddings
|
|
@@ -65,13 +59,18 @@ from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat,
|
|
|
65
59
|
from openaivec._provider import CONTAINER, _check_azure_v1_api_url
|
|
66
60
|
from openaivec._proxy import AsyncBatchingMapProxy, BatchingMapProxy
|
|
67
61
|
from openaivec._responses import AsyncBatchResponses, BatchResponses
|
|
62
|
+
from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
|
|
68
63
|
from openaivec.task.table import FillNaResponse, fillna
|
|
69
64
|
|
|
70
65
|
__all__ = [
|
|
71
|
-
"embeddings_model",
|
|
72
|
-
"responses_model",
|
|
73
|
-
"use",
|
|
74
|
-
"use_async",
|
|
66
|
+
"get_async_client",
|
|
67
|
+
"get_client",
|
|
68
|
+
"get_embeddings_model",
|
|
69
|
+
"get_responses_model",
|
|
70
|
+
"set_async_client",
|
|
71
|
+
"set_client",
|
|
72
|
+
"set_embeddings_model",
|
|
73
|
+
"set_responses_model",
|
|
75
74
|
]
|
|
76
75
|
|
|
77
76
|
_LOGGER = logging.getLogger(__name__)
|
|
@@ -95,37 +94,51 @@ def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
|
|
|
95
94
|
T = TypeVar("T") # For pipe function return type
|
|
96
95
|
|
|
97
96
|
|
|
98
|
-
def use(client: OpenAI) -> None:
|
|
99
|
-
"""Register a custom OpenAI
|
|
97
|
+
def set_client(client: OpenAI) -> None:
|
|
98
|
+
"""Register a custom OpenAI-compatible client for pandas helpers.
|
|
100
99
|
|
|
101
100
|
Args:
|
|
102
|
-
client (OpenAI): A pre-configured `openai.OpenAI` or
|
|
103
|
-
`openai.AzureOpenAI` instance.
|
|
104
|
-
The same instance is reused by every helper in this module.
|
|
101
|
+
client (OpenAI): A pre-configured `openai.OpenAI` or
|
|
102
|
+
`openai.AzureOpenAI` instance reused by every helper in this module.
|
|
105
103
|
"""
|
|
106
|
-
# Check Azure v1 API URL if using AzureOpenAI client
|
|
107
104
|
if client.__class__.__name__ == "AzureOpenAI" and hasattr(client, "base_url"):
|
|
108
105
|
_check_azure_v1_api_url(str(client.base_url))
|
|
109
106
|
|
|
110
107
|
CONTAINER.register(OpenAI, lambda: client)
|
|
111
108
|
|
|
112
109
|
|
|
113
|
-
def use_async(client: AsyncOpenAI) -> None:
|
|
114
|
-
"""
|
|
110
|
+
def get_client() -> OpenAI:
|
|
111
|
+
"""Get the currently registered OpenAI-compatible client.
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
OpenAI: The registered `openai.OpenAI` or `openai.AzureOpenAI` instance.
|
|
115
|
+
"""
|
|
116
|
+
return CONTAINER.resolve(OpenAI)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def set_async_client(client: AsyncOpenAI) -> None:
|
|
120
|
+
"""Register a custom asynchronous OpenAI-compatible client.
|
|
115
121
|
|
|
116
122
|
Args:
|
|
117
|
-
client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
|
|
118
|
-
`openai.AsyncAzureOpenAI` instance.
|
|
119
|
-
The same instance is reused by every helper in this module.
|
|
123
|
+
client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
|
|
124
|
+
`openai.AsyncAzureOpenAI` instance reused by every helper in this module.
|
|
120
125
|
"""
|
|
121
|
-
# Check Azure v1 API URL if using AsyncAzureOpenAI client
|
|
122
126
|
if client.__class__.__name__ == "AsyncAzureOpenAI" and hasattr(client, "base_url"):
|
|
123
127
|
_check_azure_v1_api_url(str(client.base_url))
|
|
124
128
|
|
|
125
129
|
CONTAINER.register(AsyncOpenAI, lambda: client)
|
|
126
130
|
|
|
127
131
|
|
|
128
|
-
def responses_model(name: str) -> None:
|
|
132
|
+
def get_async_client() -> AsyncOpenAI:
|
|
133
|
+
"""Get the currently registered asynchronous OpenAI-compatible client.
|
|
134
|
+
|
|
135
|
+
Returns:
|
|
136
|
+
AsyncOpenAI: The registered `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance.
|
|
137
|
+
"""
|
|
138
|
+
return CONTAINER.resolve(AsyncOpenAI)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def set_responses_model(name: str) -> None:
|
|
129
142
|
"""Override the model used for text responses.
|
|
130
143
|
|
|
131
144
|
Args:
|
|
@@ -135,7 +148,16 @@ def responses_model(name: str) -> None:
|
|
|
135
148
|
CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
|
|
136
149
|
|
|
137
150
|
|
|
138
|
-
def embeddings_model(name: str) -> None:
|
|
151
|
+
def get_responses_model() -> str:
|
|
152
|
+
"""Get the currently registered model name for text responses.
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
str: The model name (for example, ``gpt-4.1-mini``).
|
|
156
|
+
"""
|
|
157
|
+
return CONTAINER.resolve(ResponsesModelName).value
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def set_embeddings_model(name: str) -> None:
|
|
139
161
|
"""Override the model used for text embeddings.
|
|
140
162
|
|
|
141
163
|
Args:
|
|
@@ -145,6 +167,15 @@ def embeddings_model(name: str) -> None:
|
|
|
145
167
|
CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
|
|
146
168
|
|
|
147
169
|
|
|
170
|
+
def get_embeddings_model() -> str:
|
|
171
|
+
"""Get the currently registered model name for text embeddings.
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
str: The model name (for example, ``text-embedding-3-small``).
|
|
175
|
+
"""
|
|
176
|
+
return CONTAINER.resolve(EmbeddingsModelName).value
|
|
177
|
+
|
|
178
|
+
|
|
148
179
|
def _extract_value(x, series_name):
|
|
149
180
|
"""Return a homogeneous ``dict`` representation of any Series value.
|
|
150
181
|
|
|
@@ -639,7 +670,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
639
670
|
animals.ai.count_tokens()
|
|
640
671
|
```
|
|
641
672
|
This method uses the `tiktoken` library to count tokens based on the
|
|
642
|
-
model name
|
|
673
|
+
model name configured via `set_responses_model`.
|
|
643
674
|
|
|
644
675
|
Returns:
|
|
645
676
|
pandas.Series: Token counts for each element.
|
openaivec/spark.py
CHANGED
|
@@ -433,7 +433,7 @@ def responses_udf(
|
|
|
433
433
|
|
|
434
434
|
@pandas_udf(returnType=spark_schema) # type: ignore[call-overload]
|
|
435
435
|
def structure_udf(col: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
|
|
436
|
-
pandas_ext.responses_model(_model_name)
|
|
436
|
+
pandas_ext.set_responses_model(_model_name)
|
|
437
437
|
response_format = deserialize_base_model(json_schema_string)
|
|
438
438
|
cache = AsyncBatchingMapProxy[str, response_format](
|
|
439
439
|
batch_size=batch_size,
|
|
@@ -460,7 +460,7 @@ def responses_udf(
|
|
|
460
460
|
|
|
461
461
|
@pandas_udf(returnType=StringType()) # type: ignore[call-overload]
|
|
462
462
|
def string_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
|
|
463
|
-
pandas_ext.responses_model(_model_name)
|
|
463
|
+
pandas_ext.set_responses_model(_model_name)
|
|
464
464
|
cache = AsyncBatchingMapProxy[str, str](
|
|
465
465
|
batch_size=batch_size,
|
|
466
466
|
max_concurrency=max_concurrency,
|
|
@@ -729,7 +729,7 @@ def embeddings_udf(
|
|
|
729
729
|
|
|
730
730
|
@pandas_udf(returnType=ArrayType(FloatType())) # type: ignore[call-overload,misc]
|
|
731
731
|
def _embeddings_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
|
|
732
|
-
pandas_ext.embeddings_model(_model_name)
|
|
732
|
+
pandas_ext.set_embeddings_model(_model_name)
|
|
733
733
|
cache = AsyncBatchingMapProxy[str, np.ndarray](
|
|
734
734
|
batch_size=batch_size,
|
|
735
735
|
max_concurrency=max_concurrency,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openaivec
|
|
3
|
-
Version: 0.14.14
|
|
3
|
+
Version: 0.15.0
|
|
4
4
|
Summary: Generative mutation for tabular calculation
|
|
5
5
|
Project-URL: Homepage, https://microsoft.github.io/openaivec/
|
|
6
6
|
Project-URL: Repository, https://github.com/microsoft/openaivec
|
|
@@ -26,6 +26,8 @@ Description-Content-Type: text/markdown
|
|
|
26
26
|
|
|
27
27
|
# openaivec
|
|
28
28
|
|
|
29
|
+
[Contributor guidelines](AGENTS.md)
|
|
30
|
+
|
|
29
31
|
**Transform your data analysis with AI-powered text processing at scale.**
|
|
30
32
|
|
|
31
33
|
**openaivec** enables data analysts to seamlessly integrate OpenAI's language models into their pandas and Spark workflows. Process thousands of text records with natural language instructions, turning unstructured data into actionable insights with just a few lines of code.
|
|
@@ -187,13 +189,13 @@ os.environ["OPENAI_API_KEY"] = "your-api-key-here"
|
|
|
187
189
|
|
|
188
190
|
# Authentication Option 2: Custom client (optional)
|
|
189
191
|
# from openai import OpenAI, AsyncOpenAI
|
|
190
|
-
# pandas_ext.use(OpenAI())
|
|
192
|
+
# pandas_ext.set_client(OpenAI())
|
|
191
193
|
# For async operations:
|
|
192
|
-
# pandas_ext.use_async(AsyncOpenAI())
|
|
194
|
+
# pandas_ext.set_async_client(AsyncOpenAI())
|
|
193
195
|
|
|
194
196
|
# Configure model (optional - defaults to gpt-4.1-mini)
|
|
195
197
|
# For Azure OpenAI: use your deployment name, for OpenAI: use model name
|
|
196
|
-
pandas_ext.responses_model("gpt-4.1-mini")
|
|
198
|
+
pandas_ext.set_responses_model("gpt-4.1-mini")
|
|
197
199
|
|
|
198
200
|
# Create your data
|
|
199
201
|
df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
|
|
@@ -220,7 +222,7 @@ When using reasoning models (o1-preview, o1-mini, o3-mini, etc.), you must set `
|
|
|
220
222
|
|
|
221
223
|
```python
|
|
222
224
|
# For reasoning models like o1-preview, o1-mini, o3-mini
|
|
223
|
-
pandas_ext.responses_model("o1-mini") # Set your reasoning model
|
|
225
|
+
pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
|
|
224
226
|
|
|
225
227
|
# MUST use temperature=None with reasoning models
|
|
226
228
|
result = df.assign(
|
|
@@ -291,7 +293,7 @@ import pandas as pd
|
|
|
291
293
|
from openaivec import pandas_ext
|
|
292
294
|
|
|
293
295
|
# Setup (same as synchronous version)
|
|
294
|
-
pandas_ext.responses_model("gpt-4.1-mini")
|
|
296
|
+
pandas_ext.set_responses_model("gpt-4.1-mini")
|
|
295
297
|
|
|
296
298
|
df = pd.DataFrame({"text": [
|
|
297
299
|
"This product is amazing!",
|
|
@@ -12,8 +12,8 @@ openaivec/_responses.py,sha256=qBrYv4qblDIs5dRvj9t96r8UfAJmy4ZvtAe6csNZ7oM,20412
|
|
|
12
12
|
openaivec/_schema.py,sha256=iOeR5J_ihZRDZtzmqvOK1ZtInKcx4OnoR38DB3VmmQw,15666
|
|
13
13
|
openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
|
|
14
14
|
openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
|
|
15
|
-
openaivec/pandas_ext.py,sha256=
|
|
16
|
-
openaivec/spark.py,sha256=
|
|
15
|
+
openaivec/pandas_ext.py,sha256=1euz52rwKpUBvWRCKvkDjwCFf_zNYnf60wF5OXHiCqw,86727
|
|
16
|
+
openaivec/spark.py,sha256=8-Hap36D0kcyV8RMA-PyFjZxfAnMfgtcp9gKASRnUwU,34032
|
|
17
17
|
openaivec/task/__init__.py,sha256=RkYIKrcE83M_9Um9cSMkeGzL9kPRAovajfRvr31YxLE,6178
|
|
18
18
|
openaivec/task/customer_support/__init__.py,sha256=KWfGyXPdZyfGdRH17x7hPpJJ1N2EP9PPhZx0fvBAwSI,884
|
|
19
19
|
openaivec/task/customer_support/customer_sentiment.py,sha256=d8spZUtImjePK0xWGvIW98ghbdyOZ0KEZmaUpG8QB7M,7532
|
|
@@ -31,7 +31,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=u-zpqAaQYcr7I3mqMv_CTJXkfxtoLft3
|
|
|
31
31
|
openaivec/task/nlp/translation.py,sha256=kgWj2oN8pUId3vuHTJNx636gB49AGEKXWICA_XJgE_0,6628
|
|
32
32
|
openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
|
|
33
33
|
openaivec/task/table/fillna.py,sha256=zL6m5hGD4kamV7qHETnn__B59wIY540Ks0EzNgUJgdI,6888
|
|
34
|
-
openaivec-0.
|
|
35
|
-
openaivec-0.
|
|
36
|
-
openaivec-0.
|
|
37
|
-
openaivec-0.
|
|
34
|
+
openaivec-0.15.0.dist-info/METADATA,sha256=cVTYsT6TOMij_vagDgsIbo886U24Ys5dkah7ZvdEkdw,28278
|
|
35
|
+
openaivec-0.15.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
36
|
+
openaivec-0.15.0.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
|
|
37
|
+
openaivec-0.15.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|