openaivec 0.12.5__py3-none-any.whl → 1.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/__init__.py +13 -4
- openaivec/_cache/__init__.py +12 -0
- openaivec/_cache/optimize.py +109 -0
- openaivec/_cache/proxy.py +806 -0
- openaivec/{di.py → _di.py} +36 -12
- openaivec/_embeddings.py +203 -0
- openaivec/{log.py → _log.py} +2 -2
- openaivec/_model.py +113 -0
- openaivec/{prompt.py → _prompt.py} +95 -28
- openaivec/_provider.py +207 -0
- openaivec/_responses.py +511 -0
- openaivec/_schema/__init__.py +9 -0
- openaivec/_schema/infer.py +340 -0
- openaivec/_schema/spec.py +350 -0
- openaivec/_serialize.py +234 -0
- openaivec/{util.py → _util.py} +25 -85
- openaivec/pandas_ext.py +1496 -318
- openaivec/spark.py +485 -183
- openaivec/task/__init__.py +9 -7
- openaivec/task/customer_support/__init__.py +9 -15
- openaivec/task/customer_support/customer_sentiment.py +17 -15
- openaivec/task/customer_support/inquiry_classification.py +23 -22
- openaivec/task/customer_support/inquiry_summary.py +14 -13
- openaivec/task/customer_support/intent_analysis.py +21 -19
- openaivec/task/customer_support/response_suggestion.py +16 -16
- openaivec/task/customer_support/urgency_analysis.py +24 -25
- openaivec/task/nlp/__init__.py +4 -4
- openaivec/task/nlp/dependency_parsing.py +10 -12
- openaivec/task/nlp/keyword_extraction.py +11 -14
- openaivec/task/nlp/morphological_analysis.py +12 -14
- openaivec/task/nlp/named_entity_recognition.py +16 -18
- openaivec/task/nlp/sentiment_analysis.py +14 -11
- openaivec/task/nlp/translation.py +6 -9
- openaivec/task/table/__init__.py +2 -2
- openaivec/task/table/fillna.py +11 -11
- openaivec-1.0.10.dist-info/METADATA +399 -0
- openaivec-1.0.10.dist-info/RECORD +39 -0
- {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
- openaivec/embeddings.py +0 -172
- openaivec/model.py +0 -67
- openaivec/provider.py +0 -45
- openaivec/responses.py +0 -393
- openaivec/serialize.py +0 -225
- openaivec-0.12.5.dist-info/METADATA +0 -696
- openaivec-0.12.5.dist-info/RECORD +0 -33
- {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
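
The pandas_ext diff below replaces the 0.12.x module-level registration helpers (the removed setters are truncated in the diff; only `use_async` is visible by name) with explicit `set_*`/`get_*` functions backed by a DI container. A minimal sketch of the new 1.x configuration flow, assembled only from names visible in the diff; the API key is a placeholder:

```python
import pandas as pd
from openai import OpenAI
from openaivec import pandas_ext  # registers the .ai / .aio accessors

# New in 1.x: register a client and model explicitly
# (replaces the removed 0.12.x module-level setters).
client = OpenAI(api_key="your-api-key")  # placeholder key
pandas_ext.set_client(client)
pandas_ext.set_responses_model("gpt-4.1-mini")

animals = pd.Series(["cat", "dog", "elephant"])
french = animals.ai.responses("translate to French")  # one LLM call per element
```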
openaivec/pandas_ext.py
CHANGED
@@ -7,28 +7,35 @@ from openaivec import pandas_ext
 
 # Option 1: Use environment variables (automatic detection)
 # Set OPENAI_API_KEY or Azure OpenAI environment variables
-# (AZURE_OPENAI_API_KEY,
+# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_BASE_URL, AZURE_OPENAI_API_VERSION)
 # No explicit setup needed - clients are automatically created
 
-# Option 2:
+# Option 2: Register an existing OpenAI client instance
 client = OpenAI(api_key="your-api-key")
-pandas_ext.
+pandas_ext.set_client(client)
 
-# Option 3:
+# Option 3: Register an Azure OpenAI client instance
 azure_client = AzureOpenAI(
     api_key="your-azure-key",
-
-    api_version="
+    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+    api_version="preview"
 )
-pandas_ext.
+pandas_ext.set_client(azure_client)
 
-# Option 4:
-
-
+# Option 4: Register an async Azure OpenAI client instance
+async_azure_client = AsyncAzureOpenAI(
+    api_key="your-azure-key",
+    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+    api_version="preview"
+)
+pandas_ext.set_async_client(async_azure_client)
 
 # Set up model names (optional, defaults shown)
-pandas_ext.
-pandas_ext.
+pandas_ext.set_responses_model("gpt-4.1-mini")
+pandas_ext.set_embeddings_model("text-embedding-3-small")
+
+# Inspect current configuration
+configured_model = pandas_ext.get_responses_model()
 ```
 
 This module provides `.ai` and `.aio` accessors for pandas Series and DataFrames
@@ -38,7 +45,8 @@ to easily interact with OpenAI APIs for tasks like generating responses or embed
 import inspect
 import json
 import logging
-from
+from collections.abc import Awaitable, Callable
+from typing import TypeVar
 
 import numpy as np
 import pandas as pd
@@ -46,87 +54,126 @@ import tiktoken
 from openai import AsyncOpenAI, OpenAI
 from pydantic import BaseModel
 
-from .
-from .
-from .
-from .
-from .
-from .
+from openaivec._cache import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec._embeddings import AsyncBatchEmbeddings, BatchEmbeddings
+from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+from openaivec._provider import CONTAINER, _check_azure_v1_api_url
+from openaivec._responses import AsyncBatchResponses, BatchResponses
+from openaivec._schema import SchemaInferenceInput, SchemaInferenceOutput, SchemaInferer
+from openaivec.task.table import FillNaResponse, fillna
 
 __all__ = [
-    "
-    "
-    "
-    "
+    "get_async_client",
+    "get_client",
+    "get_embeddings_model",
+    "get_responses_model",
+    "set_async_client",
+    "set_client",
+    "set_embeddings_model",
+    "set_responses_model",
 ]
 
 _LOGGER = logging.getLogger(__name__)
 
 
+# ---------------------------------------------------------------------------
+# Internal helpers (not exported)
+# ---------------------------------------------------------------------------
+def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
+    """Return a Series of JSON strings (UTF-8, no ASCII escaping) representing DataFrame rows.
+
+    Each element is the JSON serialisation of the corresponding row as a dict. Index and
+    name are preserved so downstream operations retain alignment. This consolidates the
+    previously duplicated inline pipeline used by responses*/task* DataFrame helpers.
+    """
+    return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+        lambda x: json.dumps(x, ensure_ascii=False)
+    )
+
+
 T = TypeVar("T")  # For pipe function return type
 
-_DI = Container()
-_DI.register(OpenAI, provide_openai_client)
-_DI.register(AsyncOpenAI, provide_async_openai_client)
-_DI.register(ResponsesModelName, lambda: ResponsesModelName("gpt-4.1-mini"))
-_DI.register(EmbeddingsModelName, lambda: EmbeddingsModelName("text-embedding-3-small"))
 
+def set_client(client: OpenAI) -> None:
+    """Register a custom OpenAI-compatible client for pandas helpers.
 
-
-
-
-
-
-
-        "The model name '%s' is not supported by tiktoken. Using 'o200k_base' encoding instead.",
-        model_name,
-    )
-    return tiktoken.get_encoding("o200k_base")
+    Args:
+        client (OpenAI): A pre-configured `openai.OpenAI` or
+            `openai.AzureOpenAI` instance reused by every helper in this module.
+    """
+    if client.__class__.__name__ == "AzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(OpenAI, lambda: client)
 
-_DI.register(tiktoken.Encoding, _provide_tiktoken_encoding)
 
+def get_client() -> OpenAI:
+    """Get the currently registered OpenAI-compatible client.
 
-
-
+    Returns:
+        OpenAI: The registered `openai.OpenAI` or `openai.AzureOpenAI` instance.
+    """
+    return CONTAINER.resolve(OpenAI)
+
+
+def set_async_client(client: AsyncOpenAI) -> None:
+    """Register a custom asynchronous OpenAI-compatible client.
 
     Args:
-        client (
-            `openai.
-            The same instance is reused by every helper in this module.
+        client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
+            `openai.AsyncAzureOpenAI` instance reused by every helper in this module.
     """
-
+    if client.__class__.__name__ == "AsyncAzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(AsyncOpenAI, lambda: client)
 
-def use_async(client: AsyncOpenAI) -> None:
-    """Register a custom asynchronous OpenAI‑compatible client.
 
-
-
-
-
+def get_async_client() -> AsyncOpenAI:
+    """Get the currently registered asynchronous OpenAI-compatible client.
+
+    Returns:
+        AsyncOpenAI: The registered `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance.
     """
-
+    return CONTAINER.resolve(AsyncOpenAI)
 
 
-def
+def set_responses_model(name: str) -> None:
     """Override the model used for text responses.
 
     Args:
-        name (str):
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
            (for example, ``gpt-4.1-mini``).
     """
-
-
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
+
+
+def get_responses_model() -> str:
+    """Get the currently registered model name for text responses.
+
+    Returns:
+        str: The model name (for example, ``gpt-4.1-mini``).
+    """
+    return CONTAINER.resolve(ResponsesModelName).value
 
 
-def
+def set_embeddings_model(name: str) -> None:
     """Override the model used for text embeddings.
 
     Args:
-        name (str):
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name,
+            e.g. ``text-embedding-3-small``.
     """
-
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
+
+
+def get_embeddings_model() -> str:
+    """Get the currently registered model name for text embeddings.
+
+    Returns:
+        str: The model name (for example, ``text-embedding-3-small``).
+    """
+    return CONTAINER.resolve(EmbeddingsModelName).value
 
 
 def _extract_value(x, series_name):
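
The new `_df_rows_to_json_series` helper above is self-contained, so its behavior can be restated outside the library. A small runnable sketch of what the DataFrame-level helpers now feed to the LLM per row:

```python
import json

import pandas as pd

# Each row becomes one JSON string; the index is preserved so results can be
# aligned back to the original rows (mirrors _df_rows_to_json_series above).
df = pd.DataFrame([{"name": "cat", "legs": 4}], index=[10])
records = pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
    lambda x: json.dumps(x, ensure_ascii=False)
)
print(records.loc[10])  # {"name": "cat", "legs": 4}
```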
@@ -160,124 +207,463 @@ class OpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Arbitrary OpenAI Responses API parameters (e.g. ``temperature``,
+                ``top_p``, ``frequency_penalty``, ``presence_penalty``, ``seed``, etc.) are
+                forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
     def responses(
         self,
         instructions: str,
-        response_format:
-        batch_size: int =
-
-
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element.
 
         Example:
            ```python
            animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
            animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+
+            # With custom temperature
+            animals.ai.responses(
+                "translate creatively",
+                temperature=0.8
+            )
            ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
            instructions (str): System prompt prepended to every user message.
-            response_format (
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
-
-
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
            pandas.Series: Series whose values are instances of ``response_format``.
        """
-
-
-
-            system_message=instructions,
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
            response_format=response_format,
-
-
+            **api_kwargs,
+        )
+
+    def embeddings_with_cache(
+        self,
+        cache: BatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Example:
+            ```python
+            from openaivec._cache import BatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy[str, np.ndarray](batch_size=64)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional keyword arguments to pass to the OpenAI API.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+        """
+        client: BatchEmbeddings = BatchEmbeddings(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+            api_kwargs=api_kwargs,
        )
 
        return pd.Series(
-            client.
+            client.create(self._obj.tolist()),
            index=self._obj.index,
            name=self._obj.name,
        )
 
-    def
-        """
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = True, **api_kwargs) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element.
+
+        Example:
+            ```python
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
+            animals.ai.embeddings()
 
-
-
-
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+        """
+        return self.embeddings_with_cache(
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
+        )
+
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache.
+
+        This mirrors ``responses_with_cache`` but uses the task's stored instructions
+        and response format. A supplied ``BatchingMapProxy`` enables cross‑operation
+        deduplicated reuse and external batch size / progress control.
+
+        Example:
+            ```python
+            from openaivec._cache import BatchingMapProxy
+            shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys (``model``, system instructions, user input) are managed
+            internally and cannot be overridden.
+
+        Returns:
+            pandas.Series: Task results aligned with the original Series index.
+        """
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element.
 
         Example:
            ```python
-            from openaivec.
+            from openaivec._model import PreparedTask
 
            # Assume you have a prepared task for sentiment analysis
            sentiment_task = PreparedTask(...)
 
            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Basic usage
            results = reviews.ai.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = large_reviews.ai.task(
+                sentiment_task,
+                batch_size=50,
+                show_progress=True
+            )
            ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
            task (PreparedTask): A pre-configured task containing instructions,
-                response format
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message,
+            user ``input``) are managed by the library and cannot be overridden.
 
         Returns:
-            pandas.Series: Series whose values are instances of the task's
-            response format, aligned with the original Series index.
+            pandas.Series: Series whose values are instances of the task's response format.
        """
-
-
+        return self.task_with_cache(
+            task=task,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
        )
 
-
-
-
-
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with a provided cache.
+
+        This method allows external control over caching behavior while parsing
+        Series content into structured data. If no response format is provided,
+        the method automatically infers an appropriate schema by analyzing the
+        data patterns.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract customer information including name
+                and contact details"). This guides both the extraction process
+                and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat] | None, optional): Target structure
+                for the parsed data. Can be a Pydantic model class, built-in type
+                (str, int, float, bool, list, dict), or None. If None, the method
+                infers an appropriate schema based on the instructions and data.
+                Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring the schema. Only used when response_format
+                is None. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.) forwarded to
+                the underlying API calls.
+
+        Returns:
+            pandas.Series: Series containing parsed structured data. Each value
+                is an instance of the specified response_format or the inferred
+                schema model, aligned with the original Series index.
+        """
+
+        schema: SchemaInferenceOutput | None = None
+        if response_format is None:
+            schema = self.infer_schema(instructions=instructions, max_examples=max_examples, **api_kwargs)
+
+        return self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            **api_kwargs,
        )
 
-    def
-
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM.
+
+        This method extracts structured information from unstructured text in
+        the Series. When no response format is provided, it automatically
+        infers an appropriate schema by analyzing patterns in the data.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract product details including price,
+                category, and availability"). This guides both the extraction
+                process and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for the parsed data. Can be a Pydantic model class,
+                built-in type (str, int, float, bool, list, dict), or None.
+                If None, automatically infers a schema. Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring schema. Only used when response_format
+                is None. Defaults to 100.
+            batch_size (int | None, optional): Number of requests to process
+                per batch. None enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Display progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model.
 
         Example:
            ```python
-
-
+            # With explicit schema
+            from pydantic import BaseModel
+            class Product(BaseModel):
+                name: str
+                price: float
+                in_stock: bool
+
+            descriptions = pd.Series([
+                "iPhone 15 Pro - $999, available now",
+                "Samsung Galaxy S24 - $899, out of stock"
+            ])
+            products = descriptions.ai.parse(
+                "Extract product information",
+                response_format=Product
+            )
+
+            # With automatic schema inference
+            reviews = pd.Series([
+                "Great product! 5 stars. Fast shipping.",
+                "Poor quality. 2 stars. Slow delivery."
+            ])
+            parsed = reviews.ai.parse(
+                "Extract review rating and shipping feedback"
+            )
            ```
-
-
-
-
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from Series content using AI.
+
+        This method analyzes a sample of Series values to automatically generate
+        a Pydantic model that captures the relevant information structure. The
+        inferred schema supports both flat and hierarchical (nested) structures,
+        making it suitable for complex data extraction tasks.
 
         Args:
-
-
+            instructions (str): Plain language description of the extraction goal
+                (e.g., "Extract customer information for CRM system", "Parse
+                event details for calendar integration"). This guides which
+                fields to include and their purpose.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze for pattern detection. The method samples randomly up
+                to this limit. Higher values may improve schema quality but
+                increase inference time. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters for fine-tuning
+                the inference process.
 
         Returns:
-
-
+            InferredSchema: A comprehensive schema object containing:
+                - instructions: Refined extraction objective statement
+                - fields: Hierarchical field specifications with names, types,
+                  descriptions, and nested structures where applicable
+                - inference_prompt: Optimized prompt for consistent extraction
+                - model: Dynamically generated Pydantic model class supporting
+                  both flat and nested structures
+                - task: PreparedTask configured for batch extraction using
+                  the inferred schema
+
+        Example:
+            ```python
+            # Simple flat structure
+            reviews = pd.Series([
+                "5 stars! Great product, fast shipping to NYC.",
+                "2 stars. Product broke, slow delivery to LA."
+            ])
+            schema = reviews.ai.infer_schema(
+                "Extract review ratings and shipping information"
+            )
+
+            # Hierarchical structure
+            orders = pd.Series([
+                "Order #123: John Doe, 123 Main St, NYC. Items: iPhone ($999), Case ($29)",
+                "Order #456: Jane Smith, 456 Oak Ave, LA. Items: iPad ($799)"
+            ])
+            schema = orders.ai.infer_schema(
+                "Extract order details including customer and items"
+            )
+            # Inferred schema may include nested structures like:
+            # - customer: {name: str, address: str, city: str}
+            # - items: [{product: str, price: float}]
+
+            # Apply the schema for extraction
+            extracted = orders.ai.task(schema.task)
+            ```
+
+        Note:
+            The inference process uses multiple AI iterations to ensure schema
+            validity. Nested structures are automatically detected when the
+            data contains hierarchical relationships. The generated Pydantic
+            model ensures type safety and validation for all extracted data.
        """
-
-            client=_DI.resolve(OpenAI),
-            model_name=_DI.resolve(EmbeddingsModelName).value,
-        )
+        inferer = CONTAINER.resolve(SchemaInferer)
 
-
-
-
-
+        input: SchemaInferenceInput = SchemaInferenceInput(
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
+            instructions=instructions,
+            **api_kwargs,
        )
+        return inferer.infer_schema(input)
 
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
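
The `*_with_cache` variants added above all accept an externally owned `BatchingMapProxy`. Based on their docstrings, a sketch of sharing one cache across calls so identical inputs are deduplicated; it assumes a client and model are already registered:

```python
import pandas as pd
from openaivec._cache import BatchingMapProxy

# One proxy shared across calls; batch_size=None would enable automatic sizing.
shared: BatchingMapProxy[str, str] = BatchingMapProxy(batch_size=64)

reviews = pd.Series(["Great product!", "Great product!", "Not satisfied"])
first = reviews.ai.responses_with_cache("classify sentiment", cache=shared)
# Re-running with the same instructions and cache should be served from the
# proxy's deduplicated results rather than new API calls (per the docstrings).
second = reviews.ai.responses_with_cache("classify sentiment", cache=shared)
```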
@@ -288,12 +674,12 @@ class OpenAIVecSeriesAccessor:
            animals.ai.count_tokens()
            ```
         This method uses the `tiktoken` library to count tokens based on the
-        model name
+        model name configured via `set_responses_model`.
 
         Returns:
            pandas.Series: Token counts for each element.
        """
-        encoding: tiktoken.Encoding =
+        encoding: tiktoken.Encoding = CONTAINER.resolve(tiktoken.Encoding)
        return self._obj.map(encoding.encode).map(len).rename("num_tokens")
 
     def extract(self) -> pd.DataFrame:
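
`count_tokens` now resolves its `tiktoken.Encoding` from the container; per the removed 0.12.x code earlier in this diff, unknown model names fall back to the `o200k_base` encoding. The per-element computation is equivalent to this sketch:

```python
import tiktoken

# Fallback encoding named in the removed log message; actual counts depend on
# the encoding resolved for the configured responses model.
enc = tiktoken.get_encoding("o200k_base")
num_tokens = len(enc.encode("elephant"))
```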
@@ -333,47 +719,65 @@ class OpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
-    def
-
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Generate a response for each row after serializing it to JSON using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
 
         Example:
            ```python
+            from openaivec._cache import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
            df = pd.DataFrame([
-                {"
-                {"
-                {"
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
            ])
-            df.ai.
+            result = df.ai.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
            ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
 
         Args:
-
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
-            pandas.
+            pandas.Series: Responses aligned with the DataFrame's original index.
        """
-
-
-
-
-
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
+        return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            **api_kwargs,
        )
 
     def responses(
         self,
         instructions: str,
-        response_format:
-        batch_size: int =
-
-
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON.
 
         Example:
            ```python
@@ -382,51 +786,75 @@ class OpenAIVecDataFrameAccessor:
                {"name": "dog", "legs": 4},
                {"name": "elephant", "legs": 4},
            ])
+            # Basic usage
            df.ai.responses("what is the animal's name?")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            large_df.ai.responses(
+                "generate a name for this ID",
+                batch_size=20,
+                show_progress=True
+            )
            ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
            instructions (str): System prompt for the assistant.
-            response_format (
+            response_format (type[ResponseFormat], optional): Desired Python type of the
                responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
-
-
-
-
-
-
-
-
-
-
-
-
-                response_format=response_format,
-                batch_size=batch_size,
-                temperature=temperature,
-                top_p=top_p,
-            )
-        )
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+        """
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            **api_kwargs,
        )
 
-    def
-
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys are managed internally.
 
-
-
-
-
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+        """
+        return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON.
 
         Example:
            ```python
-            from openaivec.
+            from openaivec._model import PreparedTask
 
            # Assume you have a prepared task for data analysis
            analysis_task = PreparedTask(...)
@@ -436,30 +864,237 @@ class OpenAIVecDataFrameAccessor:
                {"name": "dog", "legs": 4},
                {"name": "elephant", "legs": 4},
            ])
+            # Basic usage
            results = df.ai.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = large_df.ai.task(
+                analysis_task,
+                batch_size=50,
+                show_progress=True
+            )
            ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.
 
         Args:
            task (PreparedTask): A pre-configured task containing instructions,
-                response format
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
+            are managed by the library and cannot be overridden.
 
         Returns:
            pandas.Series: Series whose values are instances of the task's
            response format, aligned with the DataFrame's original index.
        """
-        return self._obj.
-
-
-
-
+        return _df_rows_to_json_series(self._obj).ai.task(
+            task=task,
+            batch_size=batch_size,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM with a provided cache.
+
+        This method processes each DataFrame row (converted to JSON) and extracts
+        structured information using an LLM. External cache control enables
+        deduplication across operations and custom batch management.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract from each row (e.g., "Extract shipping details and
+                order status"). Guides both extraction and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze when inferring
+                schema (only used when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model, indexed like
+                the original DataFrame.
+        """
+        return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM.
+
+        Each row is converted to JSON and processed to extract structured
+        information. When no response format is provided, the method
+        automatically infers an appropriate schema from the data.
+
+        Args:
+            instructions (str): Plain language description of extraction goals
+                (e.g., "Extract transaction details including amount, date,
+                and merchant"). Guides extraction and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            batch_size (int | None, optional): Rows per API batch. None
+                enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Show progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original
+                DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'log': [
+                    '2024-01-01 10:00 ERROR Database connection failed',
+                    '2024-01-01 10:05 INFO Service started successfully'
+                ]
+            })
+
+            # With automatic schema inference
+            parsed = df.ai.parse("Extract timestamp, level, and message")
+            # Returns Series with inferred structure like:
+            # {timestamp: str, level: str, message: str}
+            ```
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from DataFrame rows using AI.
+
+        This method analyzes a sample of DataFrame rows to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        Each row is converted to JSON format and analyzed to identify patterns,
+        field types, and potential categorical values.
+
+        Args:
+            instructions (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract operational metrics
+                for dashboard", "Parse customer attributes for segmentation").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of rows to analyze from the
+                DataFrame. The method will sample randomly up to this limit.
+                Defaults to 100.
+
+        Returns:
+            InferredSchema: An object containing:
+                - instructions: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'text': [
+                    "Order #123: Shipped to NYC, arriving Tuesday",
+                    "Order #456: Delayed due to weather, new ETA Friday",
+                    "Order #789: Delivered to customer in LA"
+                ],
+                'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Infer schema for logistics tracking
+            schema = df.ai.infer_schema(
+                instructions="Extract shipping status and location data for logistics tracking"
            )
+
+            # Apply the schema to extract structured data
+            extracted_df = df.ai.task(schema.task)
+            ```
+
+        Note:
+            Each row is converted to JSON before analysis. The inference
+            process automatically detects hierarchical relationships and
+            creates appropriate nested structures when present. The generated
+            Pydantic model ensures type safety and validation.
+        """
+        return _df_rows_to_json_series(self._obj).ai.infer_schema(
+            instructions=instructions,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def extract(self, column: str) -> pd.DataFrame:
+        """Flatten one column of Pydantic models/dicts into top‑level columns.
+
+        Example:
+            ```python
+            df = pd.DataFrame([
+                {"animal": {"name": "cat", "legs": 4}},
+                {"animal": {"name": "dog", "legs": 4}},
+                {"animal": {"name": "elephant", "legs": 4}},
+            ])
+            df.ai.extract("animal")
+            ```
+        This method returns a DataFrame with the same index as the original,
+        where each column corresponds to a key in the dictionaries.
+        The source column is dropped.
+
+        Args:
+            column (str): Column to expand.
+
+        Returns:
+            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+        """
+        if column not in self._obj.columns:
+            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+        return (
+            self._obj.pipe(lambda df: df.reset_index(drop=True))
+            .pipe(lambda df: df.join(df[column].ai.extract()))
+            .pipe(lambda df: df.set_index(self._obj.index))
+            .pipe(lambda df: df.drop(columns=[column], axis=1))
        )
 
-    def fillna(
+    def fillna(
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+    ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference.
 
         This method uses machine learning to intelligently fill missing (NaN) values
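
Combining the DataFrame helpers added above, a hedged end-to-end sketch: infer a schema once, reuse its `PreparedTask` for extraction, then flatten the resulting model instances. Field names in the inferred model are chosen by the LLM, so they are illustrative only:

```python
import pandas as pd

df = pd.DataFrame({
    "text": [
        "Order #123: Shipped to NYC, arriving Tuesday",
        "Order #456: Delayed due to weather, new ETA Friday",
    ]
})

schema = df.ai.infer_schema("Extract shipping status for logistics tracking")
extracted = df.ai.task(schema.task)  # Series of schema.model instances
flat = extracted.ai.extract()        # Series accessor: flatten models into columns
```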
@@ -473,8 +1108,10 @@ class OpenAIVecDataFrameAccessor:
            max_examples (int, optional): The maximum number of example rows to use
                for context when predicting missing values. Higher values may improve
                accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
 
         Returns:
            pandas.DataFrame: A new DataFrame with missing values filled in the target
@@ -490,6 +1127,10 @@ class OpenAIVecDataFrameAccessor:
 
            # Fill missing values in the 'name' column
            filled_df = df.ai.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = large_df.ai.fillna('name', batch_size=32, show_progress=True)
            ```
 
         Note:
@@ -502,7 +1143,9 @@ class OpenAIVecDataFrameAccessor:
         if missing_rows.empty:
            return self._obj
 
-        filled_values:
+        filled_values: list[FillNaResponse] = missing_rows.ai.task(
+            task=task, batch_size=batch_size, show_progress=show_progress
+        )
 
         # get deep copy of the DataFrame to avoid modifying the original
         df = self._obj.copy()
@@ -519,27 +1162,106 @@ class OpenAIVecDataFrameAccessor:
         return df
 
     def similarity(self, col1: str, col2: str) -> pd.Series:
+        """Compute cosine similarity between two columns containing embedding vectors.
+
+        This method calculates the cosine similarity between vectors stored in
+        two columns of the DataFrame. The vectors should be numpy arrays or
+        array-like objects that support dot product operations.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'vec1': [np.array([1, 0, 0]), np.array([0, 1, 0])],
+                'vec2': [np.array([1, 0, 0]), np.array([1, 1, 0])]
+            })
+            similarities = df.ai.similarity('vec1', 'vec2')
+            ```
+
+        Args:
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
+
+        Returns:
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
+        """
         return self._obj.apply(
            lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
            axis=1,
-        ).rename("similarity")
+        ).rename("similarity")  # type: ignore[arg-type]
+
+
+@pd.api.extensions.register_series_accessor("aio")
+class AsyncOpenAIVecSeriesAccessor:
+    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
+
+    def __init__(self, series_obj: pd.Series):
+        self._obj = series_obj
+
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            result = await series.aio.responses_with_cache(
+                "classify",
+                cache=shared,
+                max_output_tokens=256,
+                frequency_penalty=0.2,
+            )
+            ```
 
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
 
-
-
-    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
 
-
-
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchResponses = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        results = await client.parse(self._obj.tolist())
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
     async def responses(
         self,
         instructions: str,
-        response_format:
|
-
batch_size: int =
|
|
540
|
-
temperature: float = 0.0,
|
|
541
|
-
top_p: float = 1.0,
|
|
1260
|
+
response_format: type[ResponseFormat] = str,
|
|
1261
|
+
batch_size: int | None = None,
|
|
542
1262
|
max_concurrency: int = 8,
|
|
1263
|
+
show_progress: bool = True,
|
|
1264
|
+
**api_kwargs,
|
|
543
1265
|
) -> pd.Series:
|
|
544
1266
|
"""Call an LLM once for every Series element (asynchronously).
|
|
545
1267
|
|
|
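The `responses_with_cache` docstring above uses a pre-built `shared` cache without constructing one. A fuller sketch, assuming `AsyncBatchingMapProxy` is importable from `openaivec._cache` as later examples in this diff show; the cache is reused only across calls with identical instructions so cached answers remain valid:

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401  # registers the .aio accessor
from openaivec._cache import AsyncBatchingMapProxy


async def main() -> None:
    shared = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)  # auto batch sizing
    batch1 = pd.Series(["apple", "banana"])
    batch2 = pd.Series(["banana", "cherry"])  # "banana" should be served from the cache
    colors1 = await batch1.aio.responses_with_cache("name the typical color", cache=shared)
    colors2 = await batch2.aio.responses_with_cache("name the typical color", cache=shared)
    print(colors1.tolist(), colors2.tolist())


asyncio.run(main())
```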
@@ -548,22 +1270,32 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
548
1270
|
animals = pd.Series(["cat", "dog", "elephant"])
|
|
549
1271
|
# Must be awaited
|
|
550
1272
|
results = await animals.aio.responses("translate to French")
|
|
1273
|
+
|
|
1274
|
+
# With progress bar for large datasets
|
|
1275
|
+
large_series = pd.Series(["data"] * 1000)
|
|
1276
|
+
results = await large_series.aio.responses(
|
|
1277
|
+
"analyze this data",
|
|
1278
|
+
batch_size=32,
|
|
1279
|
+
max_concurrency=4,
|
|
1280
|
+
show_progress=True
|
|
1281
|
+
)
|
|
551
1282
|
```
|
|
552
|
-
This method returns a Series of strings, each containing the
|
|
553
|
-
assistant's response to the corresponding input.
|
|
554
|
-
The model used is set by the `responses_model` function.
|
|
555
|
-
The default model is `gpt-4.1-mini`.
|
|
556
1283
|
|
|
557
1284
|
Args:
|
|
558
1285
|
instructions (str): System prompt prepended to every user message.
|
|
559
|
-
response_format (
|
|
1286
|
+
response_format (type[ResponseFormat], optional): Pydantic model or built‑in
|
|
560
1287
|
type the assistant should return. Defaults to ``str``.
|
|
561
|
-
batch_size (int, optional): Number of prompts grouped into a single
|
|
562
|
-
request. Defaults to ``
|
|
563
|
-
|
|
564
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1288
|
+
batch_size (int | None, optional): Number of prompts grouped into a single
|
|
1289
|
+
request. Defaults to ``None`` (automatic batch size optimization
|
|
1290
|
+
based on execution time). Set to a positive integer for a fixed batch size.
|
|
565
1291
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
566
1292
|
requests. Defaults to ``8``.
|
|
1293
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
|
|
1294
|
+
**api_kwargs: Additional keyword arguments forwarded verbatim to
|
|
1295
|
+
``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
|
|
1296
|
+
``max_output_tokens``, penalties, future parameters). Core batching keys
|
|
1297
|
+
(model, instructions, input, text_format) are protected and silently
|
|
1298
|
+
ignored if provided.
|
|
567
1299
|
|
|
568
1300
|
Returns:
|
|
569
1301
|
pandas.Series: Series whose values are instances of ``response_format``.
|
|
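The docstring examples use notebook-style top-level `await`; in a plain script the coroutine needs an event loop. A minimal sketch, also showing an extra parameter riding through `**api_kwargs`:

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401


async def main() -> None:
    animals = pd.Series(["cat", "dog", "elephant"])
    # max_output_tokens is not a named parameter of responses(); it is
    # forwarded verbatim to AsyncOpenAI.responses.parse via **api_kwargs
    results = await animals.aio.responses("translate to French", max_output_tokens=64)
    print(results.tolist())


asyncio.run(main())
```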
@@ -571,18 +1303,64 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
571
1303
|
Note:
|
|
572
1304
|
This is an asynchronous method and must be awaited.
|
|
573
1305
|
"""
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
1306
|
+
return await self.responses_with_cache(
|
|
1307
|
+
instructions=instructions,
|
|
1308
|
+
cache=AsyncBatchingMapProxy(
|
|
1309
|
+
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1310
|
+
),
|
|
578
1311
|
response_format=response_format,
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
1312
|
+
**api_kwargs,
|
|
1313
|
+
)
|
|
1314
|
+
|
|
1315
|
+
async def embeddings_with_cache(
|
|
1316
|
+
self,
|
|
1317
|
+
cache: AsyncBatchingMapProxy[str, np.ndarray],
|
|
1318
|
+
**api_kwargs,
|
|
1319
|
+
) -> pd.Series:
|
|
1320
|
+
"""Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
|
|
1321
|
+
|
|
1322
|
+
This method allows external control over caching behavior by accepting
|
|
1323
|
+
a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
|
|
1324
|
+
across multiple operations or custom batch size management. The concurrency
|
|
1325
|
+
is controlled by the cache instance itself.
|
|
1326
|
+
|
|
1327
|
+
Example:
|
|
1328
|
+
```python
|
|
1329
|
+
from openaivec._cache import AsyncBatchingMapProxy
|
|
1330
|
+
import numpy as np
|
|
1331
|
+
|
|
1332
|
+
# Create a shared cache with custom batch size and concurrency
|
|
1333
|
+
shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
|
|
1334
|
+
batch_size=64, max_concurrency=4
|
|
1335
|
+
)
|
|
1336
|
+
|
|
1337
|
+
animals = pd.Series(["cat", "dog", "elephant"])
|
|
1338
|
+
# Must be awaited
|
|
1339
|
+
embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
|
|
1340
|
+
```
|
|
1341
|
+
|
|
1342
|
+
Args:
|
|
1343
|
+
cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
|
|
1344
|
+
instance for managing API call batching and deduplication.
|
|
1345
|
+
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1346
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
1347
|
+
|
|
1348
|
+
Returns:
|
|
1349
|
+
pandas.Series: Series whose values are ``np.ndarray`` objects
|
|
1350
|
+
(dtype ``float32``).
|
|
1351
|
+
|
|
1352
|
+
Note:
|
|
1353
|
+
This is an asynchronous method and must be awaited.
|
|
1354
|
+
"""
|
|
1355
|
+
client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
|
|
1356
|
+
client=CONTAINER.resolve(AsyncOpenAI),
|
|
1357
|
+
model_name=CONTAINER.resolve(EmbeddingsModelName).value,
|
|
1358
|
+
cache=cache,
|
|
1359
|
+
api_kwargs=api_kwargs,
|
|
582
1360
|
)
|
|
583
1361
|
|
|
584
1362
|
# Await the async operation
|
|
585
|
-
results = await client.
|
|
1363
|
+
results = await client.create(self._obj.tolist())
|
|
586
1364
|
|
|
587
1365
|
return pd.Series(
|
|
588
1366
|
results,
|
|
@@ -590,7 +1368,9 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
590
1368
|
name=self._obj.name,
|
|
591
1369
|
)
|
|
592
1370
|
|
|
593
|
-
async def embeddings(
|
|
1371
|
+
async def embeddings(
|
|
1372
|
+
self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = True, **api_kwargs
|
|
1373
|
+
) -> pd.Series:
|
|
594
1374
|
"""Compute OpenAI embeddings for every Series element (asynchronously).
|
|
595
1375
|
|
|
596
1376
|
Example:
|
|
@@ -598,17 +1378,24 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
598
1378
|
animals = pd.Series(["cat", "dog", "elephant"])
|
|
599
1379
|
# Must be awaited
|
|
600
1380
|
embeddings = await animals.aio.embeddings()
|
|
1381
|
+
|
|
1382
|
+
# With progress bar for large datasets
|
|
1383
|
+
large_texts = pd.Series(["text"] * 5000)
|
|
1384
|
+
embeddings = await large_texts.aio.embeddings(
|
|
1385
|
+
batch_size=100,
|
|
1386
|
+
max_concurrency=4,
|
|
1387
|
+
show_progress=True
|
|
1388
|
+
)
|
|
601
1389
|
```
|
|
602
|
-
This method returns a Series of numpy arrays, each containing the
|
|
603
|
-
embedding vector for the corresponding input.
|
|
604
|
-
The embedding model is set by the `embeddings_model` function.
|
|
605
|
-
The default embedding model is `text-embedding-3-small`.
|
|
606
1390
|
|
|
607
1391
|
Args:
|
|
608
|
-
batch_size (int, optional): Number of inputs grouped into a
|
|
609
|
-
single request. Defaults to ``
|
|
1392
|
+
batch_size (int | None, optional): Number of inputs grouped into a
|
|
1393
|
+
single request. Defaults to ``None`` (automatic batch size optimization
|
|
1394
|
+
based on execution time). Set to a positive integer for fixed batch size.
|
|
610
1395
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
611
1396
|
requests. Defaults to ``8``.
|
|
1397
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
|
|
1398
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
612
1399
|
|
|
613
1400
|
Returns:
|
|
614
1401
|
pandas.Series: Series whose values are ``np.ndarray`` objects
|
|
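Per the Args above, embedding-specific parameters such as `dimensions` (supported by the text-embedding-3 model family) travel through `**api_kwargs` unchanged. A sketch:

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401


async def main() -> None:
    texts = pd.Series(["cat", "dog"])
    # dimensions is forwarded to the embeddings endpoint via **api_kwargs
    vectors = await texts.aio.embeddings(dimensions=256, show_progress=False)
    print(vectors.iloc[0].shape)  # expected: (256,)


asyncio.run(main())
```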
@@ -617,31 +1404,88 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
617
1404
|
Note:
|
|
618
1405
|
This is an asynchronous method and must be awaited.
|
|
619
1406
|
"""
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
1407
|
+
return await self.embeddings_with_cache(
|
|
1408
|
+
cache=AsyncBatchingMapProxy(
|
|
1409
|
+
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1410
|
+
),
|
|
1411
|
+
**api_kwargs,
|
|
624
1412
|
)
|
|
625
1413
|
|
|
626
|
-
|
|
627
|
-
|
|
1414
|
+
async def task_with_cache(
|
|
1415
|
+
self,
|
|
1416
|
+
task: PreparedTask[ResponseFormat],
|
|
1417
|
+
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1418
|
+
**api_kwargs,
|
|
1419
|
+
) -> pd.Series:
|
|
1420
|
+
"""Execute a prepared task on every Series element using a provided cache (asynchronously).
|
|
628
1421
|
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
1422
|
+
This method allows external control over caching behavior by accepting
|
|
1423
|
+
a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
|
|
1424
|
+
across multiple operations or custom batch size management. The concurrency
|
|
1425
|
+
is controlled by the cache instance itself.
|
|
1426
|
+
|
|
1427
|
+
Args:
|
|
1428
|
+
task (PreparedTask): A pre-configured task containing instructions and a
|
|
1429
|
+
response format for processing the inputs.
|
|
1430
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
|
|
1431
|
+
instance for managing API call batching and deduplication.
|
|
1432
|
+
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1433
|
+
**api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
|
|
1434
|
+
|
|
1435
|
+
Example:
|
|
1436
|
+
```python
|
|
1437
|
+
from openaivec._model import PreparedTask
|
|
1438
|
+
from openaivec._cache import AsyncBatchingMapProxy
|
|
1439
|
+
|
|
1440
|
+
# Create a shared cache with custom batch size and concurrency
|
|
1441
|
+
shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
|
|
1442
|
+
|
|
1443
|
+
# Assume you have a prepared task for sentiment analysis
|
|
1444
|
+
sentiment_task = PreparedTask(...)
|
|
1445
|
+
|
|
1446
|
+
reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
|
|
1447
|
+
# Must be awaited
|
|
1448
|
+
results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
|
|
1449
|
+
```
|
|
1450
|
+
|
|
1451
|
+
Additional Keyword Args:
|
|
1452
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
1453
|
+
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
1454
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
1455
|
+
library and cannot be overridden.
|
|
1456
|
+
|
|
1457
|
+
Returns:
|
|
1458
|
+
pandas.Series: Series whose values are instances of the task's
|
|
1459
|
+
response format, aligned with the original Series index.
|
|
1460
|
+
|
|
1461
|
+
Note:
|
|
1462
|
+
This is an asynchronous method and must be awaited.
|
|
1463
|
+
"""
|
|
1464
|
+
client = AsyncBatchResponses(
|
|
1465
|
+
client=CONTAINER.resolve(AsyncOpenAI),
|
|
1466
|
+
model_name=CONTAINER.resolve(ResponsesModelName).value,
|
|
1467
|
+
system_message=task.instructions,
|
|
1468
|
+
response_format=task.response_format,
|
|
1469
|
+
cache=cache,
|
|
1470
|
+
api_kwargs=api_kwargs,
|
|
633
1471
|
)
|
|
1472
|
+
results = await client.parse(self._obj.tolist())
|
|
634
1473
|
|
|
635
|
-
|
|
636
|
-
"""Execute a prepared task on every Series element (asynchronously).
|
|
1474
|
+
return pd.Series(results, index=self._obj.index, name=self._obj.name)
|
|
637
1475
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
1476
|
+
async def task(
|
|
1477
|
+
self,
|
|
1478
|
+
task: PreparedTask,
|
|
1479
|
+
batch_size: int | None = None,
|
|
1480
|
+
max_concurrency: int = 8,
|
|
1481
|
+
show_progress: bool = True,
|
|
1482
|
+
**api_kwargs,
|
|
1483
|
+
) -> pd.Series:
|
|
1484
|
+
"""Execute a prepared task on every Series element (asynchronously).
|
|
641
1485
|
|
|
642
1486
|
Example:
|
|
643
1487
|
```python
|
|
644
|
-
from openaivec.
|
|
1488
|
+
from openaivec._model import PreparedTask
|
|
645
1489
|
|
|
646
1490
|
# Assume you have a prepared task for sentiment analysis
|
|
647
1491
|
sentiment_task = PreparedTask(...)
|
|
@@ -649,17 +1493,32 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
649
1493
|
reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
|
|
650
1494
|
# Must be awaited
|
|
651
1495
|
results = await reviews.aio.task(sentiment_task)
|
|
1496
|
+
|
|
1497
|
+
# With progress bar for large datasets
|
|
1498
|
+
large_reviews = pd.Series(["review text"] * 2000)
|
|
1499
|
+
results = await large_reviews.aio.task(
|
|
1500
|
+
sentiment_task,
|
|
1501
|
+
batch_size=50,
|
|
1502
|
+
max_concurrency=4,
|
|
1503
|
+
show_progress=True
|
|
1504
|
+
)
|
|
652
1505
|
```
|
|
653
|
-
This method returns a Series containing the task results for each
|
|
654
|
-
corresponding input element, following the task's defined structure.
|
|
655
1506
|
|
|
656
1507
|
Args:
|
|
657
1508
|
task (PreparedTask): A pre-configured task containing instructions and a
|
|
658
|
-
response format
|
|
659
|
-
batch_size (int, optional): Number of prompts grouped into a single
|
|
660
|
-
request to optimize API usage. Defaults to
|
|
1509
|
+
response format for processing the inputs.
|
|
1510
|
+
batch_size (int | None, optional): Number of prompts grouped into a single
|
|
1511
|
+
request to optimize API usage. Defaults to ``None`` (automatic batch size
|
|
1512
|
+
optimization based on execution time). Set to a positive integer for a fixed batch size.
|
|
661
1513
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
662
1514
|
requests. Defaults to 8.
|
|
1515
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
|
|
1516
|
+
**api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
|
|
1517
|
+
|
|
1518
|
+
Note:
|
|
1519
|
+
The task's stored API parameters are used. Core batching / routing
|
|
1520
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
1521
|
+
library and cannot be overridden.
|
|
663
1522
|
|
|
664
1523
|
Returns:
|
|
665
1524
|
pandas.Series: Series whose values are instances of the task's
|
|
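This diff confirms that a `PreparedTask` carries `instructions` and `response_format` (the `task_with_cache` body above reads both attributes); the keyword construction below is an assumption for illustration only:

```python
import asyncio

import pandas as pd
from pydantic import BaseModel

from openaivec import pandas_ext  # noqa: F401
from openaivec._model import PreparedTask


class Sentiment(BaseModel):
    label: str


# Assumed constructor call: only the `instructions` and `response_format`
# attributes are confirmed by this diff.
sentiment_task = PreparedTask(
    instructions="Classify sentiment as positive or negative.",
    response_format=Sentiment,
)


async def main() -> None:
    reviews = pd.Series(["Great product!", "Not satisfied"])
    results = await reviews.aio.task(sentiment_task, max_concurrency=4)
    print(results.iloc[0])


asyncio.run(main())
```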
@@ -668,20 +1527,117 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
668
1527
|
Note:
|
|
669
1528
|
This is an asynchronous method and must be awaited.
|
|
670
1529
|
"""
|
|
671
|
-
|
|
672
|
-
client=_DI.resolve(AsyncOpenAI),
|
|
673
|
-
model_name=_DI.resolve(ResponsesModelName).value,
|
|
1530
|
+
return await self.task_with_cache(
|
|
674
1531
|
task=task,
|
|
675
|
-
|
|
1532
|
+
cache=AsyncBatchingMapProxy(
|
|
1533
|
+
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1534
|
+
),
|
|
1535
|
+
**api_kwargs,
|
|
676
1536
|
)
|
|
677
1537
|
|
|
678
|
-
|
|
679
|
-
|
|
1538
|
+
async def parse_with_cache(
|
|
1539
|
+
self,
|
|
1540
|
+
instructions: str,
|
|
1541
|
+
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1542
|
+
response_format: type[ResponseFormat] | None = None,
|
|
1543
|
+
max_examples: int = 100,
|
|
1544
|
+
**api_kwargs,
|
|
1545
|
+
) -> pd.Series:
|
|
1546
|
+
"""Parse Series values into structured data using an LLM with a provided cache (asynchronously).
|
|
680
1547
|
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
1548
|
+
This async method provides external cache control while parsing Series
|
|
1549
|
+
content into structured data. Automatic schema inference is performed
|
|
1550
|
+
when no response format is specified.
|
|
1551
|
+
|
|
1552
|
+
Args:
|
|
1553
|
+
instructions (str): Plain language description of what to extract
|
|
1554
|
+
(e.g., "Extract dates, amounts, and descriptions from receipts").
|
|
1555
|
+
Guides both extraction and schema inference.
|
|
1556
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
|
|
1557
|
+
async cache for managing concurrent API calls and deduplication.
|
|
1558
|
+
Set cache.batch_size=None for automatic optimization.
|
|
1559
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
1560
|
+
structure for parsed data. Can be a Pydantic model, built-in
|
|
1561
|
+
type, or None for automatic inference. Defaults to None.
|
|
1562
|
+
max_examples (int, optional): Maximum values to analyze for schema
|
|
1563
|
+
inference (when response_format is None). Defaults to 100.
|
|
1564
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
1565
|
+
|
|
1566
|
+
Returns:
|
|
1567
|
+
pandas.Series: Series containing parsed structured data aligned
|
|
1568
|
+
with the original index.
|
|
1569
|
+
|
|
1570
|
+
Note:
|
|
1571
|
+
This is an asynchronous method and must be awaited.
|
|
1572
|
+
"""
|
|
1573
|
+
schema: SchemaInferenceOutput | None = None
|
|
1574
|
+
if response_format is None:
|
|
1575
|
+
# Use synchronous schema inference
|
|
1576
|
+
schema = self._obj.ai.infer_schema(instructions=instructions, max_examples=max_examples)
|
|
1577
|
+
|
|
1578
|
+
return await self.responses_with_cache(
|
|
1579
|
+
instructions=schema.inference_prompt if schema else instructions,
|
|
1580
|
+
cache=cache,
|
|
1581
|
+
response_format=response_format or schema.model,
|
|
1582
|
+
**api_kwargs,
|
|
1583
|
+
)
|
|
1584
|
+
|
|
1585
|
+
async def parse(
|
|
1586
|
+
self,
|
|
1587
|
+
instructions: str,
|
|
1588
|
+
response_format: type[ResponseFormat] | None = None,
|
|
1589
|
+
max_examples: int = 100,
|
|
1590
|
+
batch_size: int | None = None,
|
|
1591
|
+
max_concurrency: int = 8,
|
|
1592
|
+
show_progress: bool = True,
|
|
1593
|
+
**api_kwargs,
|
|
1594
|
+
) -> pd.Series:
|
|
1595
|
+
"""Parse Series values into structured data using an LLM (asynchronously).
|
|
1596
|
+
|
|
1597
|
+
Async version of the parse method, extracting structured information
|
|
1598
|
+
from unstructured text with automatic schema inference when needed.
|
|
1599
|
+
|
|
1600
|
+
Args:
|
|
1601
|
+
instructions (str): Plain language extraction goals (e.g., "Extract
|
|
1602
|
+
product names, prices, and categories from descriptions").
|
|
1603
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
1604
|
+
structure. None triggers automatic schema inference. Defaults to None.
|
|
1605
|
+
max_examples (int, optional): Maximum values for schema inference.
|
|
1606
|
+
Defaults to 100.
|
|
1607
|
+
batch_size (int | None, optional): Requests per batch. None for
|
|
1608
|
+
automatic optimization. Defaults to None.
|
|
1609
|
+
max_concurrency (int, optional): Maximum concurrent API requests.
|
|
1610
|
+
Defaults to 8.
|
|
1611
|
+
show_progress (bool, optional): Show progress bar. Defaults to True.
|
|
1612
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
1613
|
+
|
|
1614
|
+
Returns:
|
|
1615
|
+
pandas.Series: Parsed structured data indexed like the original Series.
|
|
1616
|
+
|
|
1617
|
+
Example:
|
|
1618
|
+
```python
|
|
1619
|
+
emails = pd.Series([
|
|
1620
|
+
"Meeting tomorrow at 3pm with John about Q4 planning",
|
|
1621
|
+
"Lunch with Sarah on Friday to discuss new project"
|
|
1622
|
+
])
|
|
1623
|
+
|
|
1624
|
+
# Async extraction with schema inference
|
|
1625
|
+
parsed = await emails.aio.parse(
|
|
1626
|
+
"Extract meeting details including time, person, and topic"
|
|
1627
|
+
)
|
|
1628
|
+
```
|
|
1629
|
+
|
|
1630
|
+
Note:
|
|
1631
|
+
This is an asynchronous method and must be awaited.
|
|
1632
|
+
"""
|
|
1633
|
+
return await self.parse_with_cache(
|
|
1634
|
+
instructions=instructions,
|
|
1635
|
+
cache=AsyncBatchingMapProxy(
|
|
1636
|
+
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1637
|
+
),
|
|
1638
|
+
response_format=response_format,
|
|
1639
|
+
max_examples=max_examples,
|
|
1640
|
+
**api_kwargs,
|
|
685
1641
|
)
|
|
686
1642
|
|
|
687
1643
|
|
|
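`parse_with_cache` above ships without an inline example. A sketch of the higher-level `parse`, relying on the automatic schema inference described in its docstring (the fields of the result depend on the inferred schema):

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401


async def main() -> None:
    receipts = pd.Series([
        "2024-01-05  coffee   $4.50",
        "2024-01-06  bagel    $3.25",
    ])
    # response_format=None (the default) triggers schema inference over the values
    parsed = await receipts.aio.parse("Extract date, item, and amount", show_progress=False)
    print(parsed.iloc[0])


asyncio.run(main())
```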
@@ -692,78 +1648,163 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
692
1648
|
def __init__(self, df_obj: pd.DataFrame):
|
|
693
1649
|
self._obj = df_obj
|
|
694
1650
|
|
|
1651
|
+
async def responses_with_cache(
|
|
1652
|
+
self,
|
|
1653
|
+
instructions: str,
|
|
1654
|
+
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1655
|
+
response_format: type[ResponseFormat] = str,
|
|
1656
|
+
**api_kwargs,
|
|
1657
|
+
) -> pd.Series:
|
|
1658
|
+
"""Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
|
|
1659
|
+
|
|
1660
|
+
This method allows external control over caching behavior by accepting
|
|
1661
|
+
a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
|
|
1662
|
+
across multiple operations or custom batch size management. The concurrency
|
|
1663
|
+
is controlled by the cache instance itself.
|
|
1664
|
+
|
|
1665
|
+
Example:
|
|
1666
|
+
```python
|
|
1667
|
+
from openaivec._cache import AsyncBatchingMapProxy
|
|
1668
|
+
|
|
1669
|
+
# Create a shared cache with custom batch size and concurrency
|
|
1670
|
+
shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
|
|
1671
|
+
|
|
1672
|
+
df = pd.DataFrame([
|
|
1673
|
+
{"name": "cat", "legs": 4},
|
|
1674
|
+
{"name": "dog", "legs": 4},
|
|
1675
|
+
{"name": "elephant", "legs": 4},
|
|
1676
|
+
])
|
|
1677
|
+
# Must be awaited
|
|
1678
|
+
result = await df.aio.responses_with_cache(
|
|
1679
|
+
"what is the animal's name?",
|
|
1680
|
+
cache=shared_cache
|
|
1681
|
+
)
|
|
1682
|
+
```
|
|
1683
|
+
|
|
1684
|
+
Args:
|
|
1685
|
+
instructions (str): System prompt for the assistant.
|
|
1686
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
|
|
1687
|
+
instance for managing API call batching and deduplication.
|
|
1688
|
+
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1689
|
+
response_format (type[ResponseFormat], optional): Desired Python type of the
|
|
1690
|
+
responses. Defaults to ``str``.
|
|
1691
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
1692
|
+
|
|
1693
|
+
Returns:
|
|
1694
|
+
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
1695
|
+
|
|
1696
|
+
Note:
|
|
1697
|
+
This is an asynchronous method and must be awaited.
|
|
1698
|
+
"""
|
|
1699
|
+
# Await the call to the async Series method using .aio
|
|
1700
|
+
return await _df_rows_to_json_series(self._obj).aio.responses_with_cache(
|
|
1701
|
+
instructions=instructions,
|
|
1702
|
+
cache=cache,
|
|
1703
|
+
response_format=response_format,
|
|
1704
|
+
**api_kwargs,
|
|
1705
|
+
)
|
|
1706
|
+
|
|
695
1707
|
async def responses(
|
|
696
1708
|
self,
|
|
697
1709
|
instructions: str,
|
|
698
|
-
response_format:
|
|
699
|
-
batch_size: int =
|
|
700
|
-
temperature: float = 0.0,
|
|
701
|
-
top_p: float = 1.0,
|
|
1710
|
+
response_format: type[ResponseFormat] = str,
|
|
1711
|
+
batch_size: int | None = None,
|
|
702
1712
|
max_concurrency: int = 8,
|
|
1713
|
+
show_progress: bool = True,
|
|
1714
|
+
**api_kwargs,
|
|
703
1715
|
) -> pd.Series:
|
|
704
|
-
"""Generate a response for each row after
|
|
1716
|
+
"""Generate a response for each row after serializing it to JSON (asynchronously).
|
|
705
1717
|
|
|
706
1718
|
Example:
|
|
707
1719
|
```python
|
|
708
1720
|
df = pd.DataFrame([
|
|
709
|
-
{
|
|
710
|
-
{
|
|
711
|
-
{
|
|
1721
|
+
{"name": "cat", "legs": 4},
|
|
1722
|
+
{"name": "dog", "legs": 4},
|
|
1723
|
+
{"name": "elephant", "legs": 4},
|
|
712
1724
|
])
|
|
713
1725
|
# Must be awaited
|
|
714
|
-
results = await df.aio.responses(
|
|
1726
|
+
results = await df.aio.responses("what is the animal's name?")
|
|
1727
|
+
|
|
1728
|
+
# With progress bar for large datasets
|
|
1729
|
+
large_df = pd.DataFrame({"id": list(range(1000))})
|
|
1730
|
+
results = await large_df.aio.responses(
|
|
1731
|
+
"generate a name for this ID",
|
|
1732
|
+
batch_size=20,
|
|
1733
|
+
max_concurrency=4,
|
|
1734
|
+
show_progress=True
|
|
1735
|
+
)
|
|
715
1736
|
```
|
|
716
|
-
This method returns a Series of strings, each containing the
|
|
717
|
-
assistant's response to the corresponding input.
|
|
718
|
-
Each row is serialised to JSON before being sent to the assistant.
|
|
719
|
-
The model used is set by the `responses_model` function.
|
|
720
|
-
The default model is `gpt-4.1-mini`.
|
|
721
1737
|
|
|
722
1738
|
Args:
|
|
723
1739
|
instructions (str): System prompt for the assistant.
|
|
724
|
-
response_format (
|
|
1740
|
+
response_format (type[ResponseFormat], optional): Desired Python type of the
|
|
725
1741
|
responses. Defaults to ``str``.
|
|
726
|
-
batch_size (int, optional): Number of requests sent in one batch.
|
|
727
|
-
Defaults to ``
|
|
728
|
-
|
|
729
|
-
|
|
1742
|
+
batch_size (int | None, optional): Number of requests sent in one batch.
|
|
1743
|
+
Defaults to ``None`` (automatic batch size optimization
|
|
1744
|
+
based on execution time). Set to a positive integer for a fixed batch size.
|
|
1745
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
730
1746
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
731
1747
|
requests. Defaults to ``8``.
|
|
1748
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
|
|
732
1749
|
|
|
733
1750
|
Returns:
|
|
734
|
-
pandas.Series: Responses aligned with the DataFrame
|
|
1751
|
+
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
735
1752
|
|
|
736
1753
|
Note:
|
|
737
1754
|
This is an asynchronous method and must be awaited.
|
|
738
1755
|
"""
|
|
739
|
-
|
|
740
|
-
lambda df: (
|
|
741
|
-
pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
|
|
742
|
-
lambda x: json.dumps(x, ensure_ascii=False)
|
|
743
|
-
)
|
|
744
|
-
)
|
|
745
|
-
)
|
|
746
|
-
# Await the call to the async Series method using .aio
|
|
747
|
-
return await series_of_json.aio.responses(
|
|
1756
|
+
return await self.responses_with_cache(
|
|
748
1757
|
instructions=instructions,
|
|
1758
|
+
cache=AsyncBatchingMapProxy(
|
|
1759
|
+
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1760
|
+
),
|
|
749
1761
|
response_format=response_format,
|
|
750
|
-
|
|
751
|
-
temperature=temperature,
|
|
752
|
-
top_p=top_p,
|
|
753
|
-
max_concurrency=max_concurrency,
|
|
1762
|
+
**api_kwargs,
|
|
754
1763
|
)
|
|
755
1764
|
|
|
756
|
-
async def
|
|
757
|
-
|
|
1765
|
+
async def task_with_cache(
|
|
1766
|
+
self,
|
|
1767
|
+
task: PreparedTask[ResponseFormat],
|
|
1768
|
+
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1769
|
+
**api_kwargs,
|
|
1770
|
+
) -> pd.Series:
|
|
1771
|
+
"""Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
|
|
1772
|
+
|
|
1773
|
+
After serializing each row to JSON, this method executes the prepared task.
|
|
1774
|
+
|
|
1775
|
+
Args:
|
|
1776
|
+
task (PreparedTask): Prepared task (instructions + response_format).
|
|
1777
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
|
|
1778
|
+
**api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
|
|
1779
|
+
|
|
1780
|
+
Note:
|
|
1781
|
+
Core routing keys are managed internally.
|
|
1782
|
+
|
|
1783
|
+
Returns:
|
|
1784
|
+
pandas.Series: Task results aligned with the DataFrame's original index.
|
|
1785
|
+
|
|
1786
|
+
Note:
|
|
1787
|
+
This is an asynchronous method and must be awaited.
|
|
1788
|
+
"""
|
|
1789
|
+
return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
|
|
1790
|
+
task=task,
|
|
1791
|
+
cache=cache,
|
|
1792
|
+
**api_kwargs,
|
|
1793
|
+
)
|
|
758
1794
|
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
1795
|
+
async def task(
|
|
1796
|
+
self,
|
|
1797
|
+
task: PreparedTask,
|
|
1798
|
+
batch_size: int | None = None,
|
|
1799
|
+
max_concurrency: int = 8,
|
|
1800
|
+
show_progress: bool = True,
|
|
1801
|
+
**api_kwargs,
|
|
1802
|
+
) -> pd.Series:
|
|
1803
|
+
"""Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).
|
|
763
1804
|
|
|
764
1805
|
Example:
|
|
765
1806
|
```python
|
|
766
|
-
from openaivec.
|
|
1807
|
+
from openaivec._model import PreparedTask
|
|
767
1808
|
|
|
768
1809
|
# Assume you have a prepared task for data analysis
|
|
769
1810
|
analysis_task = PreparedTask(...)
|
|
@@ -775,17 +1816,31 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
775
1816
|
])
|
|
776
1817
|
# Must be awaited
|
|
777
1818
|
results = await df.aio.task(analysis_task)
|
|
1819
|
+
|
|
1820
|
+
# With progress bar for large datasets
|
|
1821
|
+
large_df = pd.DataFrame({"id": list(range(1000))})
|
|
1822
|
+
results = await large_df.aio.task(
|
|
1823
|
+
analysis_task,
|
|
1824
|
+
batch_size=50,
|
|
1825
|
+
max_concurrency=4,
|
|
1826
|
+
show_progress=True
|
|
1827
|
+
)
|
|
778
1828
|
```
|
|
779
|
-
This method returns a Series containing the task results for each
|
|
780
|
-
corresponding row, following the task's defined structure.
|
|
781
1829
|
|
|
782
1830
|
Args:
|
|
783
1831
|
task (PreparedTask): A pre-configured task containing instructions and a
|
|
784
|
-
response format
|
|
785
|
-
batch_size (int, optional): Number of requests sent in one batch
|
|
786
|
-
to optimize API usage. Defaults to
|
|
1832
|
+
response format for processing the inputs.
|
|
1833
|
+
batch_size (int | None, optional): Number of requests sent in one batch
|
|
1834
|
+
to optimize API usage. Defaults to ``None`` (automatic batch size
|
|
1835
|
+
optimization based on execution time). Set to a positive integer for a fixed batch size.
|
|
787
1836
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
788
1837
|
requests. Defaults to 8.
|
|
1838
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
|
|
1839
|
+
**api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
|
|
1840
|
+
|
|
1841
|
+
Note:
|
|
1842
|
+
Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
|
|
1843
|
+
are managed by the library and cannot be overridden.
|
|
789
1844
|
|
|
790
1845
|
Returns:
|
|
791
1846
|
pandas.Series: Series whose values are instances of the task's
|
|
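Several DataFrame methods in these hunks delegate row serialization to a shared `_df_rows_to_json_series` helper. Judging from the inline code it replaces (removed a few lines below), the transformation is equivalent to:

```python
import json

import pandas as pd

df = pd.DataFrame([{"name": "cat", "legs": 4}, {"name": "dog", "legs": 4}])

# One JSON string per row, original index preserved
records = pd.Series(df.to_dict(orient="records"), index=df.index, name="record")
as_json = records.map(lambda row: json.dumps(row, ensure_ascii=False))
print(as_json.iloc[0])  # {"name": "cat", "legs": 4}
```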
@@ -794,27 +1849,131 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
794
1849
|
Note:
|
|
795
1850
|
This is an asynchronous method and must be awaited.
|
|
796
1851
|
"""
|
|
797
|
-
series_of_json = self._obj.pipe(
|
|
798
|
-
lambda df: (
|
|
799
|
-
pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
|
|
800
|
-
lambda x: json.dumps(x, ensure_ascii=False)
|
|
801
|
-
)
|
|
802
|
-
)
|
|
803
|
-
)
|
|
804
1852
|
# Await the call to the async Series method using .aio
|
|
805
|
-
return await
|
|
1853
|
+
return await _df_rows_to_json_series(self._obj).aio.task(
|
|
806
1854
|
task=task,
|
|
807
1855
|
batch_size=batch_size,
|
|
808
1856
|
max_concurrency=max_concurrency,
|
|
1857
|
+
show_progress=show_progress,
|
|
1858
|
+
**api_kwargs,
|
|
809
1859
|
)
|
|
810
1860
|
|
|
811
|
-
async def
|
|
1861
|
+
async def parse_with_cache(
|
|
1862
|
+
self,
|
|
1863
|
+
instructions: str,
|
|
1864
|
+
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1865
|
+
response_format: type[ResponseFormat] | None = None,
|
|
1866
|
+
max_examples: int = 100,
|
|
1867
|
+
**api_kwargs,
|
|
1868
|
+
) -> pd.Series:
|
|
1869
|
+
"""Parse DataFrame rows into structured data using an LLM with cache (asynchronously).
|
|
1870
|
+
|
|
1871
|
+
Async method for parsing DataFrame rows (as JSON) with external cache
|
|
1872
|
+
control, enabling deduplication across operations and concurrent processing.
|
|
1873
|
+
|
|
1874
|
+
Args:
|
|
1875
|
+
instructions (str): Plain language extraction goals (e.g., "Extract
|
|
1876
|
+
invoice details including items, quantities, and totals").
|
|
1877
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
|
|
1878
|
+
async cache for concurrent API call management.
|
|
1879
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
1880
|
+
structure. None triggers automatic schema inference. Defaults to None.
|
|
1881
|
+
max_examples (int, optional): Maximum rows for schema inference.
|
|
1882
|
+
Defaults to 100.
|
|
1883
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
1884
|
+
|
|
1885
|
+
Returns:
|
|
1886
|
+
pandas.Series: Parsed structured data indexed like the original DataFrame.
|
|
1887
|
+
|
|
1888
|
+
Note:
|
|
1889
|
+
This is an asynchronous method and must be awaited.
|
|
1890
|
+
"""
|
|
1891
|
+
return await _df_rows_to_json_series(self._obj).aio.parse_with_cache(
|
|
1892
|
+
instructions=instructions,
|
|
1893
|
+
cache=cache,
|
|
1894
|
+
response_format=response_format,
|
|
1895
|
+
max_examples=max_examples,
|
|
1896
|
+
**api_kwargs,
|
|
1897
|
+
)
|
|
1898
|
+
|
|
1899
|
+
async def parse(
|
|
1900
|
+
self,
|
|
1901
|
+
instructions: str,
|
|
1902
|
+
response_format: type[ResponseFormat] | None = None,
|
|
1903
|
+
max_examples: int = 100,
|
|
1904
|
+
batch_size: int | None = None,
|
|
1905
|
+
max_concurrency: int = 8,
|
|
1906
|
+
show_progress: bool = True,
|
|
1907
|
+
**api_kwargs,
|
|
1908
|
+
) -> pd.Series:
|
|
1909
|
+
"""Parse DataFrame rows into structured data using an LLM (asynchronously).
|
|
1910
|
+
|
|
1911
|
+
Async version for extracting structured information from DataFrame rows,
|
|
1912
|
+
with automatic schema inference when no format is specified.
|
|
1913
|
+
|
|
1914
|
+
Args:
|
|
1915
|
+
instructions (str): Plain language extraction goals (e.g., "Extract
|
|
1916
|
+
customer details, order items, and payment information").
|
|
1917
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
1918
|
+
structure. None triggers automatic inference. Defaults to None.
|
|
1919
|
+
max_examples (int, optional): Maximum rows for schema inference.
|
|
1920
|
+
Defaults to 100.
|
|
1921
|
+
batch_size (int | None, optional): Rows per batch. None for
|
|
1922
|
+
automatic optimization. Defaults to None.
|
|
1923
|
+
max_concurrency (int, optional): Maximum concurrent requests.
|
|
1924
|
+
Defaults to 8.
|
|
1925
|
+
show_progress (bool, optional): Show progress bar. Defaults to True.
|
|
1926
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
1927
|
+
|
|
1928
|
+
Returns:
|
|
1929
|
+
pandas.Series: Parsed structured data indexed like the original DataFrame.
|
|
1930
|
+
|
|
1931
|
+
Example:
|
|
1932
|
+
```python
|
|
1933
|
+
df = pd.DataFrame({
|
|
1934
|
+
'raw_data': [
|
|
1935
|
+
'Customer: John Doe, Order: 2 laptops @ $1200 each',
|
|
1936
|
+
'Customer: Jane Smith, Order: 5 phones @ $800 each'
|
|
1937
|
+
]
|
|
1938
|
+
})
|
|
1939
|
+
|
|
1940
|
+
# Async parsing with automatic schema inference
|
|
1941
|
+
parsed = await df.aio.parse(
|
|
1942
|
+
"Extract customer name, product, quantity, and unit price"
|
|
1943
|
+
)
|
|
1944
|
+
```
|
|
1945
|
+
|
|
1946
|
+
Note:
|
|
1947
|
+
This is an asynchronous method and must be awaited.
|
|
812
1948
|
"""
|
|
813
|
-
|
|
1949
|
+
return await self.parse_with_cache(
|
|
1950
|
+
instructions=instructions,
|
|
1951
|
+
cache=AsyncBatchingMapProxy(
|
|
1952
|
+
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1953
|
+
),
|
|
1954
|
+
response_format=response_format,
|
|
1955
|
+
max_examples=max_examples,
|
|
1956
|
+
**api_kwargs,
|
|
1957
|
+
)
|
|
1958
|
+
|
|
1959
|
+
async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
|
|
1960
|
+
"""Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
|
|
814
1961
|
|
|
815
1962
|
This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
|
|
816
1963
|
but with support for asynchronous functions.
|
|
817
1964
|
|
|
1965
|
+
Example:
|
|
1966
|
+
```python
|
|
1967
|
+
async def process_data(df):
|
|
1968
|
+
# Simulate an asynchronous computation
|
|
1969
|
+
await asyncio.sleep(1)
|
|
1970
|
+
return df.dropna()
|
|
1971
|
+
|
|
1972
|
+
df = pd.DataFrame({"col": [1, 2, None, 4]})
|
|
1973
|
+
# Must be awaited
|
|
1974
|
+
result = await df.aio.pipe(process_data)
|
|
1975
|
+
```
|
|
1976
|
+
|
|
818
1977
|
Args:
|
|
819
1978
|
func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
|
|
820
1979
|
as input and returns either a result or an awaitable result.
|
|
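Per the signature above (`Awaitable[T] | T`), `pipe` also accepts plain synchronous callables, whose result is returned as-is. A minimal sketch:

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401


async def main() -> None:
    df = pd.DataFrame({"col": [1.0, 2.0, None, 4.0]})
    cleaned = await df.aio.pipe(lambda frame: frame.dropna())  # sync callable
    print(len(cleaned))  # 3


asyncio.run(main())
```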
@@ -831,7 +1990,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
831
1990
|
else:
|
|
832
1991
|
return result
|
|
833
1992
|
|
|
834
|
-
async def assign(self, **kwargs
|
|
1993
|
+
async def assign(self, **kwargs) -> pd.DataFrame:
|
|
835
1994
|
"""Asynchronously assign new columns to the DataFrame, evaluating sequentially.
|
|
836
1995
|
|
|
837
1996
|
This method extends pandas' `assign` method by supporting asynchronous
|
|
@@ -866,7 +2025,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
866
2025
|
```
|
|
867
2026
|
|
|
868
2027
|
Args:
|
|
869
|
-
**kwargs:
|
|
2028
|
+
**kwargs: Column names as keys and either static values or callables
|
|
870
2029
|
(synchronous or asynchronous) as values.
|
|
871
2030
|
|
|
872
2031
|
Returns:
|
|
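A compact sketch of the sequential evaluation described above, mixing a synchronous and an asynchronous column callable; it assumes, as with pandas' own `assign`, that each callable receives the current frame:

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401


async def doubled(frame: pd.DataFrame) -> pd.Series:
    await asyncio.sleep(0)  # placeholder for a real awaited operation
    return frame["x"] * 2


async def main() -> None:
    df = pd.DataFrame({"x": [1, 2, 3]})
    result = await df.aio.assign(
        y=lambda frame: frame["x"] + 1,  # synchronous callable
        z=doubled,  # asynchronous callable
    )
    print(result)


asyncio.run(main())
```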
@@ -891,7 +2050,12 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
891
2050
|
return df_current
|
|
892
2051
|
|
|
893
2052
|
async def fillna(
|
|
894
|
-
self,
|
|
2053
|
+
self,
|
|
2054
|
+
target_column_name: str,
|
|
2055
|
+
max_examples: int = 500,
|
|
2056
|
+
batch_size: int | None = None,
|
|
2057
|
+
max_concurrency: int = 8,
|
|
2058
|
+
show_progress: bool = True,
|
|
895
2059
|
) -> pd.DataFrame:
|
|
896
2060
|
"""Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
|
|
897
2061
|
|
|
@@ -906,10 +2070,12 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
906
2070
|
max_examples (int, optional): The maximum number of example rows to use
|
|
907
2071
|
for context when predicting missing values. Higher values may improve
|
|
908
2072
|
accuracy but increase API costs and processing time. Defaults to 500.
|
|
909
|
-
batch_size (int, optional): Number of requests sent in one batch
|
|
910
|
-
to optimize API usage. Defaults to
|
|
2073
|
+
batch_size (int | None, optional): Number of requests sent in one batch
|
|
2074
|
+
to optimize API usage. Defaults to ``None`` (automatic batch size
|
|
2075
|
+
optimization based on execution time). Set to a positive integer for a fixed batch size.
|
|
911
2076
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
912
2077
|
requests. Defaults to 8.
|
|
2078
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
|
|
913
2079
|
|
|
914
2080
|
Returns:
|
|
915
2081
|
pandas.DataFrame: A new DataFrame with missing values filled in the target
|
|
@@ -925,6 +2091,15 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
925
2091
|
|
|
926
2092
|
# Fill missing values in the 'name' column (must be awaited)
|
|
927
2093
|
filled_df = await df.aio.fillna('name')
|
|
2094
|
+
|
|
2095
|
+
# With progress bar for large datasets
|
|
2096
|
+
large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
|
|
2097
|
+
filled_df = await large_df.aio.fillna(
|
|
2098
|
+
'name',
|
|
2099
|
+
batch_size=32,
|
|
2100
|
+
max_concurrency=4,
|
|
2101
|
+
show_progress=True
|
|
2102
|
+
)
|
|
928
2103
|
```
|
|
929
2104
|
|
|
930
2105
|
Note:
|
|
@@ -938,8 +2113,11 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
938
2113
|
if missing_rows.empty:
|
|
939
2114
|
return self._obj
|
|
940
2115
|
|
|
941
|
-
filled_values:
|
|
942
|
-
task=task,
|
|
2116
|
+
filled_values: list[FillNaResponse] = await missing_rows.aio.task(
|
|
2117
|
+
task=task,
|
|
2118
|
+
batch_size=batch_size,
|
|
2119
|
+
max_concurrency=max_concurrency,
|
|
2120
|
+
show_progress=show_progress,
|
|
943
2121
|
)
|
|
944
2122
|
|
|
945
2123
|
# get deep copy of the DataFrame to avoid modifying the original
|