openaivec 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- openaivec/_proxy.py +24 -2
- openaivec/_responses.py +77 -25
- openaivec/_schema.py +454 -0
- openaivec/pandas_ext.py +559 -423
- openaivec/spark.py +21 -1
- {openaivec-0.14.2.dist-info → openaivec-0.14.4.dist-info}/METADATA +1 -1
- {openaivec-0.14.2.dist-info → openaivec-0.14.4.dist-info}/RECORD +9 -8
- {openaivec-0.14.2.dist-info → openaivec-0.14.4.dist-info}/WHEEL +0 -0
- {openaivec-0.14.2.dist-info → openaivec-0.14.4.dist-info}/licenses/LICENSE +0 -0
openaivec/pandas_ext.py
CHANGED
````diff
@@ -74,6 +74,21 @@ __all__ = [
 _LOGGER = logging.getLogger(__name__)
 
 
+# ---------------------------------------------------------------------------
+# Internal helpers (not exported)
+# ---------------------------------------------------------------------------
+def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
+    """Return a Series of JSON strings (UTF-8, no ASCII escaping) representing DataFrame rows.
+
+    Each element is the JSON serialisation of the corresponding row as a dict. Index and
+    name are preserved so downstream operations retain alignment. This consolidates the
+    previously duplicated inline pipeline used by responses*/task* DataFrame helpers.
+    """
+    return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+        lambda x: json.dumps(x, ensure_ascii=False)
+    )
+
+
 T = TypeVar("T")  # For pipe function return type
 
 
````
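A minimal sketch of what the new helper produces, runnable without an API key; the helper body is copied from the hunk above, while the sample DataFrame and its index values are hypothetical:

```python
import json

import pandas as pd


# Local copy of the new private helper, taken verbatim from the diff above.
def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
    return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
        lambda x: json.dumps(x, ensure_ascii=False)
    )


df = pd.DataFrame({"name": ["cat", "犬"], "legs": [4, 4]}, index=[10, 20])
records = _df_rows_to_json_series(df)
print(records.loc[10])   # {"name": "cat", "legs": 4}
print(records.loc[20])   # {"name": "犬", "legs": 4} -- ensure_ascii=False keeps UTF-8
print(records.index.tolist(), records.name)  # [10, 20] record -- alignment preserved
```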
````diff
@@ -165,7 +180,29 @@ class OpenAIVecSeriesAccessor:
         response_format: Type[ResponseFormat] = str,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
+        **api_kwargs,
     ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
         client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
````
````diff
@@ -176,7 +213,58 @@ class OpenAIVecSeriesAccessor:
             top_p=top_p,
         )
 
-        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+        # Forward any extra kwargs to the underlying Responses API.
+        return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
+
+    def responses(
+        self,
+        instructions: str,
+        response_format: Type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element.
+
+        Example:
+            ```python
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
+            animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
 
     def embeddings_with_cache(
         self,
````
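A usage sketch of the new keyword forwarding on the Series accessor; `seed` and `frequency_penalty` are taken from the docstring's own examples of forwardable parameters, and the call assumes an OpenAI API key is configured for the default client:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .ai accessor

animals = pd.Series(["cat", "dog", "elephant"])

# Extra keyword arguments are now forwarded verbatim to the Responses API.
translated = animals.ai.responses(
    "translate to French",
    seed=42,                # any Responses API parameter passes through
    frequency_penalty=0.2,
)
```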
````diff
@@ -188,15 +276,6 @@ class OpenAIVecSeriesAccessor:
         a pre-configured BatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management.
 
-        Args:
-            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-
         Example:
             ```python
             from openaivec._proxy import BatchingMapProxy
@@ -208,6 +287,15 @@ class OpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
             ```
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
         client: BatchEmbeddings = BatchEmbeddings(
             client=CONTAINER.resolve(OpenAI),
````
````diff
@@ -221,96 +309,69 @@ class OpenAIVecSeriesAccessor:
             name=self._obj.name,
         )
 
-    def responses(
-        self,
-        instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int | None = None,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
-        show_progress: bool = False,
-    ) -> pd.Series:
-        """Call an LLM once for every Series element.
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
             # Basic usage
-            animals.ai.responses("translate to French")
+            animals.ai.embeddings()
 
-            # With progress bar in Jupyter notebooks
-            large_series = pd.Series(["data"] * 1000)
-            large_series.ai.responses(
-                "analyze this data",
-                batch_size=32,
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
                 show_progress=True
             )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
-            instructions (str): System prompt prepended to every user message.
-            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
-                type the assistant should return. Defaults to ``str``.
-            batch_size (int | None, optional): Number of prompts grouped into a single
-                request. Defaults to ``None`` (automatic batch size optimization
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
         Returns:
-            pandas.Series: Series whose values are instances of ``response_format``.
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
-        return self.responses_with_cache(
-            instructions=instructions,
+        return self.embeddings_with_cache(
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
-            response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
         )
 
     def task_with_cache(
         self,
         task: PreparedTask[ResponseFormat],
         cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache.
 
-        This method allows external control over caching behavior by accepting
-        a pre-configured BatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management.
-
-        Args:
-            task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are instances of the task's
-                response format, aligned with the original Series index.
+        This mirrors ``responses_with_cache`` but uses the task's stored instructions,
+        response format, temperature and top_p. A supplied ``BatchingMapProxy`` enables
+        cross‑operation deduplicated reuse and external batch size / progress control.
 
         Example:
             ```python
-            from openaivec._model import PreparedTask
             from openaivec._proxy import BatchingMapProxy
-
-            # Create a shared cache with custom batch size
             shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
 
-            # Assume you have a prepared task for sentiment analysis
-            sentiment_task = PreparedTask(...)
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
 
-            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
-            results = reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
-            ```
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) forwarded verbatim to the underlying client. Core routing keys
+            (``model``, system instructions, user input) are managed internally and cannot be overridden.
+
+        Returns:
+            pandas.Series: Task results aligned with the original Series index.
         """
-        client = BatchResponses(
+        client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
             system_message=task.instructions,
````
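The point of the `*_with_cache` variants is that one proxy can back several operations; a sketch assuming an API key is configured (the inputs here are hypothetical):

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .ai accessor
from openaivec._proxy import BatchingMapProxy

# One cache instance deduplicates inputs across separate calls.
shared_cache = BatchingMapProxy(batch_size=64)

reviews = pd.Series(["Great product!", "Not satisfied", "Great product!"])
first = reviews.ai.responses_with_cache("classify sentiment", cache=shared_cache)
# Re-running with the same cache reuses earlier results instead of
# re-sending the duplicate "Great product!" prompt.
second = reviews.ai.responses_with_cache("classify sentiment", cache=shared_cache)
```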
````diff
@@ -319,15 +380,17 @@ class OpenAIVecSeriesAccessor:
             temperature=task.temperature,
             top_p=task.top_p,
         )
-        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+        return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
 
-    def task(self, task: PreparedTask, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
         """Execute a prepared task on every Series element.
 
-        This method applies a pre-configured task to each element in the Series,
-        using the task's instructions and response format to generate structured
-        responses from the language model.
-
         Example:
             ```python
             from openaivec._model import PreparedTask
@@ -347,8 +410,6 @@ class OpenAIVecSeriesAccessor:
                 show_progress=True
             )
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
````
````diff
@@ -358,48 +419,19 @@ class OpenAIVecSeriesAccessor:
                 optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
         Returns:
-            pandas.Series: Series whose values are instances of the task's
-                response format, aligned with the original Series index.
+            pandas.Series: Series whose values are instances of the task's response format.
         """
         return self.task_with_cache(
             task=task,
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
-        )
-
-    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element.
-
-        Example:
-            ```python
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Basic usage
-            animals.ai.embeddings()
-
-            # With progress bar for large datasets
-            large_texts = pd.Series(["text"] * 5000)
-            embeddings = large_texts.ai.embeddings(
-                batch_size=100,
-                show_progress=True
-            )
-            ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
-
-        Args:
-            batch_size (int | None, optional): Number of inputs grouped into a
-                single request. Defaults to ``None`` (automatic batch size optimization
-                based on execution time). Set to a positive integer for fixed batch size.
-            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-        """
-        return self.embeddings_with_cache(
-            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
         )
 
     def count_tokens(self) -> pd.Series:
````
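The same forwarding contract applies to prepared tasks; a sketch where `sentiment_task` stands in for any `PreparedTask`, mirroring the docstring examples, and an API key is assumed to be configured:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .ai accessor
from openaivec._model import PreparedTask

sentiment_task = PreparedTask(...)  # construction elided, as in the diff's examples

reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
results = reviews.ai.task(
    sentiment_task,
    batch_size=32,
    show_progress=True,
    seed=42,  # forwarded verbatim to the Responses API
)
```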
````diff
@@ -456,38 +488,6 @@ class OpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
-    def extract(self, column: str) -> pd.DataFrame:
-        """Flatten one column of Pydantic models/dicts into top‑level columns.
-
-        Example:
-            ```python
-            df = pd.DataFrame([
-                {"animal": {"name": "cat", "legs": 4}},
-                {"animal": {"name": "dog", "legs": 4}},
-                {"animal": {"name": "elephant", "legs": 4}},
-            ])
-            df.ai.extract("animal")
-            ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
-
-        Args:
-            column (str): Column to expand.
-
-        Returns:
-            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
-        """
-        if column not in self._obj.columns:
-            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
-
-        return (
-            self._obj.pipe(lambda df: df.reset_index(drop=True))
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
-        )
-
     def responses_with_cache(
         self,
         instructions: str,
````
````diff
@@ -495,26 +495,14 @@ class OpenAIVecDataFrameAccessor:
         response_format: Type[ResponseFormat] = str,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after …
+        """Generate a response for each row after serializing it to JSON using a provided cache.
 
         This method allows external control over caching behavior by accepting
         a pre-configured BatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management.
 
-        Args:
-            instructions (str): System prompt for the assistant.
-            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
-                responses. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame's original index.
-
         Example:
             ```python
             from openaivec._proxy import BatchingMapProxy
@@ -532,19 +520,27 @@ class OpenAIVecDataFrameAccessor:
                 cache=shared_cache
             )
             ```
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
-        return self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record")
-                .map(lambda x: json.dumps(x, ensure_ascii=False))
-                .ai.responses_with_cache(
-                    instructions=instructions,
-                    cache=cache,
-                    response_format=response_format,
-                    temperature=temperature,
-                    top_p=top_p,
-                )
-            )
+        return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
         )
 
     def responses(
````
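On the DataFrame side, every row is first serialized to JSON by the shared helper and then routed through the Series accessor; a sketch assuming a configured API key, where `seed=7` stands in for any forwarded parameter:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .ai accessor

df = pd.DataFrame([
    {"name": "cat", "legs": 4},
    {"name": "dog", "legs": 4},
])

# Internally: _df_rows_to_json_series(df).ai.responses_with_cache(...),
# so the model sees one JSON object per row, e.g. {"name": "cat", "legs": 4}.
answers = df.ai.responses("what is the animal's name?", seed=7)
```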
````diff
@@ -555,8 +551,9 @@ class OpenAIVecDataFrameAccessor:
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         show_progress: bool = False,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after …
+        """Generate a response for each row after serializing it to JSON.
 
         Example:
             ```python
@@ -576,11 +573,6 @@ class OpenAIVecDataFrameAccessor:
                 show_progress=True
             )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
             instructions (str): System prompt for the assistant.
@@ -589,7 +581,7 @@ class OpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
````
````diff
@@ -602,15 +594,42 @@ class OpenAIVecDataFrameAccessor:
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
+            **api_kwargs,
         )
 
-    def task(self, task: PreparedTask, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
-        """Execute a prepared task on each DataFrame row …
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``) forwarded verbatim. Core routing keys are managed internally.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+        """
+        return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
 
-        This method applies a pre-configured task to each row in the DataFrame,
-        using the task's instructions and response format to generate structured
-        responses from the language model.
-
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON.
 
         Example:
             ```python
````
````diff
@@ -624,10 +643,17 @@ class OpenAIVecDataFrameAccessor:
                 {"name": "dog", "legs": 4},
                 {"name": "elephant", "legs": 4},
             ])
+            # Basic usage
             results = df.ai.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = large_df.ai.task(
+                analysis_task,
+                batch_size=50,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
@@ -637,19 +663,63 @@ class OpenAIVecDataFrameAccessor:
                 optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
         Returns:
             pandas.Series: Series whose values are instances of the task's
                 response format, aligned with the DataFrame's original index.
         """
-        return self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record")
-                .map(lambda x: json.dumps(x, ensure_ascii=False))
-                .ai.task(task=task, batch_size=batch_size, show_progress=show_progress)
-            )
+        return _df_rows_to_json_series(self._obj).ai.task(
+            task=task,
+            batch_size=batch_size,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+    def extract(self, column: str) -> pd.DataFrame:
+        """Flatten one column of Pydantic models/dicts into top‑level columns.
+
+        Example:
+            ```python
+            df = pd.DataFrame([
+                {"animal": {"name": "cat", "legs": 4}},
+                {"animal": {"name": "dog", "legs": 4}},
+                {"animal": {"name": "elephant", "legs": 4}},
+            ])
+            df.ai.extract("animal")
+            ```
+        This method returns a DataFrame with the same index as the original,
+        where each column corresponds to a key in the dictionaries.
+        The source column is dropped.
+
+        Args:
+            column (str): Column to expand.
+
+        Returns:
+            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+        """
+        if column not in self._obj.columns:
+            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+        return (
+            self._obj.pipe(lambda df: df.reset_index(drop=True))
+            .pipe(lambda df: df.join(df[column].ai.extract()))
+            .pipe(lambda df: df.set_index(self._obj.index))
+            .pipe(lambda df: df.drop(columns=[column], axis=1))
         )
 
-    def fillna(self, target_column_name: str, max_examples: int = 500, batch_size: int | None = None) -> pd.DataFrame:
+    def fillna(
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference.
 
         This method uses machine learning to intelligently fill missing (NaN) values
````
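`extract` needs no API call at all; it flattens a column of nested dicts (or Pydantic models) in place. A runnable sketch with plain dicts, where the generated column names are an assumption since they depend on the Series-level `extract` implementation:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .ai accessor

df = pd.DataFrame([
    {"id": 1, "animal": {"name": "cat", "legs": 4}},
    {"id": 2, "animal": {"name": "dog", "legs": 4}},
])

flat = df.ai.extract("animal")
# The "animal" column is dropped and its keys become top-level columns;
# the exact naming scheme (e.g. animal_name vs. name) comes from the
# Series-level extract helper, which is not shown in this diff.
print(flat.columns.tolist())
```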
````diff
@@ -666,6 +736,11 @@ class OpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch
                 to optimize API usage. Defaults to ``None`` (automatic batch size
                 optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying task execution.
 
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
@@ -681,6 +756,10 @@ class OpenAIVecDataFrameAccessor:
 
             # Fill missing values in the 'name' column
             filled_df = df.ai.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = large_df.ai.fillna('name', batch_size=32, show_progress=True)
             ```
 
         Note:
@@ -693,7 +772,9 @@ class OpenAIVecDataFrameAccessor:
         if missing_rows.empty:
             return self._obj
 
-        filled_values: List[FillNaResponse] = missing_rows.ai.task(task=task, batch_size=batch_size)
+        filled_values: List[FillNaResponse] = missing_rows.ai.task(
+            task=task, batch_size=batch_size, show_progress=show_progress, **api_kwargs
+        )
 
         # get deep copy of the DataFrame to avoid modifying the original
         df = self._obj.copy()
````
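`fillna` now exposes the same knobs; a sketch assuming a configured API key, with `seed=0` standing in for any parameter forwarded to the underlying task execution:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .ai accessor

df = pd.DataFrame({
    "name": ["cat", None, "elephant"],
    "legs": [4, 4, 4],
})

# show_progress is new in this release; seed is forwarded verbatim.
filled = df.ai.fillna("name", batch_size=16, show_progress=True, seed=0)
```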
````diff
@@ -716,15 +797,6 @@ class OpenAIVecDataFrameAccessor:
         two columns of the DataFrame. The vectors should be numpy arrays or
         array-like objects that support dot product operations.
 
-        Args:
-            col1 (str): Name of the first column containing embedding vectors.
-            col2 (str): Name of the second column containing embedding vectors.
-
-        Returns:
-            pandas.Series: Series containing cosine similarity scores between
-                corresponding vectors in col1 and col2, with values ranging
-                from -1 to 1, where 1 indicates identical direction.
-
         Example:
             ```python
             df = pd.DataFrame({
````
````diff
@@ -733,188 +805,86 @@ class OpenAIVecDataFrameAccessor:
             })
             similarities = df.ai.similarity('vec1', 'vec2')
             ```
-        """
-        return self._obj.apply(
-            lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
-            axis=1,
-        ).rename("similarity")  # type: ignore[arg-type]
-
-
-@pd.api.extensions.register_series_accessor("aio")
-class AsyncOpenAIVecSeriesAccessor:
-    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
-
-    def __init__(self, series_obj: pd.Series):
-        self._obj = series_obj
-
-    async def responses_with_cache(
-        self,
-        instructions: str,
-        cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        response_format: Type[ResponseFormat] = str,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
-    ) -> pd.Series:
-        """Call an LLM once for every Series element using a provided cache (asynchronously).
-
-        This method allows external control over caching behavior by accepting
-        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management. The concurrency
-        is controlled by the cache instance itself.
 
         Args:
-            instructions (str): System prompt prepended to every user message.
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
-                type the assistant should return. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
 
         Returns:
-            pandas.Series: Series whose values are instances of ``response_format``.
-
-        Example:
-            ```python
-            from openaivec._proxy import AsyncBatchingMapProxy
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
-
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Must be awaited
-            result = await animals.aio.responses_with_cache(
-                "translate to French",
-                cache=shared_cache
-            )
-            ```
-
-        Note:
-            This is an asynchronous method and must be awaited.
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
         """
-        client: AsyncBatchResponses = AsyncBatchResponses(
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(ResponsesModelName).value,
-            system_message=instructions,
-            response_format=response_format,
-            cache=cache,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        # Await the async operation
-        results = await client.parse(self._obj.tolist())
-
-        return pd.Series(results, index=self._obj.index, name=self._obj.name)
-
-    async def embeddings_with_cache(
-        self,
-        cache: AsyncBatchingMapProxy[str, np.ndarray],
-    ) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
-
-        This method allows external control over caching behavior by accepting
-        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management. The concurrency
-        is controlled by the cache instance itself.
-
-        Args:
-            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-
-        Example:
-            ```python
-            from openaivec._proxy import AsyncBatchingMapProxy
-            import numpy as np
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
-                batch_size=64, max_concurrency=4
-            )
-
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Must be awaited
-            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
-            ```
+        return self._obj.apply(
+            lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
+            axis=1,
+        ).rename("similarity")  # type: ignore[arg-type]
 
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
-            cache=cache,
-        )
 
-        # Await the async operation
-        results = await client.create(self._obj.tolist())
+@pd.api.extensions.register_series_accessor("aio")
+class AsyncOpenAIVecSeriesAccessor:
+    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
 
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
-        )
+    def __init__(self, series_obj: pd.Series):
+        self._obj = series_obj
 
-    async def task_with_cache(
+    async def responses_with_cache(
         self,
-        task: PreparedTask[ResponseFormat],
+        instructions: str,
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
     ) -> pd.Series:
-        """Execute a prepared task on every Series element using a provided cache (asynchronously).
+        """Call an LLM once for every Series element using a provided cache (asynchronously).
 
         This method allows external control over caching behavior by accepting
         a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management. The concurrency
         is controlled by the cache instance itself.
 
+        Example:
+            ```python
+            result = await series.aio.responses_with_cache(
+                "classify",
+                cache=shared,
+                max_output_tokens=256,
+                frequency_penalty=0.2,
+            )
+            ```
+
         Args:
-            task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
+            instructions (str): System prompt prepended to every user message.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
                 Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. ``None`` omits the
+                parameter (recommended for reasoning models). Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``max_output_tokens``, penalties,
+                future parameters). Core batching keys (model, instructions, input,
+                text_format) are protected and silently ignored if provided.
 
         Returns:
-            pandas.Series: Series whose values are instances of the task's
-                response format, aligned with the original Series index.
-
-        Example:
-            ```python
-            from openaivec._model import PreparedTask
-            from openaivec._proxy import AsyncBatchingMapProxy
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
-
-            # Assume you have a prepared task for sentiment analysis
-            sentiment_task = PreparedTask(...)
-
-            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
-            # Must be awaited
-            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
-            ```
+            pandas.Series: Series whose values are instances of ``response_format``.
 
         Note:
             This is an asynchronous method and must be awaited.
         """
-        client = AsyncBatchResponses(
+        client: AsyncBatchResponses = AsyncBatchResponses(
             client=CONTAINER.resolve(AsyncOpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
-            system_message=task.instructions,
-            response_format=task.response_format,
+            system_message=instructions,
+            response_format=response_format,
             cache=cache,
-            temperature=task.temperature,
-            top_p=task.top_p,
+            temperature=temperature,
+            top_p=top_p,
         )
-
-        # Await the async operation
-        results = await client.parse(self._obj.tolist())
-
+        results = await client.parse(self._obj.tolist(), **api_kwargs)
         return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
     async def responses(
````
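The async accessor follows the same forwarding contract but must be awaited; a sketch assuming a configured API key, with `max_output_tokens` taken from the docstring's own example of a forwardable parameter:

```python
import asyncio

import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .aio accessor


async def main() -> None:
    animals = pd.Series(["cat", "dog", "elephant"])
    result = await animals.aio.responses(
        "translate to French",
        max_concurrency=4,
        max_output_tokens=256,  # forwarded verbatim to the Responses API
    )
    print(result)


asyncio.run(main())
```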
````diff
@@ -926,6 +896,7 @@ class AsyncOpenAIVecSeriesAccessor:
         top_p: float = 1.0,
         max_concurrency: int = 8,
         show_progress: bool = False,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element (asynchronously).
 
@@ -944,10 +915,6 @@ class AsyncOpenAIVecSeriesAccessor:
                 show_progress=True
             )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
             instructions (str): System prompt prepended to every user message.
@@ -956,7 +923,7 @@ class AsyncOpenAIVecSeriesAccessor:
             batch_size (int | None, optional): Number of prompts grouped into a single
                 request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
````
````diff
@@ -976,6 +943,60 @@ class AsyncOpenAIVecSeriesAccessor:
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
+            **api_kwargs,
+        )
+
+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._proxy import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        # Await the async operation
+        results = await client.create(self._obj.tolist())
+
+        return pd.Series(
+            results,
+            index=self._obj.index,
+            name=self._obj.name,
         )
 
     async def embeddings(
@@ -997,10 +1018,6 @@ class AsyncOpenAIVecSeriesAccessor:
                 show_progress=True
             )
             ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
 
         Args:
             batch_size (int | None, optional): Number of inputs grouped into a
````
````diff
@@ -1023,15 +1040,79 @@ class AsyncOpenAIVecSeriesAccessor:
             ),
         )
 
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+            from openaivec._proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
+        )
+        # Await the async operation
+        results = await client.parse(self._obj.tolist(), **api_kwargs)
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
     async def task(
-        self, task: PreparedTask, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on every Series element (asynchronously).
 
-        This method applies a pre-configured task to each element in the Series,
-        using the task's instructions and response format to generate structured
-        responses from the language model.
-
         Example:
             ```python
             from openaivec._model import PreparedTask
@@ -1052,8 +1133,6 @@ class AsyncOpenAIVecSeriesAccessor:
                 show_progress=True
             )
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
@@ -1065,6 +1144,12 @@ class AsyncOpenAIVecSeriesAccessor:
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
         Returns:
             pandas.Series: Series whose values are instances of the task's
                 response format, aligned with the original Series index.
@@ -1077,6 +1162,7 @@ class AsyncOpenAIVecSeriesAccessor:
             cache=AsyncBatchingMapProxy(
                 batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
             ),
+            **api_kwargs,
         )
 
 
````
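Prepared tasks run the same way asynchronously; a sketch where `sentiment_task` is a stand-in constructed as in the diff's own examples, and an API key is assumed:

```python
import asyncio

import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- importing registers the .aio accessor
from openaivec._model import PreparedTask


async def main() -> None:
    sentiment_task = PreparedTask(...)  # construction elided, as in the diff's examples
    reviews = pd.Series(["Great product!", "Not satisfied"])
    results = await reviews.aio.task(
        sentiment_task,
        batch_size=32,
        max_concurrency=4,
        seed=42,  # forwarded verbatim to the Responses API
    )
    print(results)


asyncio.run(main())
```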
````diff
@@ -1094,27 +1180,15 @@ class AsyncOpenAIVecDataFrameAccessor:
         response_format: Type[ResponseFormat] = str,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after …
+        """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
 
         This method allows external control over caching behavior by accepting
         a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management. The concurrency
         is controlled by the cache instance itself.
 
-        Args:
-            instructions (str): System prompt for the assistant.
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
-                responses. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame's original index.
-
         Example:
             ```python
             from openaivec._proxy import AsyncBatchingMapProxy
@@ -1134,23 +1208,30 @@ class AsyncOpenAIVecDataFrameAccessor:
             )
             ```
 
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
         Note:
             This is an asynchronous method and must be awaited.
         """
-        series_of_json = self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
         # Await the call to the async Series method using .aio
-        return await series_of_json.aio.responses_with_cache(
+        return await _df_rows_to_json_series(self._obj).aio.responses_with_cache(
             instructions=instructions,
             cache=cache,
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
+            **api_kwargs,
         )
 
     async def responses(
````
@@ -1162,33 +1243,29 @@ class AsyncOpenAIVecDataFrameAccessor:
         top_p: float = 1.0,
         max_concurrency: int = 8,
         show_progress: bool = False,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON (asynchronously).

         Example:
             ```python
             df = pd.DataFrame([
-                {
-                {
-                {
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
             ])
             # Must be awaited
-            results = await df.aio.responses(
+            results = await df.aio.responses("what is the animal's name?")

             # With progress bar for large datasets
-            large_df = pd.DataFrame({
+            large_df = pd.DataFrame({"id": list(range(1000))})
             results = await large_df.aio.responses(
-
+                "generate a name for this ID",
                 batch_size=20,
                 max_concurrency=4,
                 show_progress=True
             )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.

         Args:
             instructions (str): System prompt for the assistant.
@@ -1197,7 +1274,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
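The annotation widening to ``float | None`` implies ``temperature=None`` is now accepted (presumably meaning the parameter is omitted from the request; the hunk changes only the annotation). A hedged sketch combining it with a structured ``response_format``, where `Animal` is a hypothetical model:

```python
# Sketch: Animal is hypothetical; temperature=None semantics are inferred
# from the annotation change above, not stated in this diff.
import asyncio
import pandas as pd
import openaivec.pandas_ext  # noqa: F401  (assumed to register the .aio accessor)
from pydantic import BaseModel

class Animal(BaseModel):
    name: str
    legs: int

async def main() -> None:
    df = pd.DataFrame([{"name": "cat", "legs": 4}, {"name": "dog", "legs": 4}])
    parsed = await df.aio.responses(
        "Echo this row back as an Animal.",
        response_format=Animal,
        temperature=None,
        max_concurrency=4,
    )
    print(parsed.iloc[0])

asyncio.run(main())
```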
@@ -1217,17 +1294,47 @@ class AsyncOpenAIVecDataFrameAccessor:
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
+            **api_kwargs,
         )

-    async def task(
-        self,
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
     ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row
+        """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
+
+        After serializing each row to JSON, this method executes the prepared task.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )

-
-
-
-
+    async def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).

         Example:
             ```python
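The new `task_with_cache` mirrors `responses_with_cache`. How a `PreparedTask` is built is not shown in this diff, so the sketch below takes one as an argument and only demonstrates the call shape and cache sharing:

```python
# Sketch: my_task stands in for a PreparedTask constructed elsewhere; the
# AsyncBatchingMapProxy constructor signature is likewise assumed.
import pandas as pd
from openaivec._proxy import AsyncBatchingMapProxy

async def run_shared(df_a: pd.DataFrame, df_b: pd.DataFrame, my_task) -> None:
    cache = AsyncBatchingMapProxy(batch_size=None)
    # Rows that serialize to identical JSON strings are answered once and
    # then served from the shared cache across both calls.
    r_a = await df_a.aio.task_with_cache(task=my_task, cache=cache, seed=7)
    r_b = await df_b.aio.task_with_cache(task=my_task, cache=cache, seed=7)
    print(len(r_a), len(r_b))
```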
@@ -1253,8 +1360,6 @@ class AsyncOpenAIVecDataFrameAccessor:
                 show_progress=True
             )
             ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.

         Args:
             task (PreparedTask): A pre-configured task containing instructions,
@@ -1266,6 +1371,12 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
         Returns:
             pandas.Series: Series whose values are instances of the task's
                 response format, aligned with the DataFrame's original index.
@@ -1273,28 +1384,33 @@ class AsyncOpenAIVecDataFrameAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-        series_of_json = self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
         # Await the call to the async Series method using .aio
-        return await series_of_json.aio.task(
+        return await _df_rows_to_json_series(self._obj).aio.task(
             task=task,
             batch_size=batch_size,
             max_concurrency=max_concurrency,
             show_progress=show_progress,
+            **api_kwargs,
         )

     async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
-        """
-        Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
+        """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.

         This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
         but with support for asynchronous functions.

+        Example:
+            ```python
+            async def process_data(df):
+                # Simulate an asynchronous computation
+                await asyncio.sleep(1)
+                return df.dropna()
+
+            df = pd.DataFrame({"col": [1, 2, None, 4]})
+            # Must be awaited
+            result = await df.aio.pipe(process_data)
+            ```
+
         Args:
             func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
                 as input and returns either a result or an awaitable result.
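The removed block above is the same inline pipeline deleted earlier from `responses_with_cache`; `_df_rows_to_json_series` now centralises it. Its observable behaviour can be checked without any API calls:

```python
# Exactly the removed pipeline: one JSON object string per row, index preserved.
import json
import pandas as pd

df = pd.DataFrame([{"name": "cat", "legs": 4}, {"name": "犬", "legs": 4}])
records = pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
    lambda x: json.dumps(x, ensure_ascii=False)
)
print(records.iloc[0])  # {"name": "cat", "legs": 4}
print(records.iloc[1])  # ensure_ascii=False keeps "犬" literal, not \u72ac
```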
@@ -1371,7 +1487,13 @@ class AsyncOpenAIVecDataFrameAccessor:
             return df_current

     async def fillna(
-        self,
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).

@@ -1391,6 +1513,11 @@ class AsyncOpenAIVecDataFrameAccessor:
                 optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying task execution.

         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
@@ -1406,6 +1533,15 @@ class AsyncOpenAIVecDataFrameAccessor:

             # Fill missing values in the 'name' column (must be awaited)
             filled_df = await df.aio.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = await large_df.aio.fillna(
+                'name',
+                batch_size=32,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```

         Note:
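With the expanded signature, a single `fillna` call can combine progress reporting with pass-through API parameters. A short sketch (the `max_examples` meaning is inferred from its name; `seed` forwarding follows the "Additional Keyword Args" note above):

```python
# Sketch: max_examples semantics are assumed from the parameter name.
import asyncio
import pandas as pd
import openaivec.pandas_ext  # noqa: F401  (assumed to register the .aio accessor)

async def main() -> None:
    df = pd.DataFrame({"name": ["Alice", None, "Carol"], "age": [30, 25, 35]})
    filled = await df.aio.fillna(
        "name",
        max_examples=100,    # assumed: cap on example rows the model sees
        batch_size=None,     # automatic batch sizing
        max_concurrency=4,
        show_progress=True,
        seed=0,              # forwarded to the underlying task execution
    )
    print(filled)

asyncio.run(main())
```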
@@ -1420,7 +1556,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             return self._obj

         filled_values: List[FillNaResponse] = await missing_rows.aio.task(
-            task=task, batch_size=batch_size, max_concurrency=max_concurrency
+            task=task, batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress, **api_kwargs
         )

         # get deep copy of the DataFrame to avoid modifying the original