openaivec 0.14.1__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_responses.py +77 -25
- openaivec/_schema.py +413 -0
- openaivec/_serialize.py +178 -181
- openaivec/pandas_ext.py +242 -140
- openaivec/spark.py +21 -1
- openaivec/task/table/fillna.py +2 -2
- {openaivec-0.14.1.dist-info → openaivec-0.14.3.dist-info}/METADATA +1 -1
- {openaivec-0.14.1.dist-info → openaivec-0.14.3.dist-info}/RECORD +10 -9
- {openaivec-0.14.1.dist-info → openaivec-0.14.3.dist-info}/WHEEL +0 -0
- {openaivec-0.14.1.dist-info → openaivec-0.14.3.dist-info}/licenses/LICENSE +0 -0
openaivec/pandas_ext.py
CHANGED
|
@@ -74,6 +74,21 @@ __all__ = [
|
|
|
74
74
|
_LOGGER = logging.getLogger(__name__)
|
|
75
75
|
|
|
76
76
|
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# Internal helpers (not exported)
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
|
|
81
|
+
"""Return a Series of JSON strings (UTF-8, no ASCII escaping) representing DataFrame rows.
|
|
82
|
+
|
|
83
|
+
Each element is the JSON serialisation of the corresponding row as a dict. Index and
|
|
84
|
+
name are preserved so downstream operations retain alignment. This consolidates the
|
|
85
|
+
previously duplicated inline pipeline used by responses*/task* DataFrame helpers.
|
|
86
|
+
"""
|
|
87
|
+
return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
|
|
88
|
+
lambda x: json.dumps(x, ensure_ascii=False)
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
|
|
77
92
|
T = TypeVar("T") # For pipe function return type
|
|
78
93
|
|
|
79
94
|
|
|
@@ -165,6 +180,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
165
180
|
response_format: Type[ResponseFormat] = str,
|
|
166
181
|
temperature: float | None = 0.0,
|
|
167
182
|
top_p: float = 1.0,
|
|
183
|
+
**api_kwargs,
|
|
168
184
|
) -> pd.Series:
|
|
169
185
|
client: BatchResponses = BatchResponses(
|
|
170
186
|
client=CONTAINER.resolve(OpenAI),
|
|
@@ -176,7 +192,8 @@ class OpenAIVecSeriesAccessor:
|
|
|
176
192
|
top_p=top_p,
|
|
177
193
|
)
|
|
178
194
|
|
|
179
|
-
|
|
195
|
+
# Forward any extra kwargs to the underlying Responses API.
|
|
196
|
+
return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
|
|
180
197
|
|
|
181
198
|
def embeddings_with_cache(
|
|
182
199
|
self,
|
|
@@ -229,6 +246,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
229
246
|
temperature: float | None = 0.0,
|
|
230
247
|
top_p: float = 1.0,
|
|
231
248
|
show_progress: bool = False,
|
|
249
|
+
**api_kwargs,
|
|
232
250
|
) -> pd.Series:
|
|
233
251
|
"""Call an LLM once for every Series element.
|
|
234
252
|
|
|
@@ -246,10 +264,6 @@ class OpenAIVecSeriesAccessor:
|
|
|
246
264
|
show_progress=True
|
|
247
265
|
)
|
|
248
266
|
```
|
|
249
|
-
This method returns a Series of strings, each containing the
|
|
250
|
-
assistant's response to the corresponding input.
|
|
251
|
-
The model used is set by the `responses_model` function.
|
|
252
|
-
The default model is `gpt-4.1-mini`.
|
|
253
267
|
|
|
254
268
|
Args:
|
|
255
269
|
instructions (str): System prompt prepended to every user message.
|
|
@@ -271,46 +285,41 @@ class OpenAIVecSeriesAccessor:
|
|
|
271
285
|
response_format=response_format,
|
|
272
286
|
temperature=temperature,
|
|
273
287
|
top_p=top_p,
|
|
288
|
+
**api_kwargs,
|
|
274
289
|
)
|
|
275
290
|
|
|
276
291
|
def task_with_cache(
|
|
277
292
|
self,
|
|
278
293
|
task: PreparedTask[ResponseFormat],
|
|
279
294
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
295
|
+
**api_kwargs,
|
|
280
296
|
) -> pd.Series:
|
|
281
297
|
"""Execute a prepared task on every Series element using a provided cache.
|
|
282
298
|
|
|
283
|
-
This
|
|
284
|
-
|
|
285
|
-
|
|
299
|
+
This mirrors ``responses_with_cache`` but uses the task's stored instructions,
|
|
300
|
+
response format, temperature and top_p. A supplied ``BatchingMapProxy`` enables
|
|
301
|
+
cross‑operation deduplicated reuse and external batch size / progress control.
|
|
286
302
|
|
|
287
303
|
Args:
|
|
288
|
-
task (PreparedTask):
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
304
|
+
task (PreparedTask): Prepared task (instructions + response_format + sampling params).
|
|
305
|
+
cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
|
|
306
|
+
|
|
307
|
+
Additional Keyword Args:
|
|
308
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
309
|
+
``seed``, etc.) forwarded verbatim to the underlying client. Core routing keys
|
|
310
|
+
(``model``, system instructions, user input) are managed internally and cannot be overridden.
|
|
293
311
|
|
|
294
312
|
Returns:
|
|
295
|
-
pandas.Series:
|
|
296
|
-
response format, aligned with the original Series index.
|
|
313
|
+
pandas.Series: Task results aligned with the original Series index.
|
|
297
314
|
|
|
298
315
|
Example:
|
|
299
316
|
```python
|
|
300
|
-
from openaivec._model import PreparedTask
|
|
301
317
|
from openaivec._proxy import BatchingMapProxy
|
|
302
|
-
|
|
303
|
-
# Create a shared cache with custom batch size
|
|
304
318
|
shared_cache = BatchingMapProxy(batch_size=64)
|
|
305
|
-
|
|
306
|
-
# Assume you have a prepared task for sentiment analysis
|
|
307
|
-
sentiment_task = PreparedTask(...)
|
|
308
|
-
|
|
309
|
-
reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
|
|
310
|
-
results = reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
|
|
319
|
+
reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
|
|
311
320
|
```
|
|
312
321
|
"""
|
|
313
|
-
client = BatchResponses(
|
|
322
|
+
client: BatchResponses = BatchResponses(
|
|
314
323
|
client=CONTAINER.resolve(OpenAI),
|
|
315
324
|
model_name=CONTAINER.resolve(ResponsesModelName).value,
|
|
316
325
|
system_message=task.instructions,
|
|
@@ -319,15 +328,17 @@ class OpenAIVecSeriesAccessor:
|
|
|
319
328
|
temperature=task.temperature,
|
|
320
329
|
top_p=task.top_p,
|
|
321
330
|
)
|
|
322
|
-
return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
|
|
331
|
+
return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
|
|
323
332
|
|
|
324
|
-
def task(
|
|
333
|
+
def task(
|
|
334
|
+
self,
|
|
335
|
+
task: PreparedTask,
|
|
336
|
+
batch_size: int | None = None,
|
|
337
|
+
show_progress: bool = False,
|
|
338
|
+
**api_kwargs,
|
|
339
|
+
) -> pd.Series:
|
|
325
340
|
"""Execute a prepared task on every Series element.
|
|
326
341
|
|
|
327
|
-
This method applies a pre-configured task to each element in the Series,
|
|
328
|
-
using the task's instructions and response format to generate structured
|
|
329
|
-
responses from the language model.
|
|
330
|
-
|
|
331
342
|
Example:
|
|
332
343
|
```python
|
|
333
344
|
from openaivec._model import PreparedTask
|
|
@@ -347,8 +358,6 @@ class OpenAIVecSeriesAccessor:
|
|
|
347
358
|
show_progress=True
|
|
348
359
|
)
|
|
349
360
|
```
|
|
350
|
-
This method returns a Series containing the task results for each
|
|
351
|
-
corresponding input element, following the task's defined structure.
|
|
352
361
|
|
|
353
362
|
Args:
|
|
354
363
|
task (PreparedTask): A pre-configured task containing instructions,
|
|
@@ -358,13 +367,19 @@ class OpenAIVecSeriesAccessor:
|
|
|
358
367
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
359
368
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
360
369
|
|
|
370
|
+
Additional Keyword Args:
|
|
371
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
372
|
+
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
373
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
374
|
+
library and cannot be overridden.
|
|
375
|
+
|
|
361
376
|
Returns:
|
|
362
|
-
pandas.Series: Series whose values are instances of the task's
|
|
363
|
-
response format, aligned with the original Series index.
|
|
377
|
+
pandas.Series: Series whose values are instances of the task's response format.
|
|
364
378
|
"""
|
|
365
379
|
return self.task_with_cache(
|
|
366
380
|
task=task,
|
|
367
381
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
382
|
+
**api_kwargs,
|
|
368
383
|
)
|
|
369
384
|
|
|
370
385
|
def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
|
|
@@ -383,10 +398,6 @@ class OpenAIVecSeriesAccessor:
|
|
|
383
398
|
show_progress=True
|
|
384
399
|
)
|
|
385
400
|
```
|
|
386
|
-
This method returns a Series of numpy arrays, each containing the
|
|
387
|
-
embedding vector for the corresponding input.
|
|
388
|
-
The embedding model is set by the `embeddings_model` function.
|
|
389
|
-
The default embedding model is `text-embedding-3-small`.
|
|
390
401
|
|
|
391
402
|
Args:
|
|
392
403
|
batch_size (int | None, optional): Number of inputs grouped into a
|
|
@@ -495,6 +506,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
495
506
|
response_format: Type[ResponseFormat] = str,
|
|
496
507
|
temperature: float | None = 0.0,
|
|
497
508
|
top_p: float = 1.0,
|
|
509
|
+
**api_kwargs,
|
|
498
510
|
) -> pd.Series:
|
|
499
511
|
"""Generate a response for each row after serialising it to JSON using a provided cache.
|
|
500
512
|
|
|
@@ -533,18 +545,13 @@ class OpenAIVecDataFrameAccessor:
|
|
|
533
545
|
)
|
|
534
546
|
```
|
|
535
547
|
"""
|
|
536
|
-
return self._obj.
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
response_format=response_format,
|
|
544
|
-
temperature=temperature,
|
|
545
|
-
top_p=top_p,
|
|
546
|
-
)
|
|
547
|
-
)
|
|
548
|
+
return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
|
|
549
|
+
instructions=instructions,
|
|
550
|
+
cache=cache,
|
|
551
|
+
response_format=response_format,
|
|
552
|
+
temperature=temperature,
|
|
553
|
+
top_p=top_p,
|
|
554
|
+
**api_kwargs,
|
|
548
555
|
)
|
|
549
556
|
|
|
550
557
|
def responses(
|
|
@@ -555,6 +562,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
555
562
|
temperature: float | None = 0.0,
|
|
556
563
|
top_p: float = 1.0,
|
|
557
564
|
show_progress: bool = False,
|
|
565
|
+
**api_kwargs,
|
|
558
566
|
) -> pd.Series:
|
|
559
567
|
"""Generate a response for each row after serialising it to JSON.
|
|
560
568
|
|
|
@@ -576,11 +584,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
576
584
|
show_progress=True
|
|
577
585
|
)
|
|
578
586
|
```
|
|
579
|
-
This method returns a Series of strings, each containing the
|
|
580
|
-
assistant's response to the corresponding input.
|
|
581
|
-
Each row is serialised to JSON before being sent to the assistant.
|
|
582
|
-
The model used is set by the `responses_model` function.
|
|
583
|
-
The default model is `gpt-4.1-mini`.
|
|
584
587
|
|
|
585
588
|
Args:
|
|
586
589
|
instructions (str): System prompt for the assistant.
|
|
@@ -602,16 +605,18 @@ class OpenAIVecDataFrameAccessor:
|
|
|
602
605
|
response_format=response_format,
|
|
603
606
|
temperature=temperature,
|
|
604
607
|
top_p=top_p,
|
|
608
|
+
**api_kwargs,
|
|
605
609
|
)
|
|
606
610
|
|
|
607
|
-
def task(
|
|
611
|
+
def task(
|
|
612
|
+
self,
|
|
613
|
+
task: PreparedTask,
|
|
614
|
+
batch_size: int | None = None,
|
|
615
|
+
show_progress: bool = False,
|
|
616
|
+
**api_kwargs,
|
|
617
|
+
) -> pd.Series:
|
|
608
618
|
"""Execute a prepared task on each DataFrame row after serialising it to JSON.
|
|
609
619
|
|
|
610
|
-
This method applies a pre-configured task to each row in the DataFrame,
|
|
611
|
-
using the task's instructions and response format to generate structured
|
|
612
|
-
responses from the language model. Each row is serialised to JSON before
|
|
613
|
-
being processed by the task.
|
|
614
|
-
|
|
615
620
|
Example:
|
|
616
621
|
```python
|
|
617
622
|
from openaivec._model import PreparedTask
|
|
@@ -624,10 +629,17 @@ class OpenAIVecDataFrameAccessor:
|
|
|
624
629
|
{"name": "dog", "legs": 4},
|
|
625
630
|
{"name": "elephant", "legs": 4},
|
|
626
631
|
])
|
|
632
|
+
# Basic usage
|
|
627
633
|
results = df.ai.task(analysis_task)
|
|
634
|
+
|
|
635
|
+
# With progress bar for large datasets
|
|
636
|
+
large_df = pd.DataFrame({"id": list(range(1000))})
|
|
637
|
+
results = large_df.ai.task(
|
|
638
|
+
analysis_task,
|
|
639
|
+
batch_size=50,
|
|
640
|
+
show_progress=True
|
|
641
|
+
)
|
|
628
642
|
```
|
|
629
|
-
This method returns a Series containing the task results for each
|
|
630
|
-
corresponding row, following the task's defined structure.
|
|
631
643
|
|
|
632
644
|
Args:
|
|
633
645
|
task (PreparedTask): A pre-configured task containing instructions,
|
|
@@ -637,19 +649,56 @@ class OpenAIVecDataFrameAccessor:
|
|
|
637
649
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
638
650
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
639
651
|
|
|
652
|
+
Additional Keyword Args:
|
|
653
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
654
|
+
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
655
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
656
|
+
library and cannot be overridden.
|
|
657
|
+
|
|
640
658
|
Returns:
|
|
641
659
|
pandas.Series: Series whose values are instances of the task's
|
|
642
660
|
response format, aligned with the DataFrame's original index.
|
|
643
661
|
"""
|
|
644
|
-
return self._obj.
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
662
|
+
return _df_rows_to_json_series(self._obj).ai.task(
|
|
663
|
+
task=task,
|
|
664
|
+
batch_size=batch_size,
|
|
665
|
+
show_progress=show_progress,
|
|
666
|
+
**api_kwargs,
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
def task_with_cache(
    self,
    task: PreparedTask[ResponseFormat],
    cache: BatchingMapProxy[str, ResponseFormat],
    **api_kwargs,
) -> pd.Series:
    """Run a prepared task on every DataFrame row, using a caller-supplied cache.

    Each row is first serialised to a JSON string; the resulting Series is
    then handed to the Series accessor's ``task_with_cache``.

    Args:
        task (PreparedTask): Prepared task (instructions + response_format + sampling params).
        cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache instance.

    Additional Keyword Args:
        Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``,
        ``presence_penalty``, ``seed``) passed through unchanged to the
        Series-level call.

    Returns:
        pandas.Series: Task results aligned with the DataFrame's original index.
    """
    # Serialise rows once, then delegate to the Series implementation.
    json_rows = _df_rows_to_json_series(self._obj)
    return json_rows.ai.task_with_cache(task=task, cache=cache, **api_kwargs)
|
|
651
693
|
|
|
652
|
-
def fillna(
|
|
694
|
+
def fillna(
|
|
695
|
+
self,
|
|
696
|
+
target_column_name: str,
|
|
697
|
+
max_examples: int = 500,
|
|
698
|
+
batch_size: int | None = None,
|
|
699
|
+
show_progress: bool = False,
|
|
700
|
+
**api_kwargs,
|
|
701
|
+
) -> pd.DataFrame:
|
|
653
702
|
"""Fill missing values in a DataFrame column using AI-powered inference.
|
|
654
703
|
|
|
655
704
|
This method uses machine learning to intelligently fill missing (NaN) values
|
|
@@ -666,6 +715,11 @@ class OpenAIVecDataFrameAccessor:
|
|
|
666
715
|
batch_size (int | None, optional): Number of requests sent in one batch
|
|
667
716
|
to optimize API usage. Defaults to ``None`` (automatic batch size
|
|
668
717
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
718
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
719
|
+
|
|
720
|
+
Additional Keyword Args:
|
|
721
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
722
|
+
``seed``, etc.) are forwarded verbatim to the underlying task execution.
|
|
669
723
|
|
|
670
724
|
Returns:
|
|
671
725
|
pandas.DataFrame: A new DataFrame with missing values filled in the target
|
|
@@ -681,6 +735,10 @@ class OpenAIVecDataFrameAccessor:
|
|
|
681
735
|
|
|
682
736
|
# Fill missing values in the 'name' column
|
|
683
737
|
filled_df = df.ai.fillna('name')
|
|
738
|
+
|
|
739
|
+
# With progress bar for large datasets
|
|
740
|
+
large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
|
|
741
|
+
filled_df = large_df.ai.fillna('name', batch_size=32, show_progress=True)
|
|
684
742
|
```
|
|
685
743
|
|
|
686
744
|
Note:
|
|
@@ -693,7 +751,9 @@ class OpenAIVecDataFrameAccessor:
|
|
|
693
751
|
if missing_rows.empty:
|
|
694
752
|
return self._obj
|
|
695
753
|
|
|
696
|
-
filled_values: List[FillNaResponse] = missing_rows.ai.task(
|
|
754
|
+
filled_values: List[FillNaResponse] = missing_rows.ai.task(
|
|
755
|
+
task=task, batch_size=batch_size, show_progress=show_progress, **api_kwargs
|
|
756
|
+
)
|
|
697
757
|
|
|
698
758
|
# get deep copy of the DataFrame to avoid modifying the original
|
|
699
759
|
df = self._obj.copy()
|
|
@@ -754,6 +814,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
754
814
|
response_format: Type[ResponseFormat] = str,
|
|
755
815
|
temperature: float | None = 0.0,
|
|
756
816
|
top_p: float = 1.0,
|
|
817
|
+
**api_kwargs,
|
|
757
818
|
) -> pd.Series:
|
|
758
819
|
"""Call an LLM once for every Series element using a provided cache (asynchronously).
|
|
759
820
|
|
|
@@ -769,24 +830,24 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
769
830
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
770
831
|
response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
|
|
771
832
|
type the assistant should return. Defaults to ``str``.
|
|
772
|
-
temperature (float, optional): Sampling temperature.
|
|
833
|
+
temperature (float | None, optional): Sampling temperature. ``None`` omits the
|
|
834
|
+
parameter (recommended for reasoning models). Defaults to ``0.0``.
|
|
773
835
|
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
836
|
+
**api_kwargs: Additional keyword arguments forwarded verbatim to
|
|
837
|
+
``AsyncOpenAI.responses.parse`` (e.g. ``max_output_tokens``, penalties,
|
|
838
|
+
future parameters). Core batching keys (model, instructions, input,
|
|
839
|
+
text_format) are protected and silently ignored if provided.
|
|
774
840
|
|
|
775
841
|
Returns:
|
|
776
842
|
pandas.Series: Series whose values are instances of ``response_format``.
|
|
777
843
|
|
|
778
844
|
Example:
|
|
779
845
|
```python
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
animals = pd.Series(["cat", "dog", "elephant"])
|
|
786
|
-
# Must be awaited
|
|
787
|
-
result = await animals.aio.responses_with_cache(
|
|
788
|
-
"translate to French",
|
|
789
|
-
cache=shared_cache
|
|
846
|
+
result = await series.aio.responses_with_cache(
|
|
847
|
+
"classify",
|
|
848
|
+
cache=shared,
|
|
849
|
+
max_output_tokens=256,
|
|
850
|
+
frequency_penalty=0.2,
|
|
790
851
|
)
|
|
791
852
|
```
|
|
792
853
|
|
|
@@ -802,9 +863,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
802
863
|
temperature=temperature,
|
|
803
864
|
top_p=top_p,
|
|
804
865
|
)
|
|
805
|
-
|
|
806
|
-
results = await client.parse(self._obj.tolist())
|
|
807
|
-
|
|
866
|
+
results = await client.parse(self._obj.tolist(), **api_kwargs)
|
|
808
867
|
return pd.Series(results, index=self._obj.index, name=self._obj.name)
|
|
809
868
|
|
|
810
869
|
async def embeddings_with_cache(
|
|
@@ -864,6 +923,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
864
923
|
self,
|
|
865
924
|
task: PreparedTask[ResponseFormat],
|
|
866
925
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
926
|
+
**api_kwargs,
|
|
867
927
|
) -> pd.Series:
|
|
868
928
|
"""Execute a prepared task on every Series element using a provided cache (asynchronously).
|
|
869
929
|
|
|
@@ -879,6 +939,12 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
879
939
|
instance for managing API call batching and deduplication.
|
|
880
940
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
881
941
|
|
|
942
|
+
Additional Keyword Args:
|
|
943
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
944
|
+
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
945
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
946
|
+
library and cannot be overridden.
|
|
947
|
+
|
|
882
948
|
Returns:
|
|
883
949
|
pandas.Series: Series whose values are instances of the task's
|
|
884
950
|
response format, aligned with the original Series index.
|
|
@@ -911,9 +977,8 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
911
977
|
temperature=task.temperature,
|
|
912
978
|
top_p=task.top_p,
|
|
913
979
|
)
|
|
914
|
-
|
|
915
980
|
# Await the async operation
|
|
916
|
-
results = await client.parse(self._obj.tolist())
|
|
981
|
+
results = await client.parse(self._obj.tolist(), **api_kwargs)
|
|
917
982
|
|
|
918
983
|
return pd.Series(results, index=self._obj.index, name=self._obj.name)
|
|
919
984
|
|
|
@@ -926,6 +991,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
926
991
|
top_p: float = 1.0,
|
|
927
992
|
max_concurrency: int = 8,
|
|
928
993
|
show_progress: bool = False,
|
|
994
|
+
**api_kwargs,
|
|
929
995
|
) -> pd.Series:
|
|
930
996
|
"""Call an LLM once for every Series element (asynchronously).
|
|
931
997
|
|
|
@@ -944,10 +1010,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
944
1010
|
show_progress=True
|
|
945
1011
|
)
|
|
946
1012
|
```
|
|
947
|
-
This method returns a Series of strings, each containing the
|
|
948
|
-
assistant's response to the corresponding input.
|
|
949
|
-
The model used is set by the `responses_model` function.
|
|
950
|
-
The default model is `gpt-4.1-mini`.
|
|
951
1013
|
|
|
952
1014
|
Args:
|
|
953
1015
|
instructions (str): System prompt prepended to every user message.
|
|
@@ -976,6 +1038,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
976
1038
|
response_format=response_format,
|
|
977
1039
|
temperature=temperature,
|
|
978
1040
|
top_p=top_p,
|
|
1041
|
+
**api_kwargs,
|
|
979
1042
|
)
|
|
980
1043
|
|
|
981
1044
|
async def embeddings(
|
|
@@ -997,10 +1060,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
997
1060
|
show_progress=True
|
|
998
1061
|
)
|
|
999
1062
|
```
|
|
1000
|
-
This method returns a Series of numpy arrays, each containing the
|
|
1001
|
-
embedding vector for the corresponding input.
|
|
1002
|
-
The embedding model is set by the `embeddings_model` function.
|
|
1003
|
-
The default embedding model is `text-embedding-3-small`.
|
|
1004
1063
|
|
|
1005
1064
|
Args:
|
|
1006
1065
|
batch_size (int | None, optional): Number of inputs grouped into a
|
|
@@ -1024,14 +1083,15 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1024
1083
|
)
|
|
1025
1084
|
|
|
1026
1085
|
async def task(
|
|
1027
|
-
self,
|
|
1086
|
+
self,
|
|
1087
|
+
task: PreparedTask,
|
|
1088
|
+
batch_size: int | None = None,
|
|
1089
|
+
max_concurrency: int = 8,
|
|
1090
|
+
show_progress: bool = False,
|
|
1091
|
+
**api_kwargs,
|
|
1028
1092
|
) -> pd.Series:
|
|
1029
1093
|
"""Execute a prepared task on every Series element (asynchronously).
|
|
1030
1094
|
|
|
1031
|
-
This method applies a pre-configured task to each element in the Series,
|
|
1032
|
-
using the task's instructions and response format to generate structured
|
|
1033
|
-
responses from the language model.
|
|
1034
|
-
|
|
1035
1095
|
Example:
|
|
1036
1096
|
```python
|
|
1037
1097
|
from openaivec._model import PreparedTask
|
|
@@ -1052,8 +1112,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1052
1112
|
show_progress=True
|
|
1053
1113
|
)
|
|
1054
1114
|
```
|
|
1055
|
-
This method returns a Series containing the task results for each
|
|
1056
|
-
corresponding input element, following the task's defined structure.
|
|
1057
1115
|
|
|
1058
1116
|
Args:
|
|
1059
1117
|
task (PreparedTask): A pre-configured task containing instructions,
|
|
@@ -1065,6 +1123,12 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1065
1123
|
requests. Defaults to 8.
|
|
1066
1124
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1067
1125
|
|
|
1126
|
+
Additional Keyword Args:
|
|
1127
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
1128
|
+
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
1129
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
1130
|
+
library and cannot be overridden.
|
|
1131
|
+
|
|
1068
1132
|
Returns:
|
|
1069
1133
|
pandas.Series: Series whose values are instances of the task's
|
|
1070
1134
|
response format, aligned with the original Series index.
|
|
@@ -1077,6 +1141,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1077
1141
|
cache=AsyncBatchingMapProxy(
|
|
1078
1142
|
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1079
1143
|
),
|
|
1144
|
+
**api_kwargs,
|
|
1080
1145
|
)
|
|
1081
1146
|
|
|
1082
1147
|
|
|
@@ -1094,6 +1159,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1094
1159
|
response_format: Type[ResponseFormat] = str,
|
|
1095
1160
|
temperature: float | None = 0.0,
|
|
1096
1161
|
top_p: float = 1.0,
|
|
1162
|
+
**api_kwargs,
|
|
1097
1163
|
) -> pd.Series:
|
|
1098
1164
|
"""Generate a response for each row after serialising it to JSON using a provided cache (asynchronously).
|
|
1099
1165
|
|
|
@@ -1137,20 +1203,14 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1137
1203
|
Note:
|
|
1138
1204
|
This is an asynchronous method and must be awaited.
|
|
1139
1205
|
"""
|
|
1140
|
-
series_of_json = self._obj.pipe(
|
|
1141
|
-
lambda df: (
|
|
1142
|
-
pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
|
|
1143
|
-
lambda x: json.dumps(x, ensure_ascii=False)
|
|
1144
|
-
)
|
|
1145
|
-
)
|
|
1146
|
-
)
|
|
1147
1206
|
# Await the call to the async Series method using .aio
|
|
1148
|
-
return await
|
|
1207
|
+
return await _df_rows_to_json_series(self._obj).aio.responses_with_cache(
|
|
1149
1208
|
instructions=instructions,
|
|
1150
1209
|
cache=cache,
|
|
1151
1210
|
response_format=response_format,
|
|
1152
1211
|
temperature=temperature,
|
|
1153
1212
|
top_p=top_p,
|
|
1213
|
+
**api_kwargs,
|
|
1154
1214
|
)
|
|
1155
1215
|
|
|
1156
1216
|
async def responses(
|
|
@@ -1162,33 +1222,29 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1162
1222
|
top_p: float = 1.0,
|
|
1163
1223
|
max_concurrency: int = 8,
|
|
1164
1224
|
show_progress: bool = False,
|
|
1225
|
+
**api_kwargs,
|
|
1165
1226
|
) -> pd.Series:
|
|
1166
1227
|
"""Generate a response for each row after serialising it to JSON (asynchronously).
|
|
1167
1228
|
|
|
1168
1229
|
Example:
|
|
1169
1230
|
```python
|
|
1170
1231
|
df = pd.DataFrame([
|
|
1171
|
-
{
|
|
1172
|
-
{
|
|
1173
|
-
{
|
|
1232
|
+
{"name": "cat", "legs": 4},
|
|
1233
|
+
{"name": "dog", "legs": 4},
|
|
1234
|
+
{"name": "elephant", "legs": 4},
|
|
1174
1235
|
])
|
|
1175
1236
|
# Must be awaited
|
|
1176
|
-
results = await df.aio.responses(
|
|
1237
|
+
results = await df.aio.responses("what is the animal's name?")
|
|
1177
1238
|
|
|
1178
1239
|
# With progress bar for large datasets
|
|
1179
|
-
large_df = pd.DataFrame({
|
|
1240
|
+
large_df = pd.DataFrame({"id": list(range(1000))})
|
|
1180
1241
|
results = await large_df.aio.responses(
|
|
1181
|
-
|
|
1242
|
+
"generate a name for this ID",
|
|
1182
1243
|
batch_size=20,
|
|
1183
1244
|
max_concurrency=4,
|
|
1184
1245
|
show_progress=True
|
|
1185
1246
|
)
|
|
1186
1247
|
```
|
|
1187
|
-
This method returns a Series of strings, each containing the
|
|
1188
|
-
assistant's response to the corresponding input.
|
|
1189
|
-
Each row is serialised to JSON before being sent to the assistant.
|
|
1190
|
-
The model used is set by the `responses_model` function.
|
|
1191
|
-
The default model is `gpt-4.1-mini`.
|
|
1192
1248
|
|
|
1193
1249
|
Args:
|
|
1194
1250
|
instructions (str): System prompt for the assistant.
|
|
@@ -1217,18 +1273,19 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1217
1273
|
response_format=response_format,
|
|
1218
1274
|
temperature=temperature,
|
|
1219
1275
|
top_p=top_p,
|
|
1276
|
+
**api_kwargs,
|
|
1220
1277
|
)
|
|
1221
1278
|
|
|
1222
1279
|
async def task(
|
|
1223
|
-
self,
|
|
1280
|
+
self,
|
|
1281
|
+
task: PreparedTask,
|
|
1282
|
+
batch_size: int | None = None,
|
|
1283
|
+
max_concurrency: int = 8,
|
|
1284
|
+
show_progress: bool = False,
|
|
1285
|
+
**api_kwargs,
|
|
1224
1286
|
) -> pd.Series:
|
|
1225
1287
|
"""Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).
|
|
1226
1288
|
|
|
1227
|
-
This method applies a pre-configured task to each row in the DataFrame,
|
|
1228
|
-
using the task's instructions and response format to generate structured
|
|
1229
|
-
responses from the language model. Each row is serialised to JSON before
|
|
1230
|
-
being processed by the task.
|
|
1231
|
-
|
|
1232
1289
|
Example:
|
|
1233
1290
|
```python
|
|
1234
1291
|
from openaivec._model import PreparedTask
|
|
@@ -1253,8 +1310,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1253
1310
|
show_progress=True
|
|
1254
1311
|
)
|
|
1255
1312
|
```
|
|
1256
|
-
This method returns a Series containing the task results for each
|
|
1257
|
-
corresponding row, following the task's defined structure.
|
|
1258
1313
|
|
|
1259
1314
|
Args:
|
|
1260
1315
|
task (PreparedTask): A pre-configured task containing instructions,
|
|
@@ -1266,6 +1321,12 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1266
1321
|
requests. Defaults to 8.
|
|
1267
1322
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1268
1323
|
|
|
1324
|
+
Additional Keyword Args:
|
|
1325
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
1326
|
+
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
1327
|
+
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
1328
|
+
library and cannot be overridden.
|
|
1329
|
+
|
|
1269
1330
|
Returns:
|
|
1270
1331
|
pandas.Series: Series whose values are instances of the task's
|
|
1271
1332
|
response format, aligned with the DataFrame's original index.
|
|
@@ -1273,19 +1334,40 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1273
1334
|
Note:
|
|
1274
1335
|
This is an asynchronous method and must be awaited.
|
|
1275
1336
|
"""
|
|
1276
|
-
series_of_json = self._obj.pipe(
|
|
1277
|
-
lambda df: (
|
|
1278
|
-
pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
|
|
1279
|
-
lambda x: json.dumps(x, ensure_ascii=False)
|
|
1280
|
-
)
|
|
1281
|
-
)
|
|
1282
|
-
)
|
|
1283
1337
|
# Await the call to the async Series method using .aio
|
|
1284
|
-
return await
|
|
1338
|
+
return await _df_rows_to_json_series(self._obj).aio.task(
|
|
1285
1339
|
task=task,
|
|
1286
1340
|
batch_size=batch_size,
|
|
1287
1341
|
max_concurrency=max_concurrency,
|
|
1288
1342
|
show_progress=show_progress,
|
|
1343
|
+
**api_kwargs,
|
|
1344
|
+
)
|
|
1345
|
+
|
|
1346
|
+
async def task_with_cache(
    self,
    task: PreparedTask[ResponseFormat],
    cache: AsyncBatchingMapProxy[str, ResponseFormat],
    **api_kwargs,
) -> pd.Series:
    """Run a prepared task on every DataFrame row, using a caller-supplied cache (async).

    Each row is first serialised to a JSON string; the resulting Series is
    then handed to the async Series accessor's ``task_with_cache``.

    Args:
        task (PreparedTask): Prepared task (instructions + response_format + sampling params).
        cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured async cache instance.

    Additional Keyword Args:
        Arbitrary OpenAI Responses API parameters passed through unchanged
        to the Series-level call.

    Returns:
        pandas.Series: Task results aligned with the DataFrame's original index.

    Note:
        This is an asynchronous method and must be awaited.
    """
    # Serialise rows once, then await the async Series implementation.
    json_rows = _df_rows_to_json_series(self._obj)
    return await json_rows.aio.task_with_cache(task=task, cache=cache, **api_kwargs)
|
|
1290
1372
|
|
|
1291
1373
|
async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
|
|
@@ -1371,7 +1453,13 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1371
1453
|
return df_current
|
|
1372
1454
|
|
|
1373
1455
|
async def fillna(
|
|
1374
|
-
self,
|
|
1456
|
+
self,
|
|
1457
|
+
target_column_name: str,
|
|
1458
|
+
max_examples: int = 500,
|
|
1459
|
+
batch_size: int | None = None,
|
|
1460
|
+
max_concurrency: int = 8,
|
|
1461
|
+
show_progress: bool = False,
|
|
1462
|
+
**api_kwargs,
|
|
1375
1463
|
) -> pd.DataFrame:
|
|
1376
1464
|
"""Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
|
|
1377
1465
|
|
|
@@ -1391,6 +1479,11 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1391
1479
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
1392
1480
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
1393
1481
|
requests. Defaults to 8.
|
|
1482
|
+
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1483
|
+
|
|
1484
|
+
Additional Keyword Args:
|
|
1485
|
+
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
1486
|
+
``seed``, etc.) are forwarded verbatim to the underlying task execution.
|
|
1394
1487
|
|
|
1395
1488
|
Returns:
|
|
1396
1489
|
pandas.DataFrame: A new DataFrame with missing values filled in the target
|
|
@@ -1406,6 +1499,15 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1406
1499
|
|
|
1407
1500
|
# Fill missing values in the 'name' column (must be awaited)
|
|
1408
1501
|
filled_df = await df.aio.fillna('name')
|
|
1502
|
+
|
|
1503
|
+
# With progress bar for large datasets
|
|
1504
|
+
large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
|
|
1505
|
+
filled_df = await large_df.aio.fillna(
|
|
1506
|
+
'name',
|
|
1507
|
+
batch_size=32,
|
|
1508
|
+
max_concurrency=4,
|
|
1509
|
+
show_progress=True
|
|
1510
|
+
)
|
|
1409
1511
|
```
|
|
1410
1512
|
|
|
1411
1513
|
Note:
|
|
@@ -1420,7 +1522,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1420
1522
|
return self._obj
|
|
1421
1523
|
|
|
1422
1524
|
filled_values: List[FillNaResponse] = await missing_rows.aio.task(
|
|
1423
|
-
task=task, batch_size=batch_size, max_concurrency=max_concurrency
|
|
1525
|
+
task=task, batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress, **api_kwargs
|
|
1424
1526
|
)
|
|
1425
1527
|
|
|
1426
1528
|
# get deep copy of the DataFrame to avoid modifying the original
|