openaivec 0.13.5__py3-none-any.whl → 0.13.7__py3-none-any.whl
This diff compares the published contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- openaivec/__init__.py +7 -2
- openaivec/di.py +2 -0
- openaivec/embeddings.py +10 -8
- openaivec/log.py +1 -1
- openaivec/model.py +12 -10
- openaivec/optimize.py +4 -2
- openaivec/pandas_ext.py +68 -42
- openaivec/prompt.py +58 -8
- openaivec/provider.py +12 -0
- openaivec/proxy.py +84 -65
- openaivec/responses.py +35 -18
- openaivec/serialize.py +1 -1
- openaivec/spark.py +49 -34
- openaivec/task/customer_support/inquiry_classification.py +9 -9
- openaivec/task/customer_support/urgency_analysis.py +13 -13
- openaivec/task/nlp/keyword_extraction.py +2 -2
- openaivec/task/nlp/named_entity_recognition.py +2 -2
- openaivec/util.py +4 -2
- {openaivec-0.13.5.dist-info → openaivec-0.13.7.dist-info}/METADATA +9 -9
- openaivec-0.13.7.dist-info/RECORD +35 -0
- openaivec-0.13.5.dist-info/RECORD +0 -35
- {openaivec-0.13.5.dist-info → openaivec-0.13.7.dist-info}/WHEEL +0 -0
- {openaivec-0.13.5.dist-info → openaivec-0.13.7.dist-info}/licenses/LICENSE +0 -0
openaivec/__init__.py
CHANGED
```diff
@@ -1,9 +1,14 @@
 from .embeddings import AsyncBatchEmbeddings, BatchEmbeddings
+from .model import PreparedTask
+from .prompt import FewShotPrompt, FewShotPromptBuilder
 from .responses import AsyncBatchResponses, BatchResponses
 
 __all__ = [
-    "
+    "AsyncBatchEmbeddings",
     "AsyncBatchResponses",
     "BatchEmbeddings",
-    "
+    "BatchResponses",
+    "FewShotPrompt",
+    "FewShotPromptBuilder",
+    "PreparedTask",
 ]
```
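Version 0.13.7 widens the package's public surface: the prompt-building and task types are now re-exported at the root. The import below is grounded directly in the new `__all__`:

```python
# All seven public names listed in the 0.13.7 __all__ resolve from the package root.
from openaivec import (
    AsyncBatchEmbeddings,
    AsyncBatchResponses,
    BatchEmbeddings,
    BatchResponses,
    FewShotPrompt,
    FewShotPromptBuilder,
    PreparedTask,
)
```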
openaivec/di.py
CHANGED
```diff
@@ -2,6 +2,8 @@ from dataclasses import dataclass, field
 from threading import RLock
 from typing import Any, Callable, Dict, Set, Type, TypeVar
 
+__all__ = []
+
 """Simple dependency injection container with singleton lifecycle management.
 
 This module provides a lightweight dependency injection container that manages
```
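The only change here is an empty `__all__`, which marks the container module as internal to star-imports while leaving explicit access untouched. A minimal sketch of the effect:

```python
# With __all__ = [], a wildcard import binds no names from the module...
from openaivec.di import *  # binds nothing

# ...but explicit imports and attribute access still work as before.
import openaivec.di as di
```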
openaivec/embeddings.py
CHANGED
```diff
@@ -31,16 +31,17 @@ class BatchEmbeddings:
 
     client: OpenAI
     model_name: str
-    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=
+    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))
 
     @classmethod
-    def of(cls, client: OpenAI, model_name: str, batch_size: int =
+    def of(cls, client: OpenAI, model_name: str, batch_size: int | None = None) -> "BatchEmbeddings":
         """Factory constructor.
 
         Args:
             client (OpenAI): OpenAI client.
             model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
-            batch_size (int, optional): Max unique inputs per API call. Defaults to
+            batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
+                (automatic batch size optimization). Set to a positive integer for fixed batch size.
 
         Returns:
             BatchEmbeddings: Configured instance backed by a batching proxy.
@@ -127,7 +128,7 @@ class AsyncBatchEmbeddings:
     client: AsyncOpenAI
     model_name: str
     cache: AsyncBatchingMapProxy[str, NDArray[np.float32]] = field(
-        default_factory=lambda: AsyncBatchingMapProxy(batch_size=
+        default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
     )
 
     @classmethod
@@ -135,7 +136,7 @@ class AsyncBatchEmbeddings:
         cls,
         client: AsyncOpenAI,
         model_name: str,
-        batch_size: int =
+        batch_size: int | None = None,
         max_concurrency: int = 8,
     ) -> "AsyncBatchEmbeddings":
         """Factory constructor.
@@ -143,7 +144,8 @@ class AsyncBatchEmbeddings:
         Args:
             client (AsyncOpenAI): OpenAI async client.
             model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
-            batch_size (int, optional): Max unique inputs per API call. Defaults to
+            batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
+                (automatic batch size optimization). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
 
         Returns:
@@ -155,8 +157,8 @@ class AsyncBatchEmbeddings:
             cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
         )
 
-    @observe(_LOGGER)
     @backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
+    @observe(_LOGGER)
    async def _embed_chunk(self, inputs: List[str]) -> List[NDArray[np.float32]]:
         """Embed one minibatch of strings asynchronously.
 
@@ -186,4 +188,4 @@ class AsyncBatchEmbeddings:
         Returns:
             List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
         """
-        return await self.cache.map(inputs, self._embed_chunk)
+        return await self.cache.map(inputs, self._embed_chunk)  # type: ignore[arg-type]
```
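The net effect for the embeddings classes: `batch_size` becomes `int | None` with a `None` default, and the swapped decorator order puts the retry wrapper (`backoff_async`) outside the logging wrapper (`observe`), so each retry attempt appears to be observed rather than only the overall call. A minimal construction sketch based on the factory signatures above (the model name is illustrative):

```python
from openai import OpenAI

from openaivec import BatchEmbeddings

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

# batch_size now defaults to None: the underlying BatchingMapProxy picks
# and adapts the batch size automatically.
auto = BatchEmbeddings.of(client=client, model_name="text-embedding-3-small")

# Passing a positive integer still fixes the request size explicitly.
fixed = BatchEmbeddings.of(client=client, model_name="text-embedding-3-small", batch_size=64)
```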
openaivec/log.py
CHANGED
openaivec/model.py
CHANGED
```diff
@@ -1,13 +1,15 @@
 from dataclasses import dataclass
-from typing import Type, TypeVar
+from typing import Generic, Type, TypeVar
 
-
+__all__ = [
+    "PreparedTask",
+]
 
-ResponseFormat = TypeVar("ResponseFormat"
+ResponseFormat = TypeVar("ResponseFormat")
 
 
 @dataclass(frozen=True)
-class PreparedTask:
+class PreparedTask(Generic[ResponseFormat]):
     """A data class representing a complete task configuration for OpenAI API calls.
 
     This class encapsulates all the necessary parameters for executing a task,
@@ -84,10 +86,10 @@ class OpenAIAPIKey:
     """Container for OpenAI API key configuration.
 
     Attributes:
-        value (str): The API key for OpenAI services.
+        value (str | None): The API key for OpenAI services.
     """
 
-    value: str
+    value: str | None
 
 
 @dataclass(frozen=True)
@@ -95,10 +97,10 @@ class AzureOpenAIAPIKey:
     """Container for Azure OpenAI API key configuration.
 
     Attributes:
-        value (str): The API key for Azure OpenAI services.
+        value (str | None): The API key for Azure OpenAI services.
     """
 
-    value: str
+    value: str | None
 
 
 @dataclass(frozen=True)
@@ -106,10 +108,10 @@ class AzureOpenAIBaseURL:
     """Container for Azure OpenAI base URL configuration.
 
     Attributes:
-        value (str): The base URL for Azure OpenAI services.
+        value (str | None): The base URL for Azure OpenAI services.
    """
 
-    value: str
+    value: str | None
```
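With `PreparedTask` now generic over `ResponseFormat`, the response type can flow through annotations to wherever task results land. A sketch of the typing benefit (only the class's genericity is visible in this diff; its fields are not, so construction is omitted):

```python
from pydantic import BaseModel

from openaivec import PreparedTask


class Sentiment(BaseModel):
    label: str
    score: float


def run(task: PreparedTask[Sentiment]) -> None:
    # A type checker can now tie results produced from `task` to Sentiment
    # instead of treating every task's output as an unknown type.
    ...
```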
openaivec/optimize.py
CHANGED
```diff
@@ -5,6 +5,8 @@ from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import List
 
+__all__ = []
+
 
 @dataclass(frozen=True)
 class PerformanceMetric:
@@ -20,8 +22,8 @@ class BatchSizeSuggester:
     min_batch_size: int = 10
     min_duration: float = 30.0
     max_duration: float = 60.0
-    step_ratio: float = 0.
-    sample_size: int =
+    step_ratio: float = 0.2
+    sample_size: int = 4
     _history: List[PerformanceMetric] = field(default_factory=list)
     _lock: threading.RLock = field(default_factory=threading.RLock, repr=False)
     _batch_size_changed_at: datetime | None = field(default=None, init=False)
```
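The field names suggest a duration-window heuristic: average the last `sample_size` batch durations and nudge the batch size by `step_ratio` to keep batches between `min_duration` and `max_duration` (30 to 60 seconds). The diff does not show the actual update rule, so the sketch below is only an assumption about how these fields plausibly combine:

```python
def suggest(current_size: int, durations: list[float]) -> int:
    """Hypothetical update rule implied by BatchSizeSuggester's fields."""
    min_batch_size, min_duration, max_duration = 10, 30.0, 60.0
    step_ratio, sample_size = 0.2, 4  # the 0.13.7 defaults restored above

    if len(durations) < sample_size:
        return current_size  # not enough samples yet
    avg = sum(durations[-sample_size:]) / sample_size
    if avg < min_duration:
        # Batches finish too quickly: grow by the step ratio.
        return int(current_size * (1 + step_ratio))
    if avg > max_duration:
        # Batches run too long: shrink, but never below the floor.
        return max(min_batch_size, int(current_size * (1 - step_ratio)))
    return current_size  # inside the target window, leave unchanged
```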
openaivec/pandas_ext.py
CHANGED
```diff
@@ -42,12 +42,19 @@ to easily interact with OpenAI APIs for tasks like generating responses or embed
 import inspect
 import json
 import logging
-from typing import
+from typing import Awaitable, Callable, List, Type, TypeVar
 
 import numpy as np
 import pandas as pd
 import tiktoken
 from openai import AsyncOpenAI, OpenAI
+
+__all__ = [
+    "embeddings_model",
+    "responses_model",
+    "use",
+    "use_async",
+]
 from pydantic import BaseModel
 
 from openaivec.embeddings import AsyncBatchEmbeddings, BatchEmbeddings
```
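The new `__all__` makes the module's four configuration helpers explicit. A typical setup sketch follows; the helpers' exact signatures are not visible in this diff, so the client-and-model-name shapes below are assumptions inferred from the names:

```python
from openai import OpenAI

from openaivec import pandas_ext

pandas_ext.use(OpenAI())                               # register the client (assumed signature)
pandas_ext.responses_model("gpt-4.1-mini")             # default chat model (illustrative name)
pandas_ext.embeddings_model("text-embedding-3-small")  # default embedding model
```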
```diff
@@ -184,6 +191,7 @@ class OpenAIVecSeriesAccessor:
         Args:
             cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -217,7 +225,7 @@ class OpenAIVecSeriesAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         show_progress: bool = False,
@@ -247,8 +255,9 @@ class OpenAIVecSeriesAccessor:
             instructions (str): System prompt prepended to every user message.
             response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
```
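A usage sketch for the updated `responses` signature. Only the async accessor's registration as `aio` is visible in this excerpt, so the sync accessor name `ai` used below is an assumption:

```python
import pandas as pd

s = pd.Series(["good morning", "good night"])

# batch_size defaults to None: batches grow or shrink automatically
# based on observed execution time.
french = s.ai.responses("Translate the text into French.")

# A positive integer pins the batch size when reproducibility matters.
french_fixed = s.ai.responses("Translate the text into French.", batch_size=32)
```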
```diff
@@ -266,7 +275,7 @@ class OpenAIVecSeriesAccessor:
 
     def task_with_cache(
         self,
-        task: PreparedTask,
+        task: PreparedTask[ResponseFormat],
         cache: BatchingMapProxy[str, ResponseFormat],
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache.
@@ -280,6 +289,7 @@ class OpenAIVecSeriesAccessor:
                 response format, and other parameters for processing the inputs.
             cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -311,7 +321,7 @@ class OpenAIVecSeriesAccessor:
         )
         return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
 
-    def task(self, task: PreparedTask, batch_size: int =
+    def task(self, task: PreparedTask, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
         """Execute a prepared task on every Series element.
 
         This method applies a pre-configured task to each element in the Series,
@@ -343,8 +353,9 @@ class OpenAIVecSeriesAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
         Returns:
```
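The `task` entry point now pairs with the generic `PreparedTask` and needs no explicit batch size. The file summary above lists prebuilt tasks under `openaivec/task/`, but their import paths are not shown in this excerpt, so the task below is left as an opaque parameter (sync accessor name `ai` assumed, as before):

```python
import pandas as pd

from openaivec import PreparedTask


def classify(s: pd.Series, task: PreparedTask) -> pd.Series:
    # No batch_size argument: the new None default lets the proxy adapt it.
    # show_progress=True renders a progress bar in Jupyter notebooks.
    return s.ai.task(task, show_progress=True)
```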
```diff
@@ -356,7 +367,7 @@ class OpenAIVecSeriesAccessor:
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
         )
 
-    def embeddings(self, batch_size: int =
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
         """Compute OpenAI embeddings for every Series element.
 
         Example:
@@ -378,8 +389,9 @@ class OpenAIVecSeriesAccessor:
         The default embedding model is `text-embedding-3-small`.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
         Returns:
```
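Embeddings follow the same pattern; per the docstring, the default model is `text-embedding-3-small` and each value comes back as an `np.ndarray` (sync accessor name `ai` assumed, as above):

```python
import numpy as np
import pandas as pd

s = pd.Series(["apple", "banana", "cherry"])

vectors = s.ai.embeddings(show_progress=True)  # batch size chosen automatically
assert isinstance(vectors.iloc[0], np.ndarray)
```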
```diff
@@ -494,6 +506,7 @@ class OpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
@@ -538,7 +551,7 @@ class OpenAIVecDataFrameAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         show_progress: bool = False,
@@ -573,8 +586,9 @@ class OpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -590,7 +604,7 @@ class OpenAIVecDataFrameAccessor:
             top_p=top_p,
         )
 
-    def task(self, task: PreparedTask, batch_size: int =
+    def task(self, task: PreparedTask, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serialising it to JSON.
 
         This method applies a pre-configured task to each row in the DataFrame,
@@ -618,8 +632,9 @@ class OpenAIVecDataFrameAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
         Returns:
@@ -634,7 +649,7 @@ class OpenAIVecDataFrameAccessor:
             )
         )
 
-    def fillna(self, target_column_name: str, max_examples: int = 500, batch_size: int =
+    def fillna(self, target_column_name: str, max_examples: int = 500, batch_size: int | None = None) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference.
 
         This method uses machine learning to intelligently fill missing (NaN) values
@@ -648,8 +663,9 @@ class OpenAIVecDataFrameAccessor:
             max_examples (int, optional): The maximum number of example rows to use
                 for context when predicting missing values. Higher values may improve
                 accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
 
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
```
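A `fillna` sketch matching the signature above (accessor name `ai` assumed; column contents illustrative):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "name": ["cake", "coffee", "tea"],
        "category": ["food", None, "drink"],
    }
)

# Predicts the missing "category" from up to 500 example rows; the new
# batch_size=None default lets the proxy size requests automatically.
filled = df.ai.fillna("category")
```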
```diff
@@ -721,7 +737,7 @@ class OpenAIVecDataFrameAccessor:
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
             axis=1,
-        ).rename("similarity")
+        ).rename("similarity")  # type: ignore[arg-type]
 
 
 @pd.api.extensions.register_series_accessor("aio")
```
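The lambda above is plain row-wise cosine similarity between two embedding columns; the 0.13.7 change only appends a `# type: ignore[arg-type]` for the type checker. The same computation, standalone:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "a": [np.array([1.0, 0.0]), np.array([0.0, 1.0])],
        "b": [np.array([1.0, 0.0]), np.array([1.0, 0.0])],
    }
)

# Row-wise cosine similarity: dot product over the product of norms.
similarity = df.apply(
    lambda row: np.dot(row["a"], row["b"]) / (np.linalg.norm(row["a"]) * np.linalg.norm(row["b"])),
    axis=1,
).rename("similarity")

print(similarity.tolist())  # [1.0, 0.0]: identical vs. orthogonal vectors
```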
```diff
@@ -750,6 +766,7 @@ class AsyncOpenAIVecSeriesAccessor:
             instructions (str): System prompt prepended to every user message.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
@@ -804,6 +821,7 @@ class AsyncOpenAIVecSeriesAccessor:
         Args:
             cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -844,7 +862,7 @@ class AsyncOpenAIVecSeriesAccessor:
 
     async def task_with_cache(
         self,
-        task: PreparedTask,
+        task: PreparedTask[ResponseFormat],
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache (asynchronously).
@@ -859,6 +877,7 @@ class AsyncOpenAIVecSeriesAccessor:
                 response format, and other parameters for processing the inputs.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -902,7 +921,7 @@ class AsyncOpenAIVecSeriesAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         max_concurrency: int = 8,
@@ -934,8 +953,9 @@ class AsyncOpenAIVecSeriesAccessor:
             instructions (str): System prompt prepended to every user message.
             response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
@@ -959,7 +979,7 @@ class AsyncOpenAIVecSeriesAccessor:
         )
 
     async def embeddings(
-        self, batch_size: int =
+        self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element (asynchronously).
 
@@ -983,8 +1003,9 @@ class AsyncOpenAIVecSeriesAccessor:
         The default embedding model is `text-embedding-3-small`.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -1003,7 +1024,7 @@ class AsyncOpenAIVecSeriesAccessor:
         )
 
     async def task(
-        self, task: PreparedTask, batch_size: int =
+        self, task: PreparedTask, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
         """Execute a prepared task on every Series element (asynchronously).
 
@@ -1037,8 +1058,9 @@ class AsyncOpenAIVecSeriesAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
```
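The async Series accessor (registered as `aio` above) gets the same `batch_size: int | None = None` treatment, with `max_concurrency` capping in-flight requests. A sketch:

```python
import asyncio

import pandas as pd


async def main() -> None:
    s = pd.Series(["good morning", "good night"])
    # batch_size is omitted (None): sizes adapt automatically, while at most
    # eight requests are in flight at once.
    out = await s.aio.responses("Translate the text into French.", max_concurrency=8)
    print(out.tolist())


asyncio.run(main())
```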
```diff
@@ -1084,6 +1106,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
@@ -1134,7 +1157,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         max_concurrency: int = 8,
@@ -1171,8 +1194,9 @@ class AsyncOpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
@@ -1196,7 +1220,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         )
 
     async def task(
-        self, task: PreparedTask, batch_size: int =
+        self, task: PreparedTask, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).
 
@@ -1235,8 +1259,9 @@ class AsyncOpenAIVecDataFrameAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
```
````diff
@@ -1286,7 +1311,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         else:
             return result
 
-    async def assign(self, **kwargs
+    async def assign(self, **kwargs) -> pd.DataFrame:
         """Asynchronously assign new columns to the DataFrame, evaluating sequentially.
 
         This method extends pandas' `assign` method by supporting asynchronous
@@ -1321,7 +1346,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         ```
 
         Args:
-            **kwargs:
+            **kwargs: Column names as keys and either static values or callables
                 (synchronous or asynchronous) as values.
 
         Returns:
@@ -1346,7 +1371,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         return df_current
 
     async def fillna(
-        self, target_column_name: str, max_examples: int = 500, batch_size: int =
+        self, target_column_name: str, max_examples: int = 500, batch_size: int | None = None, max_concurrency: int = 8
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
 
@@ -1361,8 +1386,9 @@ class AsyncOpenAIVecDataFrameAccessor:
             max_examples (int, optional): The maximum number of example rows to use
                 for context when predicting missing values. Higher values may improve
                 accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
 
````
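The fixed `assign` signature now declares its `pd.DataFrame` return, and the `**kwargs` docstring spells out that values may be static or sync/async callables, evaluated sequentially. A sketch; the DataFrame accessor name is assumed to mirror the Series `aio` registration shown earlier, and passing the in-progress DataFrame to each callable is assumed from the "extends pandas' assign" description:

```python
import asyncio

import pandas as pd


async def main() -> None:
    df = pd.DataFrame({"text": ["good morning", "good night"]})

    async def french(d: pd.DataFrame) -> pd.Series:
        # Async callables receive the DataFrame built so far, like pandas' assign.
        return await d["text"].aio.responses("Translate the text into French.")

    # Static values and callables mix freely; columns are evaluated in order.
    out = await df.aio.assign(source="en", french=french)
    print(out)


asyncio.run(main())
```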