openaivec 0.12.5__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/embeddings.py +76 -60
- openaivec/model.py +20 -0
- openaivec/pandas_ext.py +455 -121
- openaivec/provider.py +67 -14
- openaivec/proxy.py +608 -0
- openaivec/responses.py +175 -105
- openaivec/serialize.py +41 -33
- openaivec/spark.py +137 -88
- openaivec/task/customer_support/__init__.py +3 -3
- openaivec/task/nlp/__init__.py +1 -1
- openaivec/task/table/__init__.py +1 -1
- openaivec/util.py +1 -69
- {openaivec-0.12.5.dist-info → openaivec-0.13.0.dist-info}/METADATA +39 -16
- {openaivec-0.12.5.dist-info → openaivec-0.13.0.dist-info}/RECORD +16 -15
- {openaivec-0.12.5.dist-info → openaivec-0.13.0.dist-info}/WHEEL +0 -0
- {openaivec-0.12.5.dist-info → openaivec-0.13.0.dist-info}/licenses/LICENSE +0 -0
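The most consequential addition is the new `openaivec/proxy.py` module (+608 lines): the `BatchingMapProxy` and `AsyncBatchingMapProxy` classes it introduces appear throughout the `pandas_ext.py` hunks below as the `cache=` argument of `BatchResponses`, `BatchEmbeddings`, and their async counterparts. As a rough mental model only (an editorial sketch, not the library's implementation), such a proxy deduplicates inputs, calls an expensive mapping function in fixed-size batches, and memoises the results so repeated values are never re-sent:

```python
from typing import Callable, Dict, Generic, Hashable, List, TypeVar

K = TypeVar("K", bound=Hashable)
V = TypeVar("V")


class TinyBatchingMap(Generic[K, V]):
    """Editorial stand-in for the idea behind BatchingMapProxy (not the real class)."""

    def __init__(self, batch_size: int = 128) -> None:
        self.batch_size = batch_size
        self._cache: Dict[K, V] = {}

    def map(self, func: Callable[[List[K]], List[V]], items: List[K]) -> List[V]:
        # Deduplicate while preserving first-seen order; skip keys already cached.
        pending = [k for k in dict.fromkeys(items) if k not in self._cache]
        # Call the expensive function once per fixed-size batch of uncached keys.
        for start in range(0, len(pending), self.batch_size):
            batch = pending[start : start + self.batch_size]
            for key, value in zip(batch, func(batch)):
                self._cache[key] = value
        # Every input, duplicates included, is answered from the cache.
        return [self._cache[k] for k in items]
```

Whether `proxy.py` uses exactly this strategy is not shown in this summary; the hunks below only demonstrate how the proxies are constructed (`batch_size`, plus `max_concurrency` for the async variant) and passed around.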
openaivec/pandas_ext.py
CHANGED
@@ -17,8 +17,8 @@ pandas_ext.use(client)
 # Option 3: Use an existing Azure OpenAI client instance
 azure_client = AzureOpenAI(
     api_key="your-azure-key",
-    azure_endpoint="https
-    api_version="
+    azure_endpoint="https://<your-resource-name>.services.ai.azure.com",
+    api_version="2025-04-01-preview"
 )
 pandas_ext.use(azure_client)
 
@@ -46,10 +46,10 @@ import tiktoken
 from openai import AsyncOpenAI, OpenAI
 from pydantic import BaseModel
 
-from .di import Container
 from .embeddings import AsyncBatchEmbeddings, BatchEmbeddings
 from .model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
-from .provider import
+from .provider import CONTAINER
+from .proxy import AsyncBatchingMapProxy, BatchingMapProxy
 from .responses import AsyncBatchResponses, BatchResponses
 from .task.table import FillNaResponse, fillna
 
@@ -65,27 +65,6 @@ _LOGGER = logging.getLogger(__name__)
 
 T = TypeVar("T")  # For pipe function return type
 
-_DI = Container()
-_DI.register(OpenAI, provide_openai_client)
-_DI.register(AsyncOpenAI, provide_async_openai_client)
-_DI.register(ResponsesModelName, lambda: ResponsesModelName("gpt-4.1-mini"))
-_DI.register(EmbeddingsModelName, lambda: EmbeddingsModelName("text-embedding-3-small"))
-
-
-def _provide_tiktoken_encoding() -> tiktoken.Encoding:
-    model_name = _DI.resolve(ResponsesModelName).value
-    try:
-        return tiktoken.encoding_for_model(model_name)
-    except KeyError:
-        _LOGGER.info(
-            "The model name '%s' is not supported by tiktoken. Using 'o200k_base' encoding instead.",
-            model_name,
-        )
-        return tiktoken.get_encoding("o200k_base")
-
-
-_DI.register(tiktoken.Encoding, _provide_tiktoken_encoding)
-
 
 def use(client: OpenAI) -> None:
     """Register a custom OpenAI‑compatible client.
@@ -95,7 +74,7 @@ def use(client: OpenAI) -> None:
             `openai.AzureOpenAI` instance.
             The same instance is reused by every helper in this module.
     """
-
+    CONTAINER.register(OpenAI, lambda: client)
 
 
 def use_async(client: AsyncOpenAI) -> None:
@@ -106,7 +85,7 @@ def use_async(client: AsyncOpenAI) -> None:
             `openai.AsyncAzureOpenAI` instance.
             The same instance is reused by every helper in this module.
     """
-
+    CONTAINER.register(AsyncOpenAI, lambda: client)
 
 
 def responses_model(name: str) -> None:
@@ -116,8 +95,7 @@ def responses_model(name: str) -> None:
         name (str): Model name as listed in the OpenAI API
             (for example, ``gpt-4.1-mini``).
     """
-
-    _DI.register(tiktoken.Encoding, _provide_tiktoken_encoding)
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
 
 
 def embeddings_model(name: str) -> None:
@@ -126,7 +104,7 @@ def embeddings_model(name: str) -> None:
     Args:
         name (str): Embedding model name, e.g. ``text-embedding-3-small``.
     """
-
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
 
 
 def _extract_value(x, series_name):
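Taken together, the hunks above drop the module-level `_DI` container and tiktoken provider and instead register the defaults on the shared `provider.CONTAINER`; the public helpers now simply overwrite those registrations. A minimal usage sketch of the helpers touched by these hunks (the client construction and model names are illustrative, taken from the removed defaults):

```python
from openai import OpenAI
from openaivec import pandas_ext

# Re-register the client and model names used by every accessor in this module.
pandas_ext.use(OpenAI())                                # CONTAINER.register(OpenAI, ...)
pandas_ext.responses_model("gpt-4.1-mini")              # CONTAINER.register(ResponsesModelName, ...)
pandas_ext.embeddings_model("text-embedding-3-small")   # CONTAINER.register(EmbeddingsModelName, ...)
```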
@@ -160,6 +138,68 @@ class OpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
+    def embeddings_with_cache(
+        self,
+        cache: BatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Example:
+            ```python
+            from openaivec.proxy import BatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy[str, np.ndarray](batch_size=64)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
+            ```
+        """
+        client: BatchEmbeddings = BatchEmbeddings(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        return pd.Series(
+            client.create(self._obj.tolist()),
+            index=self._obj.index,
+            name=self._obj.name,
+        )
+
     def responses(
         self,
         instructions: str,
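The new `responses_with_cache` above is the building block that the plain `responses()` delegates to in the next hunk; its value is that one proxy can be shared across calls so duplicate inputs are answered from the cache instead of being re-sent. An illustrative sketch (assumes a client has been registered via `pandas_ext.use` and uses the same instructions for both calls so cached entries stay reusable):

```python
import pandas as pd

from openaivec.proxy import BatchingMapProxy

shared_cache = BatchingMapProxy(batch_size=64)

batch_1 = pd.Series(["cat", "dog"])
batch_2 = pd.Series(["dog", "elephant"])  # "dog" overlaps with batch_1

fr_1 = batch_1.ai.responses_with_cache("Translate to French.", cache=shared_cache)
# On the second call the overlapping "dog" entry should come from shared_cache.
fr_2 = batch_2.ai.responses_with_cache("Translate to French.", cache=shared_cache)
```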
@@ -192,20 +232,60 @@ class OpenAIVecSeriesAccessor:
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
-
-
-
-            system_message=instructions,
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
         )
 
-
-
-
-
+    def task_with_cache(
+        self,
+        task: PreparedTask,
+        cache: BatchingMapProxy[str, ResponseFormat],
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Example:
+            ```python
+            from openaivec.model import PreparedTask
+            from openaivec.proxy import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            results = reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+        """
+        client = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
         )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
 
     def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
         """Execute a prepared task on every Series element.
@@ -237,14 +317,9 @@ class OpenAIVecSeriesAccessor:
             pandas.Series: Series whose values are instances of the task's
                 response format, aligned with the original Series index.
         """
-
-
-
-
-        return pd.Series(
-            client.parse(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+        return self.task_with_cache(
+            task=task,
+            cache=BatchingMapProxy(batch_size=batch_size),
         )
 
     def embeddings(self, batch_size: int = 128) -> pd.Series:
@@ -268,15 +343,8 @@ class OpenAIVecSeriesAccessor:
             pandas.Series: Series whose values are ``np.ndarray`` objects
                 (dtype ``float32``).
         """
-
-
-            model_name=_DI.resolve(EmbeddingsModelName).value,
-        )
-
-        return pd.Series(
-            client.create(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+        return self.embeddings_with_cache(
+            cache=BatchingMapProxy(batch_size=batch_size),
         )
 
     def count_tokens(self) -> pd.Series:
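With the three hunks above, `responses()`, `task()`, and `embeddings()` become thin wrappers that build a throwaway `BatchingMapProxy(batch_size=batch_size)` per call, so the convenience form and the explicit form below should behave the same; the explicit form just keeps the proxy (and its cache) around for reuse:

```python
import pandas as pd

from openaivec.proxy import BatchingMapProxy

animals = pd.Series(["cat", "dog", "elephant"])

# Convenience form: a private BatchingMapProxy(batch_size=64) is created internally.
emb_a = animals.ai.embeddings(batch_size=64)

# Explicit form: same call path, but the proxy can be shared with later operations.
proxy = BatchingMapProxy(batch_size=64)
emb_b = animals.ai.embeddings_with_cache(cache=proxy)
```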
@@ -293,7 +361,7 @@ class OpenAIVecSeriesAccessor:
         Returns:
             pandas.Series: Token counts for each element.
         """
-        encoding: tiktoken.Encoding =
+        encoding: tiktoken.Encoding = CONTAINER.resolve(tiktoken.Encoding)
         return self._obj.map(encoding.encode).map(len).rename("num_tokens")
 
     def extract(self) -> pd.DataFrame:
@@ -365,6 +433,64 @@ class OpenAIVecDataFrameAccessor:
             .pipe(lambda df: df.drop(columns=[column], axis=1))
         )
 
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        """Generate a response for each row after serialising it to JSON using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
+        Example:
+            ```python
+            from openaivec.proxy import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
+            df = pd.DataFrame([
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
+            ])
+            result = df.ai.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
+            ```
+        """
+        return self._obj.pipe(
+            lambda df: (
+                df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
+                .map(lambda x: json.dumps(x, ensure_ascii=False))
+                .ai.responses_with_cache(
+                    instructions=instructions,
+                    cache=cache,
+                    response_format=response_format,
+                    temperature=temperature,
+                    top_p=top_p,
+                )
+            )
+        )
+
     def responses(
         self,
         instructions: str,
@@ -400,20 +526,14 @@ class OpenAIVecDataFrameAccessor:
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
 
         Returns:
-            pandas.Series: Responses aligned with the DataFrame
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
-        return self.
-
-
-
-
-
-            response_format=response_format,
-            batch_size=batch_size,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        )
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size),
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
         )
 
     def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
@@ -519,6 +639,30 @@ class OpenAIVecDataFrameAccessor:
         return df
 
     def similarity(self, col1: str, col2: str) -> pd.Series:
+        """Compute cosine similarity between two columns containing embedding vectors.
+
+        This method calculates the cosine similarity between vectors stored in
+        two columns of the DataFrame. The vectors should be numpy arrays or
+        array-like objects that support dot product operations.
+
+        Args:
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
+
+        Returns:
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'vec1': [np.array([1, 0, 0]), np.array([0, 1, 0])],
+                'vec2': [np.array([1, 0, 0]), np.array([1, 1, 0])]
+            })
+            similarities = df.ai.similarity('vec1', 'vec2')
+            ```
+        """
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
             axis=1,
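Because `embeddings()` stores one `float32` vector per row and `similarity()` only needs dot products and norms, the two accessors compose directly; a short end-to-end sketch (column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"left": ["cat", "kitten"], "right": ["dog", "cat"]})

# Embed both text columns, then score row-wise cosine similarity.
df["left_vec"] = df["left"].ai.embeddings(batch_size=64)
df["right_vec"] = df["right"].ai.embeddings(batch_size=64)
df["cosine"] = df.ai.similarity("left_vec", "right_vec")
```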
@@ -532,6 +676,173 @@ class AsyncOpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+
+        Example:
+            ```python
+            from openaivec.proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            result = await animals.aio.responses_with_cache(
+                "translate to French",
+                cache=shared_cache
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchResponses = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            temperature=temperature,
+            top_p=top_p,
+        )
+        # Await the async operation
+        results = await client.parse(self._obj.tolist())
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Example:
+            ```python
+            from openaivec.proxy import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        # Await the async operation
+        results = await client.create(self._obj.tolist())
+
+        return pd.Series(
+            results,
+            index=self._obj.index,
+            name=self._obj.name,
+        )
+
+    async def task_with_cache(
+        self,
+        task: PreparedTask,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Example:
+            ```python
+            from openaivec.model import PreparedTask
+            from openaivec.proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
+        )
+
+        # Await the async operation
+        results = await client.parse(self._obj.tolist())
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
     async def responses(
         self,
         instructions: str,
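Because the async `*_with_cache` variants take their batch size and `max_concurrency` from the proxy rather than from per-call arguments, independent Series can share one `AsyncBatchingMapProxy` and be awaited together; a sketch assuming an async client was registered with `pandas_ext.use_async` (whether a single proxy may be used from concurrent awaits is an assumption here, inferred from its role as a shared cache):

```python
import asyncio

import pandas as pd

from openaivec.proxy import AsyncBatchingMapProxy


async def main() -> None:
    # One proxy carries the batch size, the concurrency limit, and the shared cache.
    cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)

    tickets = pd.Series(["Refund please", "Love it"])
    reviews = pd.Series(["Love it", "Too slow"])  # "Love it" overlaps with tickets

    ticket_labels, review_labels = await asyncio.gather(
        tickets.aio.responses_with_cache("Classify the sentiment.", cache=cache),
        reviews.aio.responses_with_cache("Classify the sentiment.", cache=cache),
    )
    print(ticket_labels, review_labels)


asyncio.run(main())
```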
@@ -571,23 +882,12 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-
-
-            system_message=instructions,
+        return await self.responses_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
-            max_concurrency=max_concurrency,
-        )
-
-        # Await the async operation
-        results = await client.parse(self._obj.tolist(), batch_size=batch_size)
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
         )
 
     async def embeddings(self, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
@@ -617,19 +917,8 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-
-            model_name=_DI.resolve(EmbeddingsModelName).value,
-            max_concurrency=max_concurrency,
-        )
-
-        # Await the async operation
-        results = await client.create(self._obj.tolist(), batch_size=batch_size)
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
+        return await self.embeddings_with_cache(
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
         )
 
     async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
@@ -668,20 +957,9 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-            client=_DI.resolve(AsyncOpenAI),
-            model_name=_DI.resolve(ResponsesModelName).value,
+        return await self.task_with_cache(
             task=task,
-            max_concurrency=max_concurrency,
-        )
-
-        # Await the async operation
-        results = await client.parse(self._obj.tolist(), batch_size=batch_size)
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
         )
 
 
@@ -692,6 +970,71 @@ class AsyncOpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        """Generate a response for each row after serialising it to JSON using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
+        Example:
+            ```python
+            from openaivec.proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            df = pd.DataFrame([
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
+            ])
+            # Must be awaited
+            result = await df.aio.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        series_of_json = self._obj.pipe(
+            lambda df: (
+                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+                    lambda x: json.dumps(x, ensure_ascii=False)
+                )
+            )
+        )
+        # Await the call to the async Series method using .aio
+        return await series_of_json.aio.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
     async def responses(
         self,
         instructions: str,
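The async DataFrame accessor mirrors the sync one: each row is serialised with `json.dumps` and forwarded to the Series-level `responses_with_cache`. Since `response_format` accepts a Pydantic model (the module already imports `BaseModel`), row-level structured extraction would look roughly like this; the `AnimalFacts` schema is purely illustrative:

```python
import asyncio

import pandas as pd
from pydantic import BaseModel

from openaivec.proxy import AsyncBatchingMapProxy


class AnimalFacts(BaseModel):
    # Illustrative response schema; not part of the library.
    name: str
    is_pet: bool


async def main() -> None:
    df = pd.DataFrame([{"name": "cat", "legs": 4}, {"name": "elephant", "legs": 4}])
    facts = await df.aio.responses_with_cache(
        "Describe the animal in this JSON record.",
        cache=AsyncBatchingMapProxy(batch_size=32, max_concurrency=4),
        response_format=AnimalFacts,  # each value in the returned Series is an AnimalFacts
    )
    print(facts.tolist())


asyncio.run(main())
```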
@@ -731,26 +1074,17 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to ``8``.
 
         Returns:
-            pandas.Series: Responses aligned with the DataFrame
+            pandas.Series: Responses aligned with the DataFrame's original index.
 
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
-        # Await the call to the async Series method using .aio
-        return await series_of_json.aio.responses(
+        return await self.responses_with_cache(
             instructions=instructions,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
             response_format=response_format,
-            batch_size=batch_size,
             temperature=temperature,
             top_p=top_p,
-            max_concurrency=max_concurrency,
         )
 
     async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series: