openaivec 0.12.6__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/embeddings.py +79 -63
- openaivec/pandas_ext.py +449 -92
- openaivec/provider.py +3 -3
- openaivec/proxy.py +608 -0
- openaivec/responses.py +178 -108
- openaivec/serialize.py +41 -33
- openaivec/spark.py +109 -54
- openaivec/task/customer_support/__init__.py +3 -3
- openaivec/task/nlp/__init__.py +1 -1
- openaivec/task/table/__init__.py +1 -1
- openaivec/util.py +18 -80
- {openaivec-0.12.6.dist-info → openaivec-0.13.1.dist-info}/METADATA +38 -15
- {openaivec-0.12.6.dist-info → openaivec-0.13.1.dist-info}/RECORD +15 -14
- {openaivec-0.12.6.dist-info → openaivec-0.13.1.dist-info}/WHEEL +0 -0
- {openaivec-0.12.6.dist-info → openaivec-0.13.1.dist-info}/licenses/LICENSE +0 -0
openaivec/pandas_ext.py
CHANGED
@@ -17,8 +17,8 @@ pandas_ext.use(client)
 # Option 3: Use an existing Azure OpenAI client instance
 azure_client = AzureOpenAI(
     api_key="your-azure-key",
-    azure_endpoint="https
-    api_version="
+    azure_endpoint="https://<your-resource-name>.services.ai.azure.com",
+    api_version="2025-04-01-preview"
 )
 pandas_ext.use(azure_client)
@@ -49,6 +49,7 @@ from pydantic import BaseModel
 from .embeddings import AsyncBatchEmbeddings, BatchEmbeddings
 from .model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
 from .provider import CONTAINER
+from .proxy import AsyncBatchingMapProxy, BatchingMapProxy
 from .responses import AsyncBatchResponses, BatchResponses
 from .task.table import FillNaResponse, fillna
@@ -137,6 +138,68 @@ class OpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj

+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
+    def embeddings_with_cache(
+        self,
+        cache: BatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Example:
+            ```python
+            from openaivec.proxy import BatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy[str, np.ndarray](batch_size=64)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
+            ```
+        """
+        client: BatchEmbeddings = BatchEmbeddings(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        return pd.Series(
+            client.create(self._obj.tolist()),
+            index=self._obj.index,
+            name=self._obj.name,
+        )
+
     def responses(
         self,
         instructions: str,
@@ -169,20 +232,60 @@ class OpenAIVecSeriesAccessor:
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
-
-
-
-            system_message=instructions,
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
         )

-
-
-
-
+    def task_with_cache(
+        self,
+        task: PreparedTask,
+        cache: BatchingMapProxy[str, ResponseFormat],
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Example:
+            ```python
+            from openaivec.model import PreparedTask
+            from openaivec.proxy import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            results = reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+        """
+        client = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
         )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)

     def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
         """Execute a prepared task on every Series element.
@@ -214,14 +317,9 @@ class OpenAIVecSeriesAccessor:
             pandas.Series: Series whose values are instances of the task's
                 response format, aligned with the original Series index.
         """
-
-
-
-
-        return pd.Series(
-            client.parse(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+        return self.task_with_cache(
+            task=task,
+            cache=BatchingMapProxy(batch_size=batch_size),
         )

     def embeddings(self, batch_size: int = 128) -> pd.Series:
@@ -245,15 +343,8 @@ class OpenAIVecSeriesAccessor:
             pandas.Series: Series whose values are ``np.ndarray`` objects
                 (dtype ``float32``).
         """
-
-
-            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
-        )
-
-        return pd.Series(
-            client.create(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+        return self.embeddings_with_cache(
+            cache=BatchingMapProxy(batch_size=batch_size),
         )

     def count_tokens(self) -> pd.Series:
@@ -342,6 +433,64 @@ class OpenAIVecDataFrameAccessor:
             .pipe(lambda df: df.drop(columns=[column], axis=1))
         )

+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        """Generate a response for each row after serialising it to JSON using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
+        Example:
+            ```python
+            from openaivec.proxy import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
+            df = pd.DataFrame([
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
+            ])
+            result = df.ai.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
+            ```
+        """
+        return self._obj.pipe(
+            lambda df: (
+                df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
+                .map(lambda x: json.dumps(x, ensure_ascii=False))
+                .ai.responses_with_cache(
+                    instructions=instructions,
+                    cache=cache,
+                    response_format=response_format,
+                    temperature=temperature,
+                    top_p=top_p,
+                )
+            )
+        )
+
     def responses(
         self,
         instructions: str,
@@ -377,20 +526,14 @@ class OpenAIVecDataFrameAccessor:
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.

         Returns:
-            pandas.Series: Responses aligned with the DataFrame
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
-        return self.
-
-
-
-
-
-                    response_format=response_format,
-                    batch_size=batch_size,
-                    temperature=temperature,
-                    top_p=top_p,
-                )
-            )
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size),
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
         )

     def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
@@ -496,6 +639,30 @@ class OpenAIVecDataFrameAccessor:
         return df

     def similarity(self, col1: str, col2: str) -> pd.Series:
+        """Compute cosine similarity between two columns containing embedding vectors.
+
+        This method calculates the cosine similarity between vectors stored in
+        two columns of the DataFrame. The vectors should be numpy arrays or
+        array-like objects that support dot product operations.
+
+        Args:
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
+
+        Returns:
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'vec1': [np.array([1, 0, 0]), np.array([0, 1, 0])],
+                'vec2': [np.array([1, 0, 0]), np.array([1, 1, 0])]
+            })
+            similarities = df.ai.similarity('vec1', 'vec2')
+            ```
+        """
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
             axis=1,
@@ -509,6 +676,173 @@ class AsyncOpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj

+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+
+        Example:
+            ```python
+            from openaivec.proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            result = await animals.aio.responses_with_cache(
+                "translate to French",
+                cache=shared_cache
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchResponses = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            temperature=temperature,
+            top_p=top_p,
+        )
+        # Await the async operation
+        results = await client.parse(self._obj.tolist())
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Example:
+            ```python
+            from openaivec.proxy import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        # Await the async operation
+        results = await client.create(self._obj.tolist())
+
+        return pd.Series(
+            results,
+            index=self._obj.index,
+            name=self._obj.name,
+        )
+
+    async def task_with_cache(
+        self,
+        task: PreparedTask,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Example:
+            ```python
+            from openaivec.model import PreparedTask
+            from openaivec.proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
+        )
+
+        # Await the async operation
+        results = await client.parse(self._obj.tolist())
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
     async def responses(
         self,
         instructions: str,
@@ -548,23 +882,12 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-
-
-            system_message=instructions,
+        return await self.responses_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
-            max_concurrency=max_concurrency,
-        )
-
-        # Await the async operation
-        results = await client.parse(self._obj.tolist(), batch_size=batch_size)
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
         )

     async def embeddings(self, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
@@ -594,19 +917,8 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-
-            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
-            max_concurrency=max_concurrency,
-        )
-
-        # Await the async operation
-        results = await client.create(self._obj.tolist(), batch_size=batch_size)
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
+        return await self.embeddings_with_cache(
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
         )

     async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
@@ -645,20 +957,9 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
            This is an asynchronous method and must be awaited.
         """
-
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(ResponsesModelName).value,
+        return await self.task_with_cache(
             task=task,
-            max_concurrency=max_concurrency,
-        )
-
-        # Await the async operation
-        results = await client.parse(self._obj.tolist(), batch_size=batch_size)
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
         )

@@ -669,6 +970,71 @@ class AsyncOpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj

+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: Type[ResponseFormat] = str,
+        temperature: float = 0.0,
+        top_p: float = 1.0,
+    ) -> pd.Series:
+        """Generate a response for each row after serialising it to JSON using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
+        Example:
+            ```python
+            from openaivec.proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            df = pd.DataFrame([
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
+            ])
+            # Must be awaited
+            result = await df.aio.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        series_of_json = self._obj.pipe(
+            lambda df: (
+                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+                    lambda x: json.dumps(x, ensure_ascii=False)
+                )
+            )
+        )
+        # Await the call to the async Series method using .aio
+        return await series_of_json.aio.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
     async def responses(
         self,
         instructions: str,
@@ -708,26 +1074,17 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to ``8``.

         Returns:
-            pandas.Series: Responses aligned with the DataFrame
+            pandas.Series: Responses aligned with the DataFrame's original index.

         Note:
             This is an asynchronous method and must be awaited.
         """
-
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
-        # Await the call to the async Series method using .aio
-        return await series_of_json.aio.responses(
+        return await self.responses_with_cache(
             instructions=instructions,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
             response_format=response_format,
-            batch_size=batch_size,
             temperature=temperature,
             top_p=top_p,
-            max_concurrency=max_concurrency,
         )

     async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
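The pandas_ext.py changes above funnel the existing `responses` / `embeddings` / `task` accessors through the new `*_with_cache` variants, so a single `BatchingMapProxy` (or `AsyncBatchingMapProxy`) can be shared across several pandas operations for batching and deduplication, as the new docstrings describe. The sketch below illustrates that pattern; it is not part of the diff and assumes an OpenAI API key, the package's default model names, and the usual `from openaivec import pandas_ext` import, with invented data and instructions.

```python
# Illustrative sketch only: client setup, data, and instructions are invented.
import pandas as pd
from openai import OpenAI
from openaivec import pandas_ext  # assumed import path for the accessor module
from openaivec.proxy import BatchingMapProxy

# Register the client once; model names fall back to the package defaults.
pandas_ext.use(OpenAI(api_key="your-api-key"))

# One proxy instance = one shared cache and one batching policy.
shared_cache = BatchingMapProxy(batch_size=64)

animals = pd.Series(["cat", "dog", "cat", "elephant"])

# Unique values are batched into API calls; duplicates such as "cat" are
# deduplicated by the cache, per the behaviour described in the docstrings.
kinds = animals.ai.responses_with_cache(
    "Answer 'pet' or 'wild' for the animal.",
    cache=shared_cache,
)

# Reusing the same cache (with the same instructions) in a later call lets
# previously seen values be served from the cache instead of the API.
more = pd.Series(["dog", "lion"])
more_kinds = more.ai.responses_with_cache(
    "Answer 'pet' or 'wild' for the animal.",
    cache=shared_cache,
)
```

The async accessors added in the same diff mirror this pattern: `await series.aio.responses_with_cache(...)` takes an `AsyncBatchingMapProxy(batch_size=..., max_concurrency=...)`, which also carries the concurrency limit.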
openaivec/provider.py
CHANGED
@@ -3,8 +3,8 @@ import os
 import tiktoken
 from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI

-from
-from
+from . import di
+from .model import (
     AzureOpenAIAPIKey,
     AzureOpenAIAPIVersion,
     AzureOpenAIEndpoint,
@@ -12,7 +12,7 @@ from openaivec.model import (
     OpenAIAPIKey,
     ResponsesModelName,
 )
-from
+from .util import TextChunker

 CONTAINER = di.Container()