openaivec 0.14.3__py3-none-any.whl → 0.14.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_provider.py +15 -0
- openaivec/_proxy.py +24 -2
- openaivec/_schema.py +47 -6
- openaivec/pandas_ext.py +844 -329
- {openaivec-0.14.3.dist-info → openaivec-0.14.5.dist-info}/METADATA +1 -1
- {openaivec-0.14.3.dist-info → openaivec-0.14.5.dist-info}/RECORD +8 -8
- {openaivec-0.14.3.dist-info → openaivec-0.14.5.dist-info}/WHEEL +0 -0
- {openaivec-0.14.3.dist-info → openaivec-0.14.5.dist-info}/licenses/LICENSE +0 -0
openaivec/pandas_ext.py
CHANGED
@@ -49,6 +49,8 @@ import pandas as pd
 import tiktoken
 from openai import AsyncOpenAI, OpenAI
 
+from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
+
 __all__ = [
     "embeddings_model",
     "responses_model",
@@ -182,6 +184,27 @@ class OpenAIVecSeriesAccessor:
         top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
         client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
@@ -195,6 +218,56 @@ class OpenAIVecSeriesAccessor:
         # Forward any extra kwargs to the underlying Responses API.
         return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
 
+    def responses(
+        self,
+        instructions: str,
+        response_format: Type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element.
+
+        Example:
+            ```python
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
+            animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
     def embeddings_with_cache(
         self,
         cache: BatchingMapProxy[str, np.ndarray],
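
The relocated ``responses`` method wraps ``responses_with_cache`` with a fresh ``BatchingMapProxy`` per call. A minimal sketch of structured output through it, assuming a configured OpenAI client; the ``Translation`` model and prompt below are illustrative, not part of the package:

```python
import pandas as pd
from pydantic import BaseModel

class Translation(BaseModel):  # hypothetical response format
    french: str
    spanish: str

animals = pd.Series(["cat", "dog", "elephant"])
# Prompts are batched and deduplicated; results keep the original index.
out = animals.ai.responses(
    "translate the word",
    response_format=Translation,
    batch_size=32,  # or None for automatic batch sizing
)
print(out.iloc[0].french)
```
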
@@ -205,15 +278,6 @@ class OpenAIVecSeriesAccessor:
         a pre-configured BatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management.
 
-        Args:
-            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-
         Example:
             ```python
             from openaivec._proxy import BatchingMapProxy
@@ -225,6 +289,15 @@ class OpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
             ```
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
         client: BatchEmbeddings = BatchEmbeddings(
             client=CONTAINER.resolve(OpenAI),
@@ -238,54 +311,35 @@ class OpenAIVecSeriesAccessor:
             name=self._obj.name,
         )
 
-    def responses(
-        self,
-        instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int | None = None,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
-        show_progress: bool = False,
-        **api_kwargs,
-    ) -> pd.Series:
-        """Call an LLM once for every Series element.
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
             # Basic usage
-            animals.ai.responses("translate to French")
+            animals.ai.embeddings()
 
-            # With progress bar in Jupyter notebooks
-            large_series = pd.Series(["data"] * 1000)
-            large_series.ai.responses(
-                "analyze this data",
-                batch_size=32,
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
                 show_progress=True
             )
             ```
 
         Args:
-            instructions (str): System prompt prepended to every user message.
-            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
-                type the assistant should return. Defaults to ``str``.
-            batch_size (int | None, optional): Number of prompts grouped into a single
-                request. Defaults to ``None`` (automatic batch size optimization
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
         Returns:
-            pandas.Series: Series whose values are instances of ``response_format``.
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
-        return self.responses_with_cache(
-            instructions=instructions,
+        return self.embeddings_with_cache(
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
-            response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
-            **api_kwargs,
         )
 
     def task_with_cache(
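
Because each returned value is a float32 ``np.ndarray``, downstream vector math stays in plain NumPy. A short sketch of the documented return type in use (it still needs a configured API key to run):

```python
import numpy as np
import pandas as pd

animals = pd.Series(["cat", "dog", "elephant"])
vectors = animals.ai.embeddings()        # one float32 ndarray per row

matrix = np.stack(vectors.to_numpy())    # shape: (3, embedding_dim)
norms = np.linalg.norm(matrix, axis=1)
# Cosine similarity of every row against the first row ("cat").
cosine = (matrix @ matrix[0]) / (norms * norms[0])
```
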
@@ -300,6 +354,13 @@ class OpenAIVecSeriesAccessor:
         response format, temperature and top_p. A supplied ``BatchingMapProxy`` enables
         cross‑operation deduplicated reuse and external batch size / progress control.
 
+        Example:
+            ```python
+            from openaivec._proxy import BatchingMapProxy
+            shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
         Args:
             task (PreparedTask): Prepared task (instructions + response_format + sampling params).
             cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
@@ -311,13 +372,6 @@ class OpenAIVecSeriesAccessor:
 
         Returns:
             pandas.Series: Task results aligned with the original Series index.
-
-        Example:
-            ```python
-            from openaivec._proxy import BatchingMapProxy
-            shared_cache = BatchingMapProxy(batch_size=64)
-            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
-            ```
         """
         client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
@@ -382,36 +436,60 @@ class OpenAIVecSeriesAccessor:
             **api_kwargs,
         )
 
-    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element.
+    def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+        """Infer a structured data schema from Series content using AI.
+
+        This method analyzes a sample of the Series values to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        The inferred schema includes field names, types, descriptions, and
+        potential enum values based on patterns found in the data.
+
+        Args:
+            purpose (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract customer sentiment
+                signals for analytics", "Parse product features for search").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of examples to analyze from the
+                Series. The method will sample randomly from the Series up to this
+                limit. Defaults to 100.
+
+        Returns:
+            InferredSchema: An object containing:
+                - purpose: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
 
         Example:
             ```python
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Basic usage
-            animals.ai.embeddings()
+            reviews = pd.Series([
+                "Great product! Fast shipping and excellent quality.",
+                "Terrible experience. Item broke after 2 days.",
+                "Average product. Price is fair but nothing special."
+            ])
 
-            # With progress bar for large datasets
-            large_texts = pd.Series(["text"] * 5000)
-            embeddings = large_texts.ai.embeddings(
-                batch_size=100,
-                show_progress=True
+            # Infer schema for sentiment analysis
+            schema = reviews.ai.infer_schema(
+                purpose="Extract sentiment and product quality indicators"
             )
-            ```
 
-        Args:
-            batch_size (int | None, optional): Number of inputs grouped into a
-                single request. Defaults to ``None`` (automatic batch size optimization
-                based on execution time). Set to a positive integer for fixed batch size.
-            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+            # Use the inferred schema for batch extraction
+            extracted = reviews.ai.task(schema.task)
+            ```
 
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
+        Note:
+            The schema inference uses AI to analyze patterns in the data and may
+            require multiple attempts to produce a valid schema. Fields are limited
+            to primitive types (string, integer, float, boolean) with optional
+            enum values for categorical fields.
         """
-        return self.embeddings_with_cache(
-            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+        inferer = CONTAINER.resolve(SchemaInferer)
+
+        input: SchemaInferenceInput = SchemaInferenceInput(
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose
         )
+        return inferer.infer_schema(input)
 
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
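
Per the Returns section, the ``InferredSchema`` carries reusable artifacts, so inference can run once and be reused. A sketch; how much detail each entry in ``schema.fields`` exposes when printed is an assumption about ``openaivec._schema`` internals:

```python
reviews = pd.Series([
    "Great product! Fast shipping.",
    "Terrible experience. Broke after 2 days.",
])
schema = reviews.ai.infer_schema(purpose="Extract sentiment signals")

# Inspect the inferred field specifications before trusting them.
for field in schema.fields:
    print(field)

# Reuse the prepared task for repeatable batch extraction.
extracted = reviews.ai.task(schema.task)
```
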
@@ -459,45 +537,97 @@ class OpenAIVecSeriesAccessor:
         extracted.columns = [f"{self._obj.name}_{col}" for col in extracted.columns]
         return extracted
 
+    def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and extract structured data in one step.
 
-@pd.api.extensions.register_dataframe_accessor("ai")
-class OpenAIVecDataFrameAccessor:
-    """pandas DataFrame accessor (``.ai``) that adds OpenAI helpers."""
+        This convenience method combines schema inference and data extraction into
+        a single operation. It first analyzes a sample of the Series to infer an
+        appropriate schema based on the stated purpose, then immediately applies
+        that schema to extract structured data from all values in the Series.
 
-    def __init__(self, df_obj: pd.DataFrame):
-        self._obj = df_obj
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract product features for search",
+                "Parse customer feedback for sentiment analysis"). This guides both
+                schema inference and field selection.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
 
-    def extract(self, column: str) -> pd.DataFrame:
-        """Flatten one column of Pydantic models/dicts into top‑level columns.
+        Returns:
+            pd.DataFrame: A DataFrame with extracted structured data. Each inferred
+                field becomes a column, with the same index as the original Series.
+                Column names and types are determined by the inferred schema.
 
         Example:
             ```python
-            df = pd.DataFrame([
-                {"animal": {"name": "cat", "legs": 4}},
-                {"animal": {"name": "dog", "legs": 4}},
-                {"animal": {"name": "elephant", "legs": 4}},
+            # Extract structured data from product reviews
+            reviews = pd.Series([
+                "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
+                "Decent phone. 128GB storage, camera is okay, screen is bright",
+                "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
             ])
-            df.ai.extract("animal")
-            ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
 
-        Args:
-            column (str): Column to expand.
+            # One-step extraction
+            extracted = reviews.ai.auto_extract(
+                purpose="Extract product specifications and performance metrics",
+                show_progress=True
+            )
+            # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
 
-        Returns:
-
+            # Extract sentiment and issues from support tickets
+            tickets = pd.Series([
+                "Account locked, can't reset password, very frustrated",
+                "Billing error, charged twice for subscription",
+                "Great support! Issue resolved quickly"
+            ])
+
+            features = tickets.ai.auto_extract(
+                purpose="Extract issue type and customer sentiment for support analytics"
+            )
+            ```
+
+        Note:
+            This method is ideal for exploratory data analysis when you don't have
+            a predefined schema. For production use cases with stable schemas,
+            consider using `infer_schema()` once and reusing the schema with `task()`.
+            The inferred schema is not returned, so if you need to inspect or save it,
+            use `infer_schema()` and `task()` separately.
         """
-        if column not in self._obj.columns:
-            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
 
-        return (
-            self._obj.pipe(lambda df: df.reset_index(drop=True))
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
-        )
+        return pd.DataFrame(
+            {
+                "inferred": self._obj.ai.task(
+                    task=schema.task,
+                    batch_size=batch_size,
+                    show_progress=show_progress,
+                    **api_kwargs,
+                ),
+            }
+        ).ai.extract("inferred")
+
+
+@pd.api.extensions.register_dataframe_accessor("ai")
+class OpenAIVecDataFrameAccessor:
+    """pandas DataFrame accessor (``.ai``) that adds OpenAI helpers."""
+
+    def __init__(self, df_obj: pd.DataFrame):
+        self._obj = df_obj
 
     def responses_with_cache(
         self,
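
The body above shows that ``auto_extract`` is a thin composition of ``infer_schema``, ``task``, and ``extract``. Since the Note warns that the schema itself is not returned, a hand-rolled equivalent that keeps it around looks like this (same public API as shown in the diff, no new assumptions):

```python
# Equivalent to reviews.ai.auto_extract(purpose=...), but retains `schema`.
schema = reviews.ai.infer_schema(purpose="Extract product specifications")
extracted = pd.DataFrame(
    {"inferred": reviews.ai.task(task=schema.task)}
).ai.extract("inferred")
```
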
@@ -508,25 +638,12 @@ class OpenAIVecDataFrameAccessor:
         top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON using a provided cache.
 
         This method allows external control over caching behavior by accepting
         a pre-configured BatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management.
 
-        Args:
-            instructions (str): System prompt for the assistant.
-            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
-                responses. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame's original index.
-
         Example:
             ```python
             from openaivec._proxy import BatchingMapProxy
@@ -544,6 +661,19 @@ class OpenAIVecDataFrameAccessor:
                 cache=shared_cache
             )
             ```
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
         return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
             instructions=instructions,
@@ -564,7 +694,7 @@ class OpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON.
 
         Example:
             ```python
@@ -592,7 +722,7 @@ class OpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
@@ -608,18 +738,43 @@ class OpenAIVecDataFrameAccessor:
             **api_kwargs,
         )
 
-    def task(
+    def task_with_cache(
         self,
-        task: PreparedTask,
-        batch_size: int | None = None,
-        show_progress: bool = False,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
         **api_kwargs,
     ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after
+        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
 
-        Example:
-            ```python
-            from openaivec._model import PreparedTask
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``) forwarded verbatim. Core routing keys are managed internally.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+        """
+        return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
 
             # Assume you have a prepared task for data analysis
             analysis_task = PreparedTask(...)
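
Since the DataFrame variant serializes each row to JSON and delegates to the Series implementation, one cache instance can back repeated calls. A sketch, with ``sentiment_task`` assumed to exist as in the docstring examples:

```python
from openaivec._proxy import BatchingMapProxy

shared = BatchingMapProxy(batch_size=64)

first = df.ai.task_with_cache(task=sentiment_task, cache=shared)
# A second pass over the same rows is answered from the cache,
# not re-billed against the API.
again = df.ai.task_with_cache(task=sentiment_task, cache=shared)
```
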
@@ -666,29 +821,92 @@ class OpenAIVecDataFrameAccessor:
             **api_kwargs,
         )
 
-    def
-
-
-
-
-
-
+    def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+        """Infer a structured data schema from DataFrame rows using AI.
+
+        This method analyzes a sample of DataFrame rows to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        Each row is converted to JSON format and analyzed to identify patterns,
+        field types, and potential categorical values.
 
         Args:
-
-
+            purpose (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract operational metrics
+                for dashboard", "Parse customer attributes for segmentation").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of rows to analyze from the
+                DataFrame. The method will sample randomly up to this limit.
+                Defaults to 100.
 
-
-
-
+        Returns:
+            InferredSchema: An object containing:
+                - purpose: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'text': [
+                    "Order #123: Shipped to NYC, arriving Tuesday",
+                    "Order #456: Delayed due to weather, new ETA Friday",
+                    "Order #789: Delivered to customer in LA"
+                ],
+                'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Infer schema for logistics tracking
+            schema = df.ai.infer_schema(
+                purpose="Extract shipping status and location data for logistics tracking"
+            )
+
+            # Apply the schema to extract structured data
+            extracted_df = df.ai.task(schema.task)
+            ```
+
+        Note:
+            The DataFrame rows are internally converted to JSON format before
+            analysis. The inferred schema is flat (no nested structures) and
+            uses only primitive types to ensure compatibility with pandas and
+            Spark operations.
+        """
+        return _df_rows_to_json_series(self._obj).ai.infer_schema(
+            purpose=purpose,
+            max_examples=max_examples,
+        )
+
+    def extract(self, column: str) -> pd.DataFrame:
+        """Flatten one column of Pydantic models/dicts into top‑level columns.
+
+        Example:
+            ```python
+            df = pd.DataFrame([
+                {"animal": {"name": "cat", "legs": 4}},
+                {"animal": {"name": "dog", "legs": 4}},
+                {"animal": {"name": "elephant", "legs": 4}},
+            ])
+            df.ai.extract("animal")
+            ```
+        This method returns a DataFrame with the same index as the original,
+        where each column corresponds to a key in the dictionaries.
+        The source column is dropped.
+
+        Args:
+            column (str): Column to expand.
 
         Returns:
-            pandas.
+            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
         """
-
-
-
-
+        if column not in self._obj.columns:
+            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+        return (
+            self._obj.pipe(lambda df: df.reset_index(drop=True))
+            .pipe(lambda df: df.join(df[column].ai.extract()))
+            .pipe(lambda df: df.set_index(self._obj.index))
+            .pipe(lambda df: df.drop(columns=[column], axis=1))
         )
 
     def fillna(
@@ -769,6 +987,100 @@ class OpenAIVecDataFrameAccessor:
 
         return df
 
+    def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and add extracted fields to the DataFrame.
+
+        This convenience method combines schema inference and data extraction to
+        automatically add new columns to the existing DataFrame. It analyzes a
+        sample of the DataFrame rows to infer an appropriate schema based on the
+        stated purpose, then extracts structured data and joins it with the
+        original DataFrame.
+
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract customer sentiment metrics",
+                "Parse product attributes for analytics"). This guides both schema
+                inference and field selection.
+            max_examples (int): Maximum number of rows to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: The original DataFrame with new columns added from the
+                inferred structured data. Each inferred field becomes a new column.
+                The original columns and index are preserved.
+
+        Example:
+            ```python
+            # Add sentiment and issue type to support tickets
+            df = pd.DataFrame({
+                'ticket_id': [1, 2, 3],
+                'description': [
+                    "Can't login, password reset not working",
+                    "Billing error, charged twice last month",
+                    "Great service, issue resolved quickly!"
+                ],
+                'date': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Add inferred fields to existing DataFrame
+            enriched_df = df.ai.auto_extract(
+                purpose="Extract issue type and sentiment for support dashboard",
+                show_progress=True
+            )
+            # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
+
+            # Add product specifications to inventory data
+            inventory = pd.DataFrame({
+                'sku': ['A001', 'B002', 'C003'],
+                'description': [
+                    "Laptop 16GB RAM, 512GB SSD, Intel i7",
+                    "Phone 128GB, 5G, dual camera",
+                    "Tablet 10-inch, WiFi only, 64GB"
+                ]
+            })
+
+            enriched_inventory = inventory.ai.auto_extract(
+                purpose="Extract technical specifications for inventory system"
+            )
+            ```
+
+        Note:
+            This method is ideal for enriching existing DataFrames with additional
+            structured fields extracted from text columns. The schema is inferred
+            from the entire DataFrame content (converted to JSON format). For
+            production use cases with stable schemas, consider using `infer_schema()`
+            once and reusing the schema with `task()`.
+        """
+        # Infer schema from DataFrame rows
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        # Extract structured data using the inferred schema
+        inferred_series = self._obj.ai.task(
+            task=schema.task,
+            batch_size=batch_size,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+        return self._obj.assign(
+            inferred=inferred_series,
+        ).ai.extract("inferred")
+
     def similarity(self, col1: str, col2: str) -> pd.Series:
         """Compute cosine similarity between two columns containing embedding vectors.
 
@@ -776,15 +1088,6 @@ class OpenAIVecDataFrameAccessor:
         two columns of the DataFrame. The vectors should be numpy arrays or
         array-like objects that support dot product operations.
 
-        Args:
-            col1 (str): Name of the first column containing embedding vectors.
-            col2 (str): Name of the second column containing embedding vectors.
-
-        Returns:
-            pandas.Series: Series containing cosine similarity scores between
-                corresponding vectors in col1 and col2, with values ranging
-                from -1 to 1, where 1 indicates identical direction.
-
         Example:
             ```python
             df = pd.DataFrame({
@@ -793,6 +1096,15 @@ class OpenAIVecDataFrameAccessor:
             })
             similarities = df.ai.similarity('vec1', 'vec2')
             ```
+
+        Args:
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
+
+        Returns:
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
         """
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
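
The lambda in the body is the textbook cosine similarity, dot(a, b) / (|a| * |b|). A self-contained check with toy vectors (no API calls involved):

```python
import numpy as np
import pandas as pd
import openaivec.pandas_ext  # noqa: F401 - registers the .ai accessor

df = pd.DataFrame({
    "vec1": [np.array([1.0, 0.0]), np.array([1.0, 1.0])],
    "vec2": [np.array([0.0, 1.0]), np.array([2.0, 2.0])],
})
# Orthogonal pair scores 0.0; parallel pair scores 1.0
# (up to floating-point rounding).
print(df.ai.similarity("vec1", "vec2").tolist())
```
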
@@ -823,6 +1135,16 @@ class AsyncOpenAIVecSeriesAccessor:
         across multiple operations or custom batch size management. The concurrency
         is controlled by the cache instance itself.
 
+        Example:
+            ```python
+            result = await series.aio.responses_with_cache(
+                "classify",
+                cache=shared,
+                max_output_tokens=256,
+                frequency_penalty=0.2,
+            )
+            ```
+
         Args:
             instructions (str): System prompt prepended to every user message.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
@@ -841,16 +1163,6 @@ class AsyncOpenAIVecSeriesAccessor:
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
 
-        Example:
-            ```python
-            result = await series.aio.responses_with_cache(
-                "classify",
-                cache=shared,
-                max_output_tokens=256,
-                frequency_penalty=0.2,
-            )
-            ```
-
         Note:
             This is an asynchronous method and must be awaited.
         """
@@ -866,122 +1178,6 @@ class AsyncOpenAIVecSeriesAccessor:
         results = await client.parse(self._obj.tolist(), **api_kwargs)
         return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
-    async def embeddings_with_cache(
-        self,
-        cache: AsyncBatchingMapProxy[str, np.ndarray],
-    ) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
-
-        This method allows external control over caching behavior by accepting
-        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management. The concurrency
-        is controlled by the cache instance itself.
-
-        Args:
-            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-
-        Example:
-            ```python
-            from openaivec._proxy import AsyncBatchingMapProxy
-            import numpy as np
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
-                batch_size=64, max_concurrency=4
-            )
-
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Must be awaited
-            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
-            ```
-
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
-            cache=cache,
-        )
-
-        # Await the async operation
-        results = await client.create(self._obj.tolist())
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
-        )
-
-    async def task_with_cache(
-        self,
-        task: PreparedTask[ResponseFormat],
-        cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
-    ) -> pd.Series:
-        """Execute a prepared task on every Series element using a provided cache (asynchronously).
-
-        This method allows external control over caching behavior by accepting
-        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management. The concurrency
-        is controlled by the cache instance itself.
-
-        Args:
-            task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
-            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
-            library and cannot be overridden.
-
-        Returns:
-            pandas.Series: Series whose values are instances of the task's
-                response format, aligned with the original Series index.
-
-        Example:
-            ```python
-            from openaivec._model import PreparedTask
-            from openaivec._proxy import AsyncBatchingMapProxy
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
-
-            # Assume you have a prepared task for sentiment analysis
-            sentiment_task = PreparedTask(...)
-
-            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
-            # Must be awaited
-            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
-            ```
-
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        client = AsyncBatchResponses(
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(ResponsesModelName).value,
-            system_message=task.instructions,
-            response_format=task.response_format,
-            cache=cache,
-            temperature=task.temperature,
-            top_p=task.top_p,
-        )
-        # Await the async operation
-        results = await client.parse(self._obj.tolist(), **api_kwargs)
-
-        return pd.Series(results, index=self._obj.index, name=self._obj.name)
-
     async def responses(
         self,
         instructions: str,
@@ -1018,27 +1214,80 @@ class AsyncOpenAIVecSeriesAccessor:
             batch_size (int | None, optional): Number of prompts grouped into a single
                 request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
         Returns:
-            pandas.Series: Series whose values are instances of ``response_format``.
+            pandas.Series: Series whose values are instances of ``response_format``.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await self.responses_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._proxy import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
 
         Note:
             This is an asynchronous method and must be awaited.
         """
-        return await self.responses_with_cache(
-            instructions=instructions,
-            cache=AsyncBatchingMapProxy(
-                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
-            ),
-            response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
-            **api_kwargs,
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        # Await the async operation
+        results = await client.create(self._obj.tolist())
+
+        return pd.Series(
+            results,
+            index=self._obj.index,
+            name=self._obj.name,
         )
 
     async def embeddings(
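
Outside a notebook's already-running event loop, the ``aio`` accessor needs ``asyncio`` plumbing of its own. A minimal sketch, assuming a configured client:

```python
import asyncio
import pandas as pd

async def main() -> pd.Series:
    animals = pd.Series(["cat", "dog", "elephant"])
    # batch_size=None auto-tunes; max_concurrency caps parallel requests.
    return await animals.aio.responses("translate to French", max_concurrency=4)

results = asyncio.run(main())
```
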
@@ -1082,6 +1331,69 @@ class AsyncOpenAIVecSeriesAccessor:
             ),
         )
 
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+            from openaivec._proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
+        )
+        # Await the async operation
+        results = await client.parse(self._obj.tolist(), **api_kwargs)
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
     async def task(
         self,
         task: PreparedTask,
@@ -1144,6 +1456,96 @@ class AsyncOpenAIVecSeriesAccessor:
             **api_kwargs,
         )
 
+    async def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and extract structured data in one step (asynchronously).
+
+        This convenience method combines schema inference and data extraction into
+        a single operation. It first analyzes a sample of the Series to infer an
+        appropriate schema based on the stated purpose, then immediately applies
+        that schema to extract structured data from all values in the Series.
+
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract product features for search",
+                "Parse customer feedback for sentiment analysis"). This guides both
+                schema inference and field selection.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            max_concurrency (int): Maximum number of concurrent requests during
+                extraction. Defaults to 8.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: A DataFrame with extracted structured data. Each inferred
+                field becomes a column, with the same index as the original Series.
+                Column names and types are determined by the inferred schema.
+
+        Example:
+            ```python
+            # Extract structured data from product reviews
+            reviews = pd.Series([
+                "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
+                "Decent phone. 128GB storage, camera is okay, screen is bright",
+                "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
+            ])
+
+            # One-step extraction (must be awaited)
+            extracted = await reviews.aio.auto_extract(
+                purpose="Extract product specifications and performance metrics",
+                max_concurrency=4,
+                show_progress=True
+            )
+            # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
+
+            # Extract sentiment and issues from support tickets
+            tickets = pd.Series([
+                "Account locked, can't reset password, very frustrated",
+                "Billing error, charged twice for subscription",
+                "Great support! Issue resolved quickly"
+            ])
+
+            features = await tickets.aio.auto_extract(
+                purpose="Extract issue type and customer sentiment for support analytics",
+                batch_size=32
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited. This method is ideal
+            for exploratory data analysis when you don't have a predefined schema.
+            For production use cases with stable schemas, consider using the synchronous
+            `infer_schema()` once and reusing the schema with `task()`. The inferred
+            schema is not returned, so if you need to inspect or save it, use
+            `infer_schema()` and `task()` separately.
+        """
+        # Use synchronous infer_schema since it's not async
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        inferred_series = await self._obj.aio.task(
+            task=schema.task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+        return pd.DataFrame({"inferred": inferred_series}).ai.extract("inferred")
+
 
 @pd.api.extensions.register_dataframe_accessor("aio")
 class AsyncOpenAIVecDataFrameAccessor:
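
Note the comment in the body: schema inference runs synchronously even on the ``aio`` path, and only the extraction is awaited. Wrapping the call for a plain script, reusing the ``tickets`` Series from the example above:

```python
import asyncio

features = asyncio.run(
    tickets.aio.auto_extract(
        purpose="Extract issue type and customer sentiment",
        max_concurrency=4,
        show_progress=False,
    )
)
```
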
@@ -1161,26 +1563,13 @@ class AsyncOpenAIVecDataFrameAccessor:
         top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
 
         This method allows external control over caching behavior by accepting
         a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management. The concurrency
         is controlled by the cache instance itself.
 
-        Args:
-            instructions (str): System prompt for the assistant.
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
-                responses. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame's original index.
-
         Example:
             ```python
             from openaivec._proxy import AsyncBatchingMapProxy
@@ -1200,6 +1589,19 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1200
1589
|
)
|
|
1201
1590
|
```
|
|
1202
1591
|
|
|
1592
|
+
Args:
|
|
1593
|
+
instructions (str): System prompt for the assistant.
|
|
1594
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
|
|
1595
|
+
instance for managing API call batching and deduplication.
|
|
1596
|
+
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1597
|
+
response_format (Type[ResponseFormat], optional): Desired Python type of the
|
|
1598
|
+
responses. Defaults to ``str``.
|
|
1599
|
+
temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
|
|
1600
|
+
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1601
|
+
|
|
1602
|
+
Returns:
|
|
1603
|
+
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
1604
|
+
|
|
1203
1605
|
Note:
|
|
1204
1606
|
This is an asynchronous method and must be awaited.
|
|
1205
1607
|
"""
|
|
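The cache-sharing behavior described in this docstring can be made concrete with a small sketch (not part of the diff). Two assumptions are flagged: that the DataFrame-level method documented here is named `responses_with_cache`, mirroring the `task_with_cache` method visible later in this diff, and that `AsyncBatchingMapProxy` accepts a `batch_size` keyword, as the `cache.batch_size=None` note implies.

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401  # registers the .aio accessor
from openaivec._proxy import AsyncBatchingMapProxy

async def main() -> None:
    df = pd.DataFrame({"name": ["cat", "dog"], "legs": [4, 4]})
    # One shared cache: duplicate rows are deduplicated across calls, and
    # batch_size=None opts in to automatic batch size optimization.
    cache: AsyncBatchingMapProxy[str, str] = AsyncBatchingMapProxy(batch_size=None)
    first = await df.aio.responses_with_cache("Describe this animal.", cache=cache)
    # A second call over the same rows is answered from the shared cache.
    second = await df.aio.responses_with_cache("Describe this animal.", cache=cache)
    assert first.equals(second)

asyncio.run(main())
```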
@@ -1224,7 +1626,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON (asynchronously).

         Example:
             ```python
@@ -1253,7 +1655,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
@@ -1276,6 +1678,35 @@ class AsyncOpenAIVecDataFrameAccessor:
             **api_kwargs,
         )

+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
+
+        After serializing each row to JSON, this method executes the prepared task.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured async cache instance.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
     async def task(
         self,
         task: PreparedTask,
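For completeness, here is a hedged sketch of the `task_with_cache` method added above (not part of the diff). It obtains a `PreparedTask` from `infer_schema().task`, the only PreparedTask source visible in this diff, and again assumes `AsyncBatchingMapProxy(batch_size=None)` is a valid construction.

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401  # registers the .ai / .aio accessors
from openaivec._proxy import AsyncBatchingMapProxy

df = pd.DataFrame({"description": ["Laptop 16GB RAM, 512GB SSD", "Phone 128GB, 5G"]})

# A PreparedTask obtained once; infer_schema().task is how this diff produces one.
task = df.ai.infer_schema(purpose="Extract product specifications").task

async def main() -> pd.Series:
    cache = AsyncBatchingMapProxy(batch_size=None)
    # The same cache instance could be reused for later runs over overlapping rows.
    return await df.aio.task_with_cache(task=task, cache=cache)

specs = asyncio.run(main())
```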
@@ -1284,7 +1715,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after
+        """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).

         Example:
             ```python
@@ -1343,40 +1774,24 @@ class AsyncOpenAIVecDataFrameAccessor:
             **api_kwargs,
         )

-    async def task_with_cache(
-        self,
-        task: PreparedTask[ResponseFormat],
-        cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
-    ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache (async).
-
-        Args:
-            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured async cache instance.
-
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
-
-        Returns:
-            pandas.Series: Task results aligned with the DataFrame's original index.
-
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
-            task=task,
-            cache=cache,
-            **api_kwargs,
-        )
-
     async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
-        """
-        Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
+        """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.

         This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
         but with support for asynchronous functions.

+        Example:
+            ```python
+            async def process_data(df):
+                # Simulate an asynchronous computation
+                await asyncio.sleep(1)
+                return df.dropna()
+
+            df = pd.DataFrame({"col": [1, 2, None, 4]})
+            # Must be awaited
+            result = await df.aio.pipe(process_data)
+            ```
+
         Args:
             func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
                 as input and returns either a result or an awaitable result.
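The new Example covers only the async case. Since the docstring states that `pipe` supports both synchronous and asynchronous functions, here is a hedged companion sketch (not part of the diff) showing a plain synchronous function passed through the same accessor; the function and data are hypothetical.

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401  # registers the .aio accessor

def drop_empty(df: pd.DataFrame) -> pd.DataFrame:
    # A plain synchronous function also works; per the docstring, pipe accepts
    # functions returning either a result or an awaitable result.
    return df.dropna()

async def main() -> pd.DataFrame:
    df = pd.DataFrame({"col": [1, 2, None, 4]})
    return await df.aio.pipe(drop_empty)

print(asyncio.run(main()))
```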
@@ -1538,3 +1953,103 @@ class AsyncOpenAIVecDataFrameAccessor:
                     df.at[actual_index, target_column_name] = result.output

         return df
+
+    async def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer a schema and add extracted fields to the DataFrame (asynchronously).
+
+        This convenience method combines schema inference and data extraction to
+        automatically add new columns to the existing DataFrame. It analyzes a
+        sample of the DataFrame rows to infer an appropriate schema based on the
+        stated purpose, then extracts structured data and joins it with the
+        original DataFrame.
+
+        Args:
+            purpose (str): Plain-language description of what information to extract
+                and how it will be used (e.g., "Extract customer sentiment metrics",
+                "Parse product attributes for analytics"). This guides both schema
+                inference and field selection.
+            max_examples (int): Maximum number of rows to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            max_concurrency (int): Maximum number of concurrent requests during
+                extraction. Defaults to 8.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: The original DataFrame with new columns added from the
+                inferred structured data. Each inferred field becomes a new column.
+                The original columns and index are preserved.
+
+        Example:
+            ```python
+            # Add sentiment and issue type to support tickets
+            df = pd.DataFrame({
+                'ticket_id': [1, 2, 3],
+                'description': [
+                    "Can't login, password reset not working",
+                    "Billing error, charged twice last month",
+                    "Great service, issue resolved quickly!"
+                ],
+                'date': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Add inferred fields to the existing DataFrame (must be awaited)
+            enriched_df = await df.aio.auto_extract(
+                purpose="Extract issue type and sentiment for support dashboard",
+                max_concurrency=4,
+                show_progress=True
+            )
+            # Result: original df with new columns such as 'issue_type', 'sentiment', etc.
+
+            # Add product specifications to inventory data
+            inventory = pd.DataFrame({
+                'sku': ['A001', 'B002', 'C003'],
+                'description': [
+                    "Laptop 16GB RAM, 512GB SSD, Intel i7",
+                    "Phone 128GB, 5G, dual camera",
+                    "Tablet 10-inch, WiFi only, 64GB"
+                ]
+            })
+
+            enriched_inventory = await inventory.aio.auto_extract(
+                purpose="Extract technical specifications for inventory system",
+                batch_size=32
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited. It is ideal
+            for enriching existing DataFrames with additional structured fields
+            extracted from text columns. The schema is inferred synchronously from
+            the DataFrame content. For production use cases with stable schemas,
+            consider using `infer_schema()` once and reusing the schema with `task()`.
+        """
+        # Infer the schema from DataFrame rows (synchronous).
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        # Extract structured data using the inferred schema (asynchronous).
+        inferred_series = await self._obj.aio.task(
+            task=schema.task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+        return self._obj.assign(
+            inferred=inferred_series,
+        ).ai.extract("inferred")
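One way to read the final `assign(...).ai.extract("inferred")` step: the structured results are attached as a temporary column and then expanded so each field of the result model becomes its own column, as the Returns section above describes. The sketch below illustrates only that expansion and is not part of the diff; the `Ticket` Pydantic model is a hypothetical stand-in for an inferred schema.

```python
import pandas as pd
from pydantic import BaseModel

from openaivec import pandas_ext  # noqa: F401  # registers the .ai accessor

class Ticket(BaseModel):  # hypothetical stand-in for an inferred schema
    issue_type: str
    sentiment: str

df = pd.DataFrame({"ticket_id": [1, 2]})
inferred = pd.Series(
    [Ticket(issue_type="login", sentiment="negative"),
     Ticket(issue_type="billing", sentiment="negative")],
    index=df.index,
)

# Mirrors the method body: attach results, then expand fields into columns.
enriched = df.assign(inferred=inferred).ai.extract("inferred")
# Per the docstring, each inferred field becomes a new column alongside
# ticket_id; the exact column naming follows extract's own conventions.
print(enriched.columns)
```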