openaivec 0.14.12__py3-none-any.whl → 0.14.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_embeddings.py +17 -4
- openaivec/_model.py +7 -12
- openaivec/_prompt.py +3 -6
- openaivec/_responses.py +39 -117
- openaivec/_schema.py +27 -23
- openaivec/pandas_ext.py +355 -343
- openaivec/spark.py +98 -56
- openaivec/task/__init__.py +1 -1
- openaivec/task/customer_support/customer_sentiment.py +4 -9
- openaivec/task/customer_support/inquiry_classification.py +5 -8
- openaivec/task/customer_support/inquiry_summary.py +5 -6
- openaivec/task/customer_support/intent_analysis.py +5 -7
- openaivec/task/customer_support/response_suggestion.py +5 -8
- openaivec/task/customer_support/urgency_analysis.py +5 -8
- openaivec/task/nlp/dependency_parsing.py +1 -2
- openaivec/task/nlp/keyword_extraction.py +1 -2
- openaivec/task/nlp/morphological_analysis.py +1 -2
- openaivec/task/nlp/named_entity_recognition.py +1 -2
- openaivec/task/nlp/sentiment_analysis.py +1 -2
- openaivec/task/nlp/translation.py +1 -1
- openaivec/task/table/fillna.py +8 -3
- {openaivec-0.14.12.dist-info → openaivec-0.14.14.dist-info}/METADATA +1 -1
- openaivec-0.14.14.dist-info/RECORD +37 -0
- openaivec-0.14.12.dist-info/RECORD +0 -37
- {openaivec-0.14.12.dist-info → openaivec-0.14.14.dist-info}/WHEEL +0 -0
- {openaivec-0.14.12.dist-info → openaivec-0.14.14.dist-info}/licenses/LICENSE +0 -0
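
The headline change in `openaivec/pandas_ext.py` below is that the explicit `temperature` and `top_p` parameters were dropped from every accessor method: sampling parameters now travel through `**api_kwargs`, and prepared tasks carry their own stored API parameters. A minimal migration sketch, assuming (as the docstring examples in the diff suggest) that importing `openaivec.pandas_ext` registers the `.ai` accessor; the series content is illustrative:

```python
# Minimal migration sketch for 0.14.12 -> 0.14.14 (accessor registration
# via import is an assumption; only the keyword-argument change is shown).
import pandas as pd
from openaivec import pandas_ext  # noqa: F401  (assumed to register .ai)

animals = pd.Series(["panda", "koala"])

# 0.14.12: temperature/top_p were explicit keyword arguments.
# translated = animals.ai.responses("translate to French", temperature=0.0, top_p=1.0)

# 0.14.14: sampling parameters are forwarded verbatim via **api_kwargs.
translated = animals.ai.responses("translate to French", temperature=0.8, top_p=0.9)
```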
openaivec/pandas_ext.py
CHANGED
|
@@ -181,8 +181,6 @@ class OpenAIVecSeriesAccessor:
|
|
|
181
181
|
instructions: str,
|
|
182
182
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
183
183
|
response_format: type[ResponseFormat] = str,
|
|
184
|
-
temperature: float | None = 0.0,
|
|
185
|
-
top_p: float = 1.0,
|
|
186
184
|
**api_kwargs,
|
|
187
185
|
) -> pd.Series:
|
|
188
186
|
"""Call an LLM once for every Series element using a provided cache.
|
|
@@ -196,40 +194,30 @@ class OpenAIVecSeriesAccessor:
|
|
|
196
194
|
batching and deduplication control.
|
|
197
195
|
response_format (type[ResponseFormat], optional): Pydantic model or built-in
|
|
198
196
|
type the assistant should return. Defaults to ``str``.
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
Additional Keyword Args:
|
|
203
|
-
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
204
|
-
``seed``, etc.) are forwarded verbatim to the underlying client.
|
|
197
|
+
**api_kwargs: Arbitrary OpenAI Responses API parameters (e.g. ``temperature``,
|
|
198
|
+
``top_p``, ``frequency_penalty``, ``presence_penalty``, ``seed``, etc.) are
|
|
199
|
+
forwarded verbatim to the underlying client.
|
|
205
200
|
|
|
206
201
|
Returns:
|
|
207
202
|
pandas.Series: Series whose values are instances of ``response_format``.
|
|
208
203
|
"""
|
|
204
|
+
|
|
209
205
|
client: BatchResponses = BatchResponses(
|
|
210
206
|
client=CONTAINER.resolve(OpenAI),
|
|
211
207
|
model_name=CONTAINER.resolve(ResponsesModelName).value,
|
|
212
208
|
system_message=instructions,
|
|
213
209
|
response_format=response_format,
|
|
214
210
|
cache=cache,
|
|
215
|
-
|
|
216
|
-
top_p=top_p,
|
|
211
|
+
api_kwargs=api_kwargs,
|
|
217
212
|
)
|
|
218
213
|
|
|
219
|
-
|
|
220
|
-
proxy_params = {"show_progress", "batch_size"}
|
|
221
|
-
filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
|
|
222
|
-
return pd.Series(
|
|
223
|
-
client.parse(self._obj.tolist(), **filtered_kwargs), index=self._obj.index, name=self._obj.name
|
|
224
|
-
)
|
|
214
|
+
return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
|
|
225
215
|
|
|
226
216
|
def responses(
|
|
227
217
|
self,
|
|
228
218
|
instructions: str,
|
|
229
219
|
response_format: type[ResponseFormat] = str,
|
|
230
220
|
batch_size: int | None = None,
|
|
231
|
-
temperature: float | None = 0.0,
|
|
232
|
-
top_p: float = 1.0,
|
|
233
221
|
show_progress: bool = False,
|
|
234
222
|
**api_kwargs,
|
|
235
223
|
) -> pd.Series:
|
|
@@ -248,6 +236,12 @@ class OpenAIVecSeriesAccessor:
|
|
|
248
236
|
batch_size=32,
|
|
249
237
|
show_progress=True
|
|
250
238
|
)
|
|
239
|
+
|
|
240
|
+
# With custom temperature
|
|
241
|
+
animals.ai.responses(
|
|
242
|
+
"translate creatively",
|
|
243
|
+
temperature=0.8
|
|
244
|
+
)
|
|
251
245
|
```
|
|
252
246
|
|
|
253
247
|
Args:
|
|
@@ -257,9 +251,8 @@ class OpenAIVecSeriesAccessor:
|
|
|
257
251
|
batch_size (int | None, optional): Number of prompts grouped into a single
|
|
258
252
|
request. Defaults to ``None`` (automatic batch size optimization
|
|
259
253
|
based on execution time). Set to a positive integer for fixed batch size.
|
|
260
|
-
temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
|
|
261
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
262
254
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
255
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
263
256
|
|
|
264
257
|
Returns:
|
|
265
258
|
pandas.Series: Series whose values are instances of ``response_format``.
|
|
@@ -268,14 +261,13 @@ class OpenAIVecSeriesAccessor:
|
|
|
268
261
|
instructions=instructions,
|
|
269
262
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
270
263
|
response_format=response_format,
|
|
271
|
-
temperature=temperature,
|
|
272
|
-
top_p=top_p,
|
|
273
264
|
**api_kwargs,
|
|
274
265
|
)
|
|
275
266
|
|
|
276
267
|
def embeddings_with_cache(
|
|
277
268
|
self,
|
|
278
269
|
cache: BatchingMapProxy[str, np.ndarray],
|
|
270
|
+
**api_kwargs,
|
|
279
271
|
) -> pd.Series:
|
|
280
272
|
"""Compute OpenAI embeddings for every Series element using a provided cache.
|
|
281
273
|
|
|
@@ -299,6 +291,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
299
291
|
cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
|
|
300
292
|
instance for managing API call batching and deduplication.
|
|
301
293
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
294
|
+
**api_kwargs: Additional keyword arguments to pass to the OpenAI API.
|
|
302
295
|
|
|
303
296
|
Returns:
|
|
304
297
|
pandas.Series: Series whose values are ``np.ndarray`` objects
|
|
@@ -308,6 +301,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
308
301
|
client=CONTAINER.resolve(OpenAI),
|
|
309
302
|
model_name=CONTAINER.resolve(EmbeddingsModelName).value,
|
|
310
303
|
cache=cache,
|
|
304
|
+
api_kwargs=api_kwargs,
|
|
311
305
|
)
|
|
312
306
|
|
|
313
307
|
return pd.Series(
|
|
@@ -316,7 +310,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
316
310
|
name=self._obj.name,
|
|
317
311
|
)
|
|
318
312
|
|
|
319
|
-
def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
|
|
313
|
+
def embeddings(self, batch_size: int | None = None, show_progress: bool = False, **api_kwargs) -> pd.Series:
|
|
320
314
|
"""Compute OpenAI embeddings for every Series element.
|
|
321
315
|
|
|
322
316
|
Example:
|
|
@@ -338,6 +332,7 @@ class OpenAIVecSeriesAccessor:
|
|
|
338
332
|
single request. Defaults to ``None`` (automatic batch size optimization
|
|
339
333
|
based on execution time). Set to a positive integer for fixed batch size.
|
|
340
334
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
335
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
341
336
|
|
|
342
337
|
Returns:
|
|
343
338
|
pandas.Series: Series whose values are ``np.ndarray`` objects
|
|
@@ -345,18 +340,18 @@ class OpenAIVecSeriesAccessor:
|
|
|
345
340
|
"""
|
|
346
341
|
return self.embeddings_with_cache(
|
|
347
342
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
343
|
+
**api_kwargs,
|
|
348
344
|
)
|
|
349
345
|
|
|
350
346
|
def task_with_cache(
|
|
351
347
|
self,
|
|
352
348
|
task: PreparedTask[ResponseFormat],
|
|
353
349
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
354
|
-
**api_kwargs,
|
|
355
350
|
) -> pd.Series:
|
|
356
351
|
"""Execute a prepared task on every Series element using a provided cache.
|
|
357
352
|
|
|
358
353
|
This mirrors ``responses_with_cache`` but uses the task's stored instructions,
|
|
359
|
-
response format,
|
|
354
|
+
response format, and API parameters. A supplied ``BatchingMapProxy`` enables
|
|
360
355
|
cross‑operation deduplicated reuse and external batch size / progress control.
|
|
361
356
|
|
|
362
357
|
Example:
|
|
@@ -370,10 +365,9 @@ class OpenAIVecSeriesAccessor:
|
|
|
370
365
|
task (PreparedTask): Prepared task (instructions + response_format + sampling params).
|
|
371
366
|
cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
|
|
372
367
|
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
(``model``, system instructions, user input) are managed internally and cannot be overridden.
|
|
368
|
+
Note:
|
|
369
|
+
The task's stored API parameters are used. Core routing keys (``model``, system
|
|
370
|
+
instructions, user input) are managed internally and cannot be overridden.
|
|
377
371
|
|
|
378
372
|
Returns:
|
|
379
373
|
pandas.Series: Task results aligned with the original Series index.
|
|
@@ -384,17 +378,15 @@ class OpenAIVecSeriesAccessor:
|
|
|
384
378
|
system_message=task.instructions,
|
|
385
379
|
response_format=task.response_format,
|
|
386
380
|
cache=cache,
|
|
387
|
-
|
|
388
|
-
top_p=task.top_p,
|
|
381
|
+
api_kwargs=task.api_kwargs,
|
|
389
382
|
)
|
|
390
|
-
return pd.Series(client.parse(self._obj.tolist()
|
|
383
|
+
return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
|
|
391
384
|
|
|
392
385
|
def task(
|
|
393
386
|
self,
|
|
394
387
|
task: PreparedTask,
|
|
395
388
|
batch_size: int | None = None,
|
|
396
389
|
show_progress: bool = False,
|
|
397
|
-
**api_kwargs,
|
|
398
390
|
) -> pd.Series:
|
|
399
391
|
"""Execute a prepared task on every Series element.
|
|
400
392
|
|
|
@@ -426,10 +418,9 @@ class OpenAIVecSeriesAccessor:
|
|
|
426
418
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
427
419
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
428
420
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
``
|
|
432
|
-
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
421
|
+
Note:
|
|
422
|
+
The task's stored API parameters are used. Core batching / routing keys
|
|
423
|
+
(``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
433
424
|
library and cannot be overridden.
|
|
434
425
|
|
|
435
426
|
Returns:
|
|
@@ -438,149 +429,204 @@ class OpenAIVecSeriesAccessor:
|
|
|
438
429
|
return self.task_with_cache(
|
|
439
430
|
task=task,
|
|
440
431
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
441
|
-
**api_kwargs,
|
|
442
432
|
)
|
|
443
433
|
|
|
444
434
|
def parse_with_cache(
|
|
445
435
|
self,
|
|
446
436
|
instructions: str,
|
|
447
437
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
448
|
-
response_format: ResponseFormat = None,
|
|
438
|
+
response_format: type[ResponseFormat] | None = None,
|
|
449
439
|
max_examples: int = 100,
|
|
450
|
-
temperature: float | None = 0.0,
|
|
451
|
-
top_p: float = 1.0,
|
|
452
440
|
**api_kwargs,
|
|
453
441
|
) -> pd.Series:
|
|
454
442
|
"""Parse Series values using an LLM with a provided cache.
|
|
455
|
-
|
|
456
|
-
|
|
443
|
+
|
|
444
|
+
This method allows external control over caching behavior while parsing
|
|
445
|
+
Series content into structured data. If no response format is provided,
|
|
446
|
+
the method automatically infers an appropriate schema by analyzing the
|
|
447
|
+
data patterns.
|
|
457
448
|
|
|
458
449
|
Args:
|
|
459
|
-
instructions (str):
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
450
|
+
instructions (str): Plain language description of what information
|
|
451
|
+
to extract (e.g., "Extract customer information including name
|
|
452
|
+
and contact details"). This guides both the extraction process
|
|
453
|
+
and schema inference.
|
|
454
|
+
cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
|
|
455
|
+
instance for managing API call batching and deduplication.
|
|
456
|
+
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
457
|
+
response_format (type[ResponseFormat] | None, optional): Target structure
|
|
458
|
+
for the parsed data. Can be a Pydantic model class, built-in type
|
|
459
|
+
(str, int, float, bool, list, dict), or None. If None, the method
|
|
460
|
+
infers an appropriate schema based on the instructions and data.
|
|
461
|
+
Defaults to None.
|
|
462
|
+
max_examples (int, optional): Maximum number of Series values to
|
|
463
|
+
analyze when inferring the schema. Only used when response_format
|
|
464
|
+
is None. Defaults to 100.
|
|
465
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p,
|
|
466
|
+
frequency_penalty, presence_penalty, seed, etc.) forwarded to
|
|
467
|
+
the underlying API calls.
|
|
468
|
+
|
|
471
469
|
Returns:
|
|
472
|
-
pandas.Series: Series
|
|
473
|
-
|
|
470
|
+
pandas.Series: Series containing parsed structured data. Each value
|
|
471
|
+
is an instance of the specified response_format or the inferred
|
|
472
|
+
schema model, aligned with the original Series index.
|
|
474
473
|
"""
|
|
475
474
|
|
|
476
475
|
schema: InferredSchema | None = None
|
|
477
476
|
if response_format is None:
|
|
478
|
-
schema = self.infer_schema(
|
|
477
|
+
schema = self.infer_schema(instructions=instructions, max_examples=max_examples, **api_kwargs)
|
|
479
478
|
|
|
480
479
|
return self.responses_with_cache(
|
|
481
480
|
instructions=schema.inference_prompt if schema else instructions,
|
|
482
481
|
cache=cache,
|
|
483
482
|
response_format=response_format or schema.model,
|
|
484
|
-
temperature=temperature,
|
|
485
|
-
top_p=top_p,
|
|
486
483
|
**api_kwargs,
|
|
487
484
|
)
|
|
488
485
|
|
|
489
486
|
def parse(
|
|
490
487
|
self,
|
|
491
488
|
instructions: str,
|
|
492
|
-
response_format: ResponseFormat = None,
|
|
489
|
+
response_format: type[ResponseFormat] | None = None,
|
|
493
490
|
max_examples: int = 100,
|
|
494
491
|
batch_size: int | None = None,
|
|
495
492
|
show_progress: bool = False,
|
|
496
|
-
temperature: float | None = 0.0,
|
|
497
|
-
top_p: float = 1.0,
|
|
498
493
|
**api_kwargs,
|
|
499
494
|
) -> pd.Series:
|
|
500
|
-
"""Parse Series values using an LLM
|
|
495
|
+
"""Parse Series values into structured data using an LLM.
|
|
501
496
|
|
|
502
|
-
This method
|
|
503
|
-
|
|
497
|
+
This method extracts structured information from unstructured text in
|
|
498
|
+
the Series. When no response format is provided, it automatically
|
|
499
|
+
infers an appropriate schema by analyzing patterns in the data.
|
|
504
500
|
|
|
505
501
|
Args:
|
|
506
|
-
instructions (str):
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
502
|
+
instructions (str): Plain language description of what information
|
|
503
|
+
to extract (e.g., "Extract product details including price,
|
|
504
|
+
category, and availability"). This guides both the extraction
|
|
505
|
+
process and schema inference.
|
|
506
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
507
|
+
structure for the parsed data. Can be a Pydantic model class,
|
|
508
|
+
built-in type (str, int, float, bool, list, dict), or None.
|
|
509
|
+
If None, automatically infers a schema. Defaults to None.
|
|
510
|
+
max_examples (int, optional): Maximum number of Series values to
|
|
511
|
+
analyze when inferring schema. Only used when response_format
|
|
512
|
+
is None. Defaults to 100.
|
|
513
|
+
batch_size (int | None, optional): Number of requests to process
|
|
514
|
+
per batch. None enables automatic optimization. Defaults to None.
|
|
515
|
+
show_progress (bool, optional): Display progress bar in Jupyter
|
|
516
|
+
notebooks. Defaults to False.
|
|
517
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p,
|
|
518
|
+
frequency_penalty, presence_penalty, seed, etc.).
|
|
517
519
|
|
|
518
520
|
Returns:
|
|
519
|
-
pandas.Series: Series
|
|
520
|
-
|
|
521
|
+
pandas.Series: Series containing parsed structured data as instances
|
|
522
|
+
of response_format or the inferred schema model.
|
|
523
|
+
|
|
524
|
+
Example:
|
|
525
|
+
```python
|
|
526
|
+
# With explicit schema
|
|
527
|
+
from pydantic import BaseModel
|
|
528
|
+
class Product(BaseModel):
|
|
529
|
+
name: str
|
|
530
|
+
price: float
|
|
531
|
+
in_stock: bool
|
|
532
|
+
|
|
533
|
+
descriptions = pd.Series([
|
|
534
|
+
"iPhone 15 Pro - $999, available now",
|
|
535
|
+
"Samsung Galaxy S24 - $899, out of stock"
|
|
536
|
+
])
|
|
537
|
+
products = descriptions.ai.parse(
|
|
538
|
+
"Extract product information",
|
|
539
|
+
response_format=Product
|
|
540
|
+
)
|
|
541
|
+
|
|
542
|
+
# With automatic schema inference
|
|
543
|
+
reviews = pd.Series([
|
|
544
|
+
"Great product! 5 stars. Fast shipping.",
|
|
545
|
+
"Poor quality. 2 stars. Slow delivery."
|
|
546
|
+
])
|
|
547
|
+
parsed = reviews.ai.parse(
|
|
548
|
+
"Extract review rating and shipping feedback"
|
|
549
|
+
)
|
|
550
|
+
```
|
|
521
551
|
"""
|
|
522
552
|
return self.parse_with_cache(
|
|
523
553
|
instructions=instructions,
|
|
524
554
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
525
555
|
response_format=response_format,
|
|
526
556
|
max_examples=max_examples,
|
|
527
|
-
temperature=temperature,
|
|
528
|
-
top_p=top_p,
|
|
529
557
|
**api_kwargs,
|
|
530
558
|
)
|
|
531
559
|
|
|
532
|
-
def infer_schema(self,
|
|
560
|
+
def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> InferredSchema:
|
|
533
561
|
"""Infer a structured data schema from Series content using AI.
|
|
534
562
|
|
|
535
|
-
This method analyzes a sample of
|
|
536
|
-
a
|
|
537
|
-
|
|
538
|
-
|
|
563
|
+
This method analyzes a sample of Series values to automatically generate
|
|
564
|
+
a Pydantic model that captures the relevant information structure. The
|
|
565
|
+
inferred schema supports both flat and hierarchical (nested) structures,
|
|
566
|
+
making it suitable for complex data extraction tasks.
|
|
539
567
|
|
|
540
568
|
Args:
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
max_examples (int): Maximum number of
|
|
546
|
-
|
|
547
|
-
limit.
|
|
569
|
+
instructions (str): Plain language description of the extraction goal
|
|
570
|
+
(e.g., "Extract customer information for CRM system", "Parse
|
|
571
|
+
event details for calendar integration"). This guides which
|
|
572
|
+
fields to include and their purpose.
|
|
573
|
+
max_examples (int, optional): Maximum number of Series values to
|
|
574
|
+
analyze for pattern detection. The method samples randomly up
|
|
575
|
+
to this limit. Higher values may improve schema quality but
|
|
576
|
+
increase inference time. Defaults to 100.
|
|
577
|
+
**api_kwargs: Additional OpenAI API parameters for fine-tuning
|
|
578
|
+
the inference process.
|
|
548
579
|
|
|
549
580
|
Returns:
|
|
550
|
-
InferredSchema:
|
|
551
|
-
-
|
|
552
|
-
- fields:
|
|
553
|
-
|
|
554
|
-
-
|
|
555
|
-
-
|
|
581
|
+
InferredSchema: A comprehensive schema object containing:
|
|
582
|
+
- instructions: Refined extraction objective statement
|
|
583
|
+
- fields: Hierarchical field specifications with names, types,
|
|
584
|
+
descriptions, and nested structures where applicable
|
|
585
|
+
- inference_prompt: Optimized prompt for consistent extraction
|
|
586
|
+
- model: Dynamically generated Pydantic model class supporting
|
|
587
|
+
both flat and nested structures
|
|
588
|
+
- task: PreparedTask configured for batch extraction using
|
|
589
|
+
the inferred schema
|
|
556
590
|
|
|
557
591
|
Example:
|
|
558
592
|
```python
|
|
593
|
+
# Simple flat structure
|
|
559
594
|
reviews = pd.Series([
|
|
560
|
-
"Great product
|
|
561
|
-
"
|
|
562
|
-
"Average product. Price is fair but nothing special."
|
|
595
|
+
"5 stars! Great product, fast shipping to NYC.",
|
|
596
|
+
"2 stars. Product broke, slow delivery to LA."
|
|
563
597
|
])
|
|
564
|
-
|
|
565
|
-
# Infer schema for sentiment analysis
|
|
566
598
|
schema = reviews.ai.infer_schema(
|
|
567
|
-
|
|
599
|
+
"Extract review ratings and shipping information"
|
|
600
|
+
)
|
|
601
|
+
|
|
602
|
+
# Hierarchical structure
|
|
603
|
+
orders = pd.Series([
|
|
604
|
+
"Order #123: John Doe, 123 Main St, NYC. Items: iPhone ($999), Case ($29)",
|
|
605
|
+
"Order #456: Jane Smith, 456 Oak Ave, LA. Items: iPad ($799)"
|
|
606
|
+
])
|
|
607
|
+
schema = orders.ai.infer_schema(
|
|
608
|
+
"Extract order details including customer and items"
|
|
568
609
|
)
|
|
610
|
+
# Inferred schema may include nested structures like:
|
|
611
|
+
# - customer: {name: str, address: str, city: str}
|
|
612
|
+
# - items: [{product: str, price: float}]
|
|
569
613
|
|
|
570
|
-
#
|
|
571
|
-
extracted =
|
|
614
|
+
# Apply the schema for extraction
|
|
615
|
+
extracted = orders.ai.task(schema.task)
|
|
572
616
|
```
|
|
573
617
|
|
|
574
618
|
Note:
|
|
575
|
-
The
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
619
|
+
The inference process uses multiple AI iterations to ensure schema
|
|
620
|
+
validity. Nested structures are automatically detected when the
|
|
621
|
+
data contains hierarchical relationships. The generated Pydantic
|
|
622
|
+
model ensures type safety and validation for all extracted data.
|
|
579
623
|
"""
|
|
580
624
|
inferer = CONTAINER.resolve(SchemaInferer)
|
|
581
625
|
|
|
582
626
|
input: SchemaInferenceInput = SchemaInferenceInput(
|
|
583
|
-
examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
|
|
627
|
+
examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
|
|
628
|
+
instructions=instructions,
|
|
629
|
+
**api_kwargs,
|
|
584
630
|
)
|
|
585
631
|
return inferer.infer_schema(input)
|
|
586
632
|
|
|
@@ -643,8 +689,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
643
689
|
instructions: str,
|
|
644
690
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
645
691
|
response_format: type[ResponseFormat] = str,
|
|
646
|
-
temperature: float | None = 0.0,
|
|
647
|
-
top_p: float = 1.0,
|
|
648
692
|
**api_kwargs,
|
|
649
693
|
) -> pd.Series:
|
|
650
694
|
"""Generate a response for each row after serializing it to JSON using a provided cache.
|
|
@@ -678,8 +722,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
678
722
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
679
723
|
response_format (type[ResponseFormat], optional): Desired Python type of the
|
|
680
724
|
responses. Defaults to ``str``.
|
|
681
|
-
|
|
682
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
725
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
683
726
|
|
|
684
727
|
Returns:
|
|
685
728
|
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
@@ -688,8 +731,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
688
731
|
instructions=instructions,
|
|
689
732
|
cache=cache,
|
|
690
733
|
response_format=response_format,
|
|
691
|
-
temperature=temperature,
|
|
692
|
-
top_p=top_p,
|
|
693
734
|
**api_kwargs,
|
|
694
735
|
)
|
|
695
736
|
|
|
@@ -698,8 +739,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
698
739
|
instructions: str,
|
|
699
740
|
response_format: type[ResponseFormat] = str,
|
|
700
741
|
batch_size: int | None = None,
|
|
701
|
-
temperature: float | None = 0.0,
|
|
702
|
-
top_p: float = 1.0,
|
|
703
742
|
show_progress: bool = False,
|
|
704
743
|
**api_kwargs,
|
|
705
744
|
) -> pd.Series:
|
|
@@ -731,9 +770,8 @@ class OpenAIVecDataFrameAccessor:
|
|
|
731
770
|
batch_size (int | None, optional): Number of requests sent in one batch.
|
|
732
771
|
Defaults to ``None`` (automatic batch size optimization
|
|
733
772
|
based on execution time). Set to a positive integer for fixed batch size.
|
|
734
|
-
temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
|
|
735
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
736
773
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
774
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
737
775
|
|
|
738
776
|
Returns:
|
|
739
777
|
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
@@ -742,8 +780,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
742
780
|
instructions=instructions,
|
|
743
781
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
744
782
|
response_format=response_format,
|
|
745
|
-
temperature=temperature,
|
|
746
|
-
top_p=top_p,
|
|
747
783
|
**api_kwargs,
|
|
748
784
|
)
|
|
749
785
|
|
|
@@ -751,7 +787,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
751
787
|
self,
|
|
752
788
|
task: PreparedTask[ResponseFormat],
|
|
753
789
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
754
|
-
**api_kwargs,
|
|
755
790
|
) -> pd.Series:
|
|
756
791
|
"""Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
|
|
757
792
|
|
|
@@ -759,9 +794,8 @@ class OpenAIVecDataFrameAccessor:
|
|
|
759
794
|
task (PreparedTask): Prepared task (instructions + response_format + sampling params).
|
|
760
795
|
cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
|
|
761
796
|
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
``seed``) forwarded verbatim. Core routing keys are managed internally.
|
|
797
|
+
Note:
|
|
798
|
+
The task's stored API parameters are used. Core routing keys are managed internally.
|
|
765
799
|
|
|
766
800
|
Returns:
|
|
767
801
|
pandas.Series: Task results aligned with the DataFrame's original index.
|
|
@@ -769,7 +803,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
769
803
|
return _df_rows_to_json_series(self._obj).ai.task_with_cache(
|
|
770
804
|
task=task,
|
|
771
805
|
cache=cache,
|
|
772
|
-
**api_kwargs,
|
|
773
806
|
)
|
|
774
807
|
|
|
775
808
|
def task(
|
|
@@ -777,7 +810,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
777
810
|
task: PreparedTask,
|
|
778
811
|
batch_size: int | None = None,
|
|
779
812
|
show_progress: bool = False,
|
|
780
|
-
**api_kwargs,
|
|
781
813
|
) -> pd.Series:
|
|
782
814
|
"""Execute a prepared task on each DataFrame row after serializing it to JSON.
|
|
783
815
|
|
|
@@ -813,9 +845,8 @@ class OpenAIVecDataFrameAccessor:
|
|
|
813
845
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
814
846
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
815
847
|
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
848
|
+
Note:
|
|
849
|
+
The task's stored API parameters are used. Core batching / routing
|
|
819
850
|
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
820
851
|
library and cannot be overridden.
|
|
821
852
|
|
|
@@ -827,99 +858,108 @@ class OpenAIVecDataFrameAccessor:
|
|
|
827
858
|
task=task,
|
|
828
859
|
batch_size=batch_size,
|
|
829
860
|
show_progress=show_progress,
|
|
830
|
-
**api_kwargs,
|
|
831
861
|
)
|
|
832
862
|
|
|
833
863
|
def parse_with_cache(
|
|
834
864
|
self,
|
|
835
865
|
instructions: str,
|
|
836
866
|
cache: BatchingMapProxy[str, ResponseFormat],
|
|
837
|
-
response_format: ResponseFormat = None,
|
|
867
|
+
response_format: type[ResponseFormat] | None = None,
|
|
838
868
|
max_examples: int = 100,
|
|
839
|
-
temperature: float | None = 0.0,
|
|
840
|
-
top_p: float = 1.0,
|
|
841
869
|
**api_kwargs,
|
|
842
870
|
) -> pd.Series:
|
|
843
|
-
"""Parse DataFrame rows using an LLM with a provided cache.
|
|
871
|
+
"""Parse DataFrame rows into structured data using an LLM with a provided cache.
|
|
844
872
|
|
|
845
|
-
This method
|
|
846
|
-
|
|
847
|
-
|
|
873
|
+
This method processes each DataFrame row (converted to JSON) and extracts
|
|
874
|
+
structured information using an LLM. External cache control enables
|
|
875
|
+
deduplication across operations and custom batch management.
|
|
848
876
|
|
|
849
877
|
Args:
|
|
850
|
-
instructions (str):
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
for
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
878
|
+
instructions (str): Plain language description of what information
|
|
879
|
+
to extract from each row (e.g., "Extract shipping details and
|
|
880
|
+
order status"). Guides both extraction and schema inference.
|
|
881
|
+
cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
|
|
882
|
+
instance for managing API call batching and deduplication.
|
|
883
|
+
Set cache.batch_size=None for automatic optimization.
|
|
884
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
885
|
+
structure for parsed data. Can be a Pydantic model, built-in
|
|
886
|
+
type, or None for automatic schema inference. Defaults to None.
|
|
887
|
+
max_examples (int, optional): Maximum rows to analyze when inferring
|
|
888
|
+
schema (only used when response_format is None). Defaults to 100.
|
|
889
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p,
|
|
890
|
+
frequency_penalty, presence_penalty, seed, etc.).
|
|
863
891
|
|
|
864
892
|
Returns:
|
|
865
|
-
pandas.Series: Series
|
|
866
|
-
|
|
893
|
+
pandas.Series: Series containing parsed structured data as instances
|
|
894
|
+
of response_format or the inferred schema model, indexed like
|
|
895
|
+
the original DataFrame.
|
|
867
896
|
"""
|
|
868
897
|
return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
|
|
869
898
|
instructions=instructions,
|
|
870
899
|
cache=cache,
|
|
871
900
|
response_format=response_format,
|
|
872
901
|
max_examples=max_examples,
|
|
873
|
-
temperature=temperature,
|
|
874
|
-
top_p=top_p,
|
|
875
902
|
**api_kwargs,
|
|
876
903
|
)
|
|
877
904
|
|
|
878
905
|
def parse(
|
|
879
906
|
self,
|
|
880
907
|
instructions: str,
|
|
881
|
-
response_format: ResponseFormat = None,
|
|
908
|
+
response_format: type[ResponseFormat] | None = None,
|
|
882
909
|
max_examples: int = 100,
|
|
883
910
|
batch_size: int | None = None,
|
|
884
911
|
show_progress: bool = False,
|
|
885
|
-
temperature: float | None = 0.0,
|
|
886
|
-
top_p: float = 1.0,
|
|
887
912
|
**api_kwargs,
|
|
888
913
|
) -> pd.Series:
|
|
889
|
-
"""Parse DataFrame rows using an LLM
|
|
914
|
+
"""Parse DataFrame rows into structured data using an LLM.
|
|
890
915
|
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
916
|
+
Each row is converted to JSON and processed to extract structured
|
|
917
|
+
information. When no response format is provided, the method
|
|
918
|
+
automatically infers an appropriate schema from the data.
|
|
894
919
|
|
|
895
920
|
Args:
|
|
896
|
-
instructions (str):
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
921
|
+
instructions (str): Plain language description of extraction goals
|
|
922
|
+
(e.g., "Extract transaction details including amount, date,
|
|
923
|
+
and merchant"). Guides extraction and schema inference.
|
|
924
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
925
|
+
structure for parsed data. Can be a Pydantic model, built-in
|
|
926
|
+
type, or None for automatic inference. Defaults to None.
|
|
927
|
+
max_examples (int, optional): Maximum rows to analyze for schema
|
|
928
|
+
inference (when response_format is None). Defaults to 100.
|
|
929
|
+
batch_size (int | None, optional): Rows per API batch. None
|
|
930
|
+
enables automatic optimization. Defaults to None.
|
|
931
|
+
show_progress (bool, optional): Show progress bar in Jupyter
|
|
932
|
+
notebooks. Defaults to False.
|
|
933
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
907
934
|
|
|
908
935
|
Returns:
|
|
909
|
-
pandas.Series:
|
|
910
|
-
|
|
936
|
+
pandas.Series: Parsed structured data indexed like the original
|
|
937
|
+
DataFrame.
|
|
938
|
+
|
|
939
|
+
Example:
|
|
940
|
+
```python
|
|
941
|
+
df = pd.DataFrame({
|
|
942
|
+
'log': [
|
|
943
|
+
'2024-01-01 10:00 ERROR Database connection failed',
|
|
944
|
+
'2024-01-01 10:05 INFO Service started successfully'
|
|
945
|
+
]
|
|
946
|
+
})
|
|
947
|
+
|
|
948
|
+
# With automatic schema inference
|
|
949
|
+
parsed = df.ai.parse("Extract timestamp, level, and message")
|
|
950
|
+
# Returns Series with inferred structure like:
|
|
951
|
+
# {timestamp: str, level: str, message: str}
|
|
952
|
+
```
|
|
911
953
|
"""
|
|
912
954
|
return self.parse_with_cache(
|
|
913
955
|
instructions=instructions,
|
|
914
956
|
cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
|
|
915
957
|
response_format=response_format,
|
|
916
958
|
max_examples=max_examples,
|
|
917
|
-
temperature=temperature,
|
|
918
|
-
top_p=top_p,
|
|
919
959
|
**api_kwargs,
|
|
920
960
|
)
|
|
921
961
|
|
|
922
|
-
def infer_schema(self,
|
|
962
|
+
def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> InferredSchema:
|
|
923
963
|
"""Infer a structured data schema from DataFrame rows using AI.
|
|
924
964
|
|
|
925
965
|
This method analyzes a sample of DataFrame rows to automatically infer
|
|
@@ -928,7 +968,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
928
968
|
field types, and potential categorical values.
|
|
929
969
|
|
|
930
970
|
Args:
|
|
931
|
-
|
|
971
|
+
instructions (str): Plain language description of how the extracted
|
|
932
972
|
structured data will be used (e.g., "Extract operational metrics
|
|
933
973
|
for dashboard", "Parse customer attributes for segmentation").
|
|
934
974
|
This guides field relevance and helps exclude irrelevant information.
|
|
@@ -938,7 +978,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
938
978
|
|
|
939
979
|
Returns:
|
|
940
980
|
InferredSchema: An object containing:
|
|
941
|
-
-
|
|
981
|
+
- instructions: Normalized statement of the extraction objective
|
|
942
982
|
- fields: List of field specifications with names, types, and descriptions
|
|
943
983
|
- inference_prompt: Reusable prompt for future extractions
|
|
944
984
|
- model: Dynamically generated Pydantic model for parsing
|
|
@@ -957,7 +997,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
957
997
|
|
|
958
998
|
# Infer schema for logistics tracking
|
|
959
999
|
schema = df.ai.infer_schema(
|
|
960
|
-
|
|
1000
|
+
instructions="Extract shipping status and location data for logistics tracking"
|
|
961
1001
|
)
|
|
962
1002
|
|
|
963
1003
|
# Apply the schema to extract structured data
|
|
@@ -965,14 +1005,15 @@ class OpenAIVecDataFrameAccessor:
|
|
|
965
1005
|
```
|
|
966
1006
|
|
|
967
1007
|
Note:
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
1008
|
+
Each row is converted to JSON before analysis. The inference
|
|
1009
|
+
process automatically detects hierarchical relationships and
|
|
1010
|
+
creates appropriate nested structures when present. The generated
|
|
1011
|
+
Pydantic model ensures type safety and validation.
|
|
972
1012
|
"""
|
|
973
1013
|
return _df_rows_to_json_series(self._obj).ai.infer_schema(
|
|
974
|
-
|
|
1014
|
+
instructions=instructions,
|
|
975
1015
|
max_examples=max_examples,
|
|
1016
|
+
**api_kwargs,
|
|
976
1017
|
)
|
|
977
1018
|
|
|
978
1019
|
def extract(self, column: str) -> pd.DataFrame:
|
|
@@ -1013,7 +1054,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
1013
1054
|
max_examples: int = 500,
|
|
1014
1055
|
batch_size: int | None = None,
|
|
1015
1056
|
show_progress: bool = False,
|
|
1016
|
-
**api_kwargs,
|
|
1017
1057
|
) -> pd.DataFrame:
|
|
1018
1058
|
"""Fill missing values in a DataFrame column using AI-powered inference.
|
|
1019
1059
|
|
|
@@ -1033,10 +1073,6 @@ class OpenAIVecDataFrameAccessor:
|
|
|
1033
1073
|
optimization based on execution time). Set to a positive integer for fixed batch size.
|
|
1034
1074
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1035
1075
|
|
|
1036
|
-
Additional Keyword Args:
|
|
1037
|
-
Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
|
|
1038
|
-
``seed``, etc.) are forwarded verbatim to the underlying task execution.
|
|
1039
|
-
|
|
1040
1076
|
Returns:
|
|
1041
1077
|
pandas.DataFrame: A new DataFrame with missing values filled in the target
|
|
1042
1078
|
column. The original DataFrame is not modified.
|
|
@@ -1068,7 +1104,7 @@ class OpenAIVecDataFrameAccessor:
|
|
|
1068
1104
|
return self._obj
|
|
1069
1105
|
|
|
1070
1106
|
filled_values: list[FillNaResponse] = missing_rows.ai.task(
|
|
1071
|
-
task=task, batch_size=batch_size, show_progress=show_progress
|
|
1107
|
+
task=task, batch_size=batch_size, show_progress=show_progress
|
|
1072
1108
|
)
|
|
1073
1109
|
|
|
1074
1110
|
# get deep copy of the DataFrame to avoid modifying the original
|
|
@@ -1128,8 +1164,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1128
1164
|
instructions: str,
|
|
1129
1165
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1130
1166
|
response_format: type[ResponseFormat] = str,
|
|
1131
|
-
temperature: float | None = 0.0,
|
|
1132
|
-
top_p: float = 1.0,
|
|
1133
1167
|
**api_kwargs,
|
|
1134
1168
|
) -> pd.Series:
|
|
1135
1169
|
"""Call an LLM once for every Series element using a provided cache (asynchronously).
|
|
@@ -1156,13 +1190,11 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1156
1190
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1157
1191
|
response_format (type[ResponseFormat], optional): Pydantic model or built‑in
|
|
1158
1192
|
type the assistant should return. Defaults to ``str``.
|
|
1159
|
-
temperature (float | None, optional): Sampling temperature. ``None`` omits the
|
|
1160
|
-
parameter (recommended for reasoning models). Defaults to ``0.0``.
|
|
1161
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1162
1193
|
**api_kwargs: Additional keyword arguments forwarded verbatim to
|
|
1163
|
-
``AsyncOpenAI.responses.parse`` (e.g. ``
|
|
1164
|
-
future parameters). Core batching keys
|
|
1165
|
-
text_format) are protected and silently
|
|
1194
|
+
``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
|
|
1195
|
+
``max_output_tokens``, penalties, future parameters). Core batching keys
|
|
1196
|
+
(model, instructions, input, text_format) are protected and silently
|
|
1197
|
+
ignored if provided.
|
|
1166
1198
|
|
|
1167
1199
|
Returns:
|
|
1168
1200
|
pandas.Series: Series whose values are instances of ``response_format``.
|
|
@@ -1176,14 +1208,10 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1176
1208
|
system_message=instructions,
|
|
1177
1209
|
response_format=response_format,
|
|
1178
1210
|
cache=cache,
|
|
1179
|
-
|
|
1180
|
-
top_p=top_p,
|
|
1211
|
+
api_kwargs=api_kwargs,
|
|
1181
1212
|
)
|
|
1182
1213
|
|
|
1183
|
-
|
|
1184
|
-
proxy_params = {"show_progress", "batch_size", "max_concurrency"}
|
|
1185
|
-
filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
|
|
1186
|
-
results = await client.parse(self._obj.tolist(), **filtered_kwargs)
|
|
1214
|
+
results = await client.parse(self._obj.tolist())
|
|
1187
1215
|
return pd.Series(results, index=self._obj.index, name=self._obj.name)
|
|
1188
1216
|
|
|
1189
1217
|
async def responses(
|
|
@@ -1191,8 +1219,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1191
1219
|
instructions: str,
|
|
1192
1220
|
response_format: type[ResponseFormat] = str,
|
|
1193
1221
|
batch_size: int | None = None,
|
|
1194
|
-
temperature: float | None = 0.0,
|
|
1195
|
-
top_p: float = 1.0,
|
|
1196
1222
|
max_concurrency: int = 8,
|
|
1197
1223
|
show_progress: bool = False,
|
|
1198
1224
|
**api_kwargs,
|
|
@@ -1222,11 +1248,14 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1222
1248
|
batch_size (int | None, optional): Number of prompts grouped into a single
|
|
1223
1249
|
request. Defaults to ``None`` (automatic batch size optimization
|
|
1224
1250
|
based on execution time). Set to a positive integer for fixed batch size.
|
|
1225
|
-
temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
|
|
1226
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1227
1251
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
1228
1252
|
requests. Defaults to ``8``.
|
|
1229
1253
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1254
|
+
**api_kwargs: Additional keyword arguments forwarded verbatim to
|
|
1255
|
+
``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
|
|
1256
|
+
``max_output_tokens``, penalties, future parameters). Core batching keys
|
|
1257
|
+
(model, instructions, input, text_format) are protected and silently
|
|
1258
|
+
ignored if provided.
|
|
1230
1259
|
|
|
1231
1260
|
Returns:
|
|
1232
1261
|
pandas.Series: Series whose values are instances of ``response_format``.
|
|
@@ -1240,14 +1269,13 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1240
1269
|
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1241
1270
|
),
|
|
1242
1271
|
response_format=response_format,
|
|
1243
|
-
temperature=temperature,
|
|
1244
|
-
top_p=top_p,
|
|
1245
1272
|
**api_kwargs,
|
|
1246
1273
|
)
|
|
1247
1274
|
|
|
1248
1275
|
async def embeddings_with_cache(
|
|
1249
1276
|
self,
|
|
1250
1277
|
cache: AsyncBatchingMapProxy[str, np.ndarray],
|
|
1278
|
+
**api_kwargs,
|
|
1251
1279
|
) -> pd.Series:
|
|
1252
1280
|
"""Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
|
|
1253
1281
|
|
|
@@ -1275,6 +1303,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1275
1303
|
cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
|
|
1276
1304
|
instance for managing API call batching and deduplication.
|
|
1277
1305
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1306
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
1278
1307
|
|
|
1279
1308
|
Returns:
|
|
1280
1309
|
pandas.Series: Series whose values are ``np.ndarray`` objects
|
|
@@ -1287,6 +1316,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1287
1316
|
client=CONTAINER.resolve(AsyncOpenAI),
|
|
1288
1317
|
model_name=CONTAINER.resolve(EmbeddingsModelName).value,
|
|
1289
1318
|
cache=cache,
|
|
1319
|
+
api_kwargs=api_kwargs,
|
|
1290
1320
|
)
|
|
1291
1321
|
|
|
1292
1322
|
# Await the async operation
|
|
@@ -1299,7 +1329,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1299
1329
|
)
|
|
1300
1330
|
|
|
1301
1331
|
async def embeddings(
|
|
1302
|
-
self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
|
|
1332
|
+
self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False, **api_kwargs
|
|
1303
1333
|
) -> pd.Series:
|
|
1304
1334
|
"""Compute OpenAI embeddings for every Series element (asynchronously).
|
|
1305
1335
|
|
|
@@ -1325,6 +1355,7 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1325
1355
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
1326
1356
|
requests. Defaults to ``8``.
|
|
1327
1357
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1358
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
1328
1359
|
|
|
1329
1360
|
Returns:
|
|
1330
1361
|
pandas.Series: Series whose values are ``np.ndarray`` objects
|
|
@@ -1337,13 +1368,13 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1337
1368
|
cache=AsyncBatchingMapProxy(
|
|
1338
1369
|
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1339
1370
|
),
|
|
1371
|
+
**api_kwargs,
|
|
1340
1372
|
)
|
|
1341
1373
|
|
|
1342
1374
|
async def task_with_cache(
|
|
1343
1375
|
self,
|
|
1344
1376
|
task: PreparedTask[ResponseFormat],
|
|
1345
1377
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1346
|
-
**api_kwargs,
|
|
1347
1378
|
) -> pd.Series:
|
|
1348
1379
|
"""Execute a prepared task on every Series element using a provided cache (asynchronously).
|
|
1349
1380
|
|
|
@@ -1394,11 +1425,9 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1394
1425
|
system_message=task.instructions,
|
|
1395
1426
|
response_format=task.response_format,
|
|
1396
1427
|
cache=cache,
|
|
1397
|
-
|
|
1398
|
-
top_p=task.top_p,
|
|
1428
|
+
api_kwargs=task.api_kwargs,
|
|
1399
1429
|
)
|
|
1400
|
-
|
|
1401
|
-
results = await client.parse(self._obj.tolist(), **api_kwargs)
|
|
1430
|
+
results = await client.parse(self._obj.tolist())
|
|
1402
1431
|
|
|
1403
1432
|
return pd.Series(results, index=self._obj.index, name=self._obj.name)
|
|
1404
1433
|
|
|
@@ -1408,7 +1437,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1408
1437
|
batch_size: int | None = None,
|
|
1409
1438
|
max_concurrency: int = 8,
|
|
1410
1439
|
show_progress: bool = False,
|
|
1411
|
-
**api_kwargs,
|
|
1412
1440
|
) -> pd.Series:
|
|
1413
1441
|
"""Execute a prepared task on every Series element (asynchronously).
|
|
1414
1442
|
|
|
@@ -1443,9 +1471,8 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1443
1471
|
requests. Defaults to 8.
|
|
1444
1472
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
1445
1473
|
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
|
|
1474
|
+
Note:
|
|
1475
|
+
The task's stored API parameters are used. Core batching / routing
|
|
1449
1476
|
keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
|
|
1450
1477
|
library and cannot be overridden.
|
|
1451
1478
|
|
|
@@ -1461,42 +1488,39 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1461
1488
|
cache=AsyncBatchingMapProxy(
|
|
1462
1489
|
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1463
1490
|
),
|
|
1464
|
-
**api_kwargs,
|
|
1465
1491
|
)
|
|
1466
1492
|
|
|
1467
1493
|
async def parse_with_cache(
|
|
1468
1494
|
self,
|
|
1469
1495
|
instructions: str,
|
|
1470
1496
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1471
|
-
response_format: ResponseFormat = None,
|
|
1497
|
+
response_format: type[ResponseFormat] | None = None,
|
|
1472
1498
|
max_examples: int = 100,
|
|
1473
|
-
temperature: float | None = 0.0,
|
|
1474
|
-
top_p: float = 1.0,
|
|
1475
1499
|
**api_kwargs,
|
|
1476
1500
|
) -> pd.Series:
|
|
1477
|
-
"""Parse Series values using an LLM with a provided cache (asynchronously).
|
|
1501
|
+
"""Parse Series values into structured data using an LLM with a provided cache (asynchronously).
|
|
1478
1502
|
|
|
1479
|
-
This method
|
|
1480
|
-
|
|
1503
|
+
This async method provides external cache control while parsing Series
|
|
1504
|
+
content into structured data. Automatic schema inference is performed
|
|
1505
|
+
when no response format is specified.
|
|
1481
1506
|
|
|
1482
1507
|
Args:
|
|
1483
|
-
instructions (str):
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
for
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
`seed`, etc.) are forwarded verbatim to the underlying client.
|
|
1508
|
+
instructions (str): Plain language description of what to extract
|
|
1509
|
+
(e.g., "Extract dates, amounts, and descriptions from receipts").
|
|
1510
|
+
Guides both extraction and schema inference.
|
|
1511
|
+
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
|
|
1512
|
+
async cache for managing concurrent API calls and deduplication.
|
|
1513
|
+
Set cache.batch_size=None for automatic optimization.
|
|
1514
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
1515
|
+
structure for parsed data. Can be a Pydantic model, built-in
|
|
1516
|
+
type, or None for automatic inference. Defaults to None.
|
|
1517
|
+
max_examples (int, optional): Maximum values to analyze for schema
|
|
1518
|
+
inference (when response_format is None). Defaults to 100.
|
|
1519
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
1496
1520
|
|
|
1497
1521
|
Returns:
|
|
1498
|
-
pandas.Series: Series
|
|
1499
|
-
|
|
1522
|
+
pandas.Series: Series containing parsed structured data aligned
|
|
1523
|
+
with the original index.
|
|
1500
1524
|
|
|
1501
1525
|
Note:
|
|
1502
1526
|
This is an asynchronous method and must be awaited.
|
|
@@ -1504,51 +1528,59 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1504
1528
|
schema: InferredSchema | None = None
|
|
1505
1529
|
if response_format is None:
|
|
1506
1530
|
# Use synchronous schema inference
|
|
1507
|
-
schema = self._obj.ai.infer_schema(
|
|
1531
|
+
schema = self._obj.ai.infer_schema(instructions=instructions, max_examples=max_examples)
|
|
1508
1532
|
|
|
1509
1533
|
return await self.responses_with_cache(
|
|
1510
1534
|
instructions=schema.inference_prompt if schema else instructions,
|
|
1511
1535
|
cache=cache,
|
|
1512
1536
|
response_format=response_format or schema.model,
|
|
1513
|
-
temperature=temperature,
|
|
1514
|
-
top_p=top_p,
|
|
1515
1537
|
**api_kwargs,
|
|
1516
1538
|
)
|
|
1517
1539
|
|
|
1518
1540
|
async def parse(
|
|
1519
1541
|
self,
|
|
1520
1542
|
instructions: str,
|
|
1521
|
-
response_format: ResponseFormat = None,
|
|
1543
|
+
response_format: type[ResponseFormat] | None = None,
|
|
1522
1544
|
max_examples: int = 100,
|
|
1523
1545
|
batch_size: int | None = None,
|
|
1524
1546
|
max_concurrency: int = 8,
|
|
1525
1547
|
show_progress: bool = False,
|
|
1526
|
-
temperature: float | None = 0.0,
|
|
1527
|
-
top_p: float = 1.0,
|
|
1528
1548
|
**api_kwargs,
|
|
1529
1549
|
) -> pd.Series:
|
|
1530
|
-
"""Parse Series values using an LLM
|
|
1550
|
+
"""Parse Series values into structured data using an LLM (asynchronously).
|
|
1531
1551
|
|
|
1532
|
-
|
|
1533
|
-
|
|
1552
|
+
Async version of the parse method, extracting structured information
|
|
1553
|
+
from unstructured text with automatic schema inference when needed.
|
|
1534
1554
|
|
|
1535
1555
|
Args:
|
|
1536
|
-
instructions (str):
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1556
|
+
instructions (str): Plain language extraction goals (e.g., "Extract
|
|
1557
|
+
product names, prices, and categories from descriptions").
|
|
1558
|
+
response_format (type[ResponseFormat] | None, optional): Target
|
|
1559
|
+
structure. None triggers automatic schema inference. Defaults to None.
|
|
1560
|
+
max_examples (int, optional): Maximum values for schema inference.
|
|
1540
1561
|
Defaults to 100.
|
|
1541
|
-
batch_size (int | None):
|
|
1542
|
-
Defaults to None
|
|
1543
|
-
max_concurrency (int): Maximum
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
top_p (float): Nucleus sampling parameter. Defaults to 1.0.
|
|
1562
|
+
batch_size (int | None, optional): Requests per batch. None for
|
|
1563
|
+
automatic optimization. Defaults to None.
|
|
1564
|
+
max_concurrency (int, optional): Maximum concurrent API requests.
|
|
1565
|
+
Defaults to 8.
|
|
1566
|
+
show_progress (bool, optional): Show progress bar. Defaults to False.
|
|
1567
|
+
**api_kwargs: Additional OpenAI API parameters.
|
|
1548
1568
|
|
|
1549
1569
|
Returns:
|
|
1550
|
-
pandas.Series:
|
|
1551
|
-
|
|
1570
|
+
pandas.Series: Parsed structured data indexed like the original Series.
|
|
1571
|
+
|
|
1572
|
+
Example:
|
|
1573
|
+
```python
|
|
1574
|
+
emails = pd.Series([
|
|
1575
|
+
"Meeting tomorrow at 3pm with John about Q4 planning",
|
|
1576
|
+
"Lunch with Sarah on Friday to discuss new project"
|
|
1577
|
+
])
|
|
1578
|
+
|
|
1579
|
+
# Async extraction with schema inference
|
|
1580
|
+
parsed = await emails.aio.parse(
|
|
1581
|
+
"Extract meeting details including time, person, and topic"
|
|
1582
|
+
)
|
|
1583
|
+
```
|
|
1552
1584
|
|
|
1553
1585
|
Note:
|
|
1554
1586
|
This is an asynchronous method and must be awaited.
|
|
@@ -1560,8 +1592,6 @@ class AsyncOpenAIVecSeriesAccessor:
|
|
|
1560
1592
|
),
|
|
1561
1593
|
response_format=response_format,
|
|
1562
1594
|
max_examples=max_examples,
|
|
1563
|
-
temperature=temperature,
|
|
1564
|
-
top_p=top_p,
|
|
1565
1595
|
**api_kwargs,
|
|
1566
1596
|
)
|
|
1567
1597
|
|
|
@@ -1578,8 +1608,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1578
1608
|
instructions: str,
|
|
1579
1609
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1580
1610
|
response_format: type[ResponseFormat] = str,
|
|
1581
|
-
temperature: float | None = 0.0,
|
|
1582
|
-
top_p: float = 1.0,
|
|
1583
1611
|
**api_kwargs,
|
|
1584
1612
|
) -> pd.Series:
|
|
1585
1613
|
"""Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
|
|
@@ -1615,8 +1643,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1615
1643
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1616
1644
|
response_format (type[ResponseFormat], optional): Desired Python type of the
|
|
1617
1645
|
responses. Defaults to ``str``.
|
|
1618
|
-
|
|
1619
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1646
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
1620
1647
|
|
|
1621
1648
|
Returns:
|
|
1622
1649
|
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
@@ -1629,8 +1656,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1629
1656
|
instructions=instructions,
|
|
1630
1657
|
cache=cache,
|
|
1631
1658
|
response_format=response_format,
|
|
1632
|
-
temperature=temperature,
|
|
1633
|
-
top_p=top_p,
|
|
1634
1659
|
**api_kwargs,
|
|
1635
1660
|
)
|
|
1636
1661
|
|
|
@@ -1639,8 +1664,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1639
1664
|
instructions: str,
|
|
1640
1665
|
response_format: type[ResponseFormat] = str,
|
|
1641
1666
|
batch_size: int | None = None,
|
|
1642
|
-
temperature: float | None = 0.0,
|
|
1643
|
-
top_p: float = 1.0,
|
|
1644
1667
|
max_concurrency: int = 8,
|
|
1645
1668
|
show_progress: bool = False,
|
|
1646
1669
|
**api_kwargs,
|
|
@@ -1674,8 +1697,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1674
1697
|
batch_size (int | None, optional): Number of requests sent in one batch.
|
|
1675
1698
|
Defaults to ``None`` (automatic batch size optimization
|
|
1676
1699
|
based on execution time). Set to a positive integer for fixed batch size.
|
|
1677
|
-
|
|
1678
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1700
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
1679
1701
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
1680
1702
|
requests. Defaults to ``8``.
|
|
1681
1703
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
@@ -1692,8 +1714,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1692
1714
|
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1693
1715
|
),
|
|
1694
1716
|
response_format=response_format,
|
|
1695
|
-
temperature=temperature,
|
|
1696
|
-
top_p=top_p,
|
|
1697
1717
|
**api_kwargs,
|
|
1698
1718
|
)
|
|
1699
1719
|
|
|
@@ -1701,7 +1721,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1701
1721
|
self,
|
|
1702
1722
|
task: PreparedTask[ResponseFormat],
|
|
1703
1723
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1704
|
-
**api_kwargs,
|
|
1705
1724
|
) -> pd.Series:
|
|
1706
1725
|
"""Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
|
|
1707
1726
|
|
|
@@ -1711,8 +1730,8 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1711
1730
|
task (PreparedTask): Prepared task (instructions + response_format + sampling params).
|
|
1712
1731
|
cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
|
|
1713
1732
|
|
|
1714
|
-
|
|
1715
|
-
|
|
1733
|
+
Note:
|
|
1734
|
+
The task's stored API parameters are used. Core routing keys are managed internally.
|
|
1716
1735
|
|
|
1717
1736
|
Returns:
|
|
1718
1737
|
pandas.Series: Task results aligned with the DataFrame's original index.
|
|
@@ -1723,7 +1742,6 @@ class AsyncOpenAIVecDataFrameAccessor:
         return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
             task=task,
             cache=cache,
-            **api_kwargs,
         )
 
     async def task(
@@ -1732,7 +1750,6 @@ class AsyncOpenAIVecDataFrameAccessor:
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).
 
@@ -1771,9 +1788,8 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+        Note:
+            The task's stored API parameters are used. Core batching / routing
             keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
             library and cannot be overridden.
 
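For the plain ``task`` entry point the same contract now applies: batching knobs stay on the call, model parameters live in the ``PreparedTask``. A usage sketch; the exact prepared-task constant is an assumption based on the ``openaivec.task.nlp`` module names listed in this release:

```python
import pandas as pd
from openaivec.task import nlp  # sentiment_analysis.py ships in this package

async def classify(df: pd.DataFrame) -> pd.Series:
    # Only batching / concurrency / progress are configurable here now;
    # temperature and friends are baked into the task definition.
    return await df.aio.task(
        task=nlp.SENTIMENT_ANALYSIS,  # assumed constant name
        batch_size=64,
        max_concurrency=8,
        show_progress=True,
    )
```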
@@ -1790,43 +1806,34 @@ class AsyncOpenAIVecDataFrameAccessor:
             batch_size=batch_size,
             max_concurrency=max_concurrency,
             show_progress=show_progress,
-            **api_kwargs,
         )
 
     async def parse_with_cache(
         self,
         instructions: str,
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse DataFrame rows using an LLM with
+        """Parse DataFrame rows into structured data using an LLM with cache (asynchronously).
 
-
-
-        on the provided purpose.
+        Async method for parsing DataFrame rows (as JSON) with external cache
+        control, enabling deduplication across operations and concurrent processing.
 
         Args:
-            instructions (str):
-
-
-
-
-
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                invoice details including items, quantities, and totals").
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for concurrent API call management.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
                 Defaults to 100.
-            temperature (float | None): Sampling temperature. Defaults to 0.0.
-            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
-
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
-            `seed`, etc.) are forwarded verbatim to the underlying client.
+            **api_kwargs: Additional OpenAI API parameters.
 
         Returns:
-            pandas.Series:
-                `response_format` or inferred schema model.
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
 
         Note:
             This is an asynchronous method and must be awaited.
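The tightened annotation (``type[ResponseFormat] | None``) makes explicit that ``parse_with_cache`` expects a class, not an instance. A sketch pairing it with an explicit Pydantic model so schema inference is skipped; cache construction is assumed as in the earlier sketch:

```python
from pydantic import BaseModel

class Invoice(BaseModel):
    item: str
    quantity: int
    total: float

async def parse_invoices(df, cache):
    # Passing the class itself (not an instance) satisfies the new
    # type[ResponseFormat] | None annotation; None would trigger inference.
    return await df.aio.parse_with_cache(
        instructions="Extract invoice details including items, quantities, and totals",
        cache=cache,
        response_format=Invoice,
    )
```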
@@ -1836,46 +1843,55 @@ class AsyncOpenAIVecDataFrameAccessor:
             cache=cache,
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
     async def parse(
         self,
         instructions: str,
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse DataFrame rows using an LLM
+        """Parse DataFrame rows into structured data using an LLM (asynchronously).
 
-
-
-        on the provided purpose.
+        Async version for extracting structured information from DataFrame rows,
+        with automatic schema inference when no format is specified.
 
         Args:
-            instructions (str):
-
-
-
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                customer details, order items, and payment information").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
                 Defaults to 100.
-            batch_size (int | None):
-                Defaults to None
-            max_concurrency (int): Maximum
-
-
-
-            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+            batch_size (int | None, optional): Rows per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters.
 
         Returns:
-            pandas.Series:
-
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'raw_data': [
+                    'Customer: John Doe, Order: 2 laptops @ $1200 each',
+                    'Customer: Jane Smith, Order: 5 phones @ $800 each'
+                ]
+            })
+
+            # Async parsing with automatic schema inference
+            parsed = await df.aio.parse(
+                "Extract customer name, product, quantity, and unit price"
+            )
+            ```
 
         Note:
             This is an asynchronous method and must be awaited.
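The new docstring example covers automatic inference; the complementary case, a caller-supplied schema, would look like this (a sketch reusing the docstring's sample data; per the docstring, an explicit format means no rows are sampled for inference):

```python
import pandas as pd
from pydantic import BaseModel

class Order(BaseModel):
    customer: str
    product: str
    quantity: int
    unit_price: float

df = pd.DataFrame({
    "raw_data": [
        "Customer: John Doe, Order: 2 laptops @ $1200 each",
        "Customer: Jane Smith, Order: 5 phones @ $800 each",
    ]
})

async def extract() -> pd.Series:
    # With an explicit response_format, max_examples plays no role:
    # schema inference is bypassed entirely.
    return await df.aio.parse(
        "Extract customer name, product, quantity, and unit price",
        response_format=Order,
    )
```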
@@ -1887,8 +1903,6 @@ class AsyncOpenAIVecDataFrameAccessor:
             ),
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
@@ -1992,7 +2006,6 @@ class AsyncOpenAIVecDataFrameAccessor:
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
 
@@ -2014,10 +2027,6 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying task execution.
-
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
                 column. The original DataFrame is not modified.
@@ -2055,7 +2064,10 @@ class AsyncOpenAIVecDataFrameAccessor:
             return self._obj
 
         filled_values: list[FillNaResponse] = await missing_rows.aio.task(
-            task=task,
+            task=task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
        )
 
         # get deep copy of the DataFrame to avoid modifying the original
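This last hunk is the substantive fix of the pair: ``fillna`` accepted ``batch_size``, ``max_concurrency``, and ``show_progress`` in its signature (see the context lines above) but the old call site passed only ``task=task``, silently ignoring them; they are now threaded through to the underlying ``task`` call. A sketch of the call this enables; the positional target-column argument is an assumption drawn from the docstring wording:

```python
import pandas as pd

async def fill(df: pd.DataFrame) -> pd.DataFrame:
    # These three knobs now actually reach the underlying task call.
    return await df.aio.fillna(
        "category",          # assumed: target column passed positionally
        batch_size=32,
        max_concurrency=4,
        show_progress=True,
    )
```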