openaivec 0.14.10__py3-none-any.whl → 0.14.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_di.py +21 -0
- openaivec/_embeddings.py +17 -4
- openaivec/_model.py +7 -12
- openaivec/_prompt.py +3 -6
- openaivec/_provider.py +8 -29
- openaivec/_responses.py +39 -117
- openaivec/_schema.py +27 -23
- openaivec/pandas_ext.py +356 -343
- openaivec/spark.py +253 -115
- openaivec/task/__init__.py +1 -1
- openaivec/task/customer_support/customer_sentiment.py +4 -9
- openaivec/task/customer_support/inquiry_classification.py +5 -8
- openaivec/task/customer_support/inquiry_summary.py +5 -6
- openaivec/task/customer_support/intent_analysis.py +5 -7
- openaivec/task/customer_support/response_suggestion.py +5 -8
- openaivec/task/customer_support/urgency_analysis.py +5 -8
- openaivec/task/nlp/dependency_parsing.py +1 -2
- openaivec/task/nlp/keyword_extraction.py +1 -2
- openaivec/task/nlp/morphological_analysis.py +1 -2
- openaivec/task/nlp/named_entity_recognition.py +1 -2
- openaivec/task/nlp/sentiment_analysis.py +1 -2
- openaivec/task/nlp/translation.py +1 -1
- openaivec/task/table/fillna.py +8 -3
- {openaivec-0.14.10.dist-info → openaivec-0.14.13.dist-info}/METADATA +40 -16
- openaivec-0.14.13.dist-info/RECORD +37 -0
- openaivec-0.14.10.dist-info/RECORD +0 -37
- {openaivec-0.14.10.dist-info → openaivec-0.14.13.dist-info}/WHEEL +0 -0
- {openaivec-0.14.10.dist-info → openaivec-0.14.13.dist-info}/licenses/LICENSE +0 -0
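The thread running through the `pandas_ext.py` and `spark.py` changes below is the removal of the dedicated `temperature` / `top_p` keyword arguments: sampling parameters now travel through `**api_kwargs` and are forwarded verbatim to the OpenAI client. A minimal before/after sketch (hypothetical data; the accessor-registering import path is assumed):

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

animals = pd.Series(["panda", "koala"])

# 0.14.10: sampling parameters were dedicated keyword arguments
# animals.ai.responses("translate creatively", temperature=0.8, top_p=0.9)

# 0.14.13: the same call still works -- temperature/top_p are absorbed by
# **api_kwargs, and any other Responses API parameter (seed, penalties, ...)
# can now ride along unchanged
animals.ai.responses("translate creatively", temperature=0.8, top_p=0.9, seed=42)
```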
openaivec/pandas_ext.py
CHANGED
@@ -181,8 +181,6 @@ class OpenAIVecSeriesAccessor:
         instructions: str,
         cache: BatchingMapProxy[str, ResponseFormat],
         response_format: type[ResponseFormat] = str,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element using a provided cache.
@@ -196,40 +194,30 @@
                 batching and deduplication control.
             response_format (type[ResponseFormat], optional): Pydantic model or built-in
                 type the assistant should return. Defaults to ``str``.
-            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying client.
+            **api_kwargs: Arbitrary OpenAI Responses API parameters (e.g. ``temperature``,
+                ``top_p``, ``frequency_penalty``, ``presence_penalty``, ``seed``, etc.) are
+                forwarded verbatim to the underlying client.
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
+
         client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
             system_message=instructions,
             response_format=response_format,
             cache=cache,
-            temperature=temperature,
-            top_p=top_p,
+            api_kwargs=api_kwargs,
         )
 
-
-        proxy_params = {"show_progress", "batch_size"}
-        filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
-        return pd.Series(
-            client.parse(self._obj.tolist(), **filtered_kwargs), index=self._obj.index, name=self._obj.name
-        )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
 
     def responses(
         self,
         instructions: str,
         response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
@@ -248,6 +236,12 @@
                 batch_size=32,
                 show_progress=True
             )
+
+            # With custom temperature
+            animals.ai.responses(
+                "translate creatively",
+                temperature=0.8
+            )
             ```
 
         Args:
@@ -257,9 +251,8 @@
             batch_size (int | None, optional): Number of prompts grouped into a single
                 request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
@@ -268,14 +261,13 @@
             instructions=instructions,
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
     def embeddings_with_cache(
         self,
         cache: BatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
     ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element using a provided cache.
 
@@ -299,6 +291,7 @@
             cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
                 instance for managing API call batching and deduplication.
                 Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional keyword arguments to pass to the OpenAI API.
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -308,6 +301,7 @@
             client=CONTAINER.resolve(OpenAI),
             model_name=CONTAINER.resolve(EmbeddingsModelName).value,
             cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         return pd.Series(
@@ -316,7 +310,7 @@
             name=self._obj.name,
         )
 
-    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = False, **api_kwargs) -> pd.Series:
         """Compute OpenAI embeddings for every Series element.
 
         Example:
@@ -338,6 +332,7 @@
                 single request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
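The embeddings hunks above add the same `**api_kwargs` pass-through to `embeddings` / `embeddings_with_cache`, which is what makes model-specific options such as `dimensions` (a standard parameter of the text-embedding-3 models, as the new docstring line notes) reachable. A sketch with hypothetical data:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

texts = pd.Series(["cat", "dog", "bird"])

# Each resulting value is an np.ndarray; dimensions is forwarded verbatim
vectors = texts.ai.embeddings(batch_size=64, dimensions=256)
```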
@@ -345,18 +340,18 @@
         """
         return self.embeddings_with_cache(
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
         )
 
     def task_with_cache(
         self,
         task: PreparedTask[ResponseFormat],
         cache: BatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache.
 
         This mirrors ``responses_with_cache`` but uses the task's stored instructions,
-        response format,
+        response format, and API parameters. A supplied ``BatchingMapProxy`` enables
         cross‑operation deduplicated reuse and external batch size / progress control.
 
         Example:
@@ -370,10 +365,9 @@
             task (PreparedTask): Prepared task (instructions + response_format + sampling params).
             cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
 
-
-
-
-            (``model``, system instructions, user input) are managed internally and cannot be overridden.
+        Note:
+            The task's stored API parameters are used. Core routing keys (``model``, system
+            instructions, user input) are managed internally and cannot be overridden.
 
         Returns:
             pandas.Series: Task results aligned with the original Series index.
@@ -384,17 +378,15 @@
             system_message=task.instructions,
             response_format=task.response_format,
             cache=cache,
-            temperature=task.temperature,
-            top_p=task.top_p,
+            api_kwargs=task.api_kwargs,
         )
-        return pd.Series(client.parse(self._obj.tolist()
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
 
     def task(
         self,
         task: PreparedTask,
         batch_size: int | None = None,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on every Series element.
 
@@ -426,10 +418,9 @@
                 optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
-
-
-            ``
-            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+        Note:
+            The task's stored API parameters are used. Core batching / routing keys
+            (``model``, ``instructions`` / system message, user ``input``) are managed by the
             library and cannot be overridden.
 
         Returns:
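Conversely, the `task` / `task_with_cache` hunks above delete `**api_kwargs` from their signatures: a `PreparedTask` now carries its own API parameters (`api_kwargs=task.api_kwargs` in the client construction). A sketch of the resulting call shape, reusing the `schema.task` object that `infer_schema` produces later in this diff:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

reviews = pd.Series(["5 stars, fast shipping", "2 stars, arrived broken"])

schema = reviews.ai.infer_schema("Extract rating and shipping feedback")
# Only batching knobs remain on task(); sampling comes from the task itself
results = reviews.ai.task(schema.task, batch_size=32, show_progress=True)
```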
@@ -438,148 +429,204 @@
         return self.task_with_cache(
             task=task,
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
-            **api_kwargs,
         )
 
     def parse_with_cache(
         self,
         instructions: str,
         cache: BatchingMapProxy[str, ResponseFormat],
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
         """Parse Series values using an LLM with a provided cache.
-
-
+
+        This method allows external control over caching behavior while parsing
+        Series content into structured data. If no response format is provided,
+        the method automatically infers an appropriate schema by analyzing the
+        data patterns.
+
         Args:
-            instructions (str):
-
-
-
-
-
-
-
-
-
-
-
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract customer information including name
+                and contact details"). This guides both the extraction process
+                and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat] | None, optional): Target structure
+                for the parsed data. Can be a Pydantic model class, built-in type
+                (str, int, float, bool, list, dict), or None. If None, the method
+                infers an appropriate schema based on the instructions and data.
+                Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring the schema. Only used when response_format
+                is None. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.) forwarded to
+                the underlying API calls.
+
         Returns:
-            pandas.Series: Series
-
+            pandas.Series: Series containing parsed structured data. Each value
+                is an instance of the specified response_format or the inferred
+                schema model, aligned with the original Series index.
         """
 
         schema: InferredSchema | None = None
         if response_format is None:
-            schema = self.infer_schema(
+            schema = self.infer_schema(instructions=instructions, max_examples=max_examples, **api_kwargs)
 
         return self.responses_with_cache(
             instructions=schema.inference_prompt if schema else instructions,
             cache=cache,
             response_format=response_format or schema.model,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
     def parse(
         self,
         instructions: str,
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
         batch_size: int | None = None,
         show_progress: bool = False,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse Series values using an LLM
+        """Parse Series values into structured data using an LLM.
 
-        This method
-
+        This method extracts structured information from unstructured text in
+        the Series. When no response format is provided, it automatically
+        infers an appropriate schema by analyzing patterns in the data.
 
         Args:
-            instructions (str):
-
-
-
-
-
-
-
-
-
-
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract product details including price,
+                category, and availability"). This guides both the extraction
+                process and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for the parsed data. Can be a Pydantic model class,
+                built-in type (str, int, float, bool, list, dict), or None.
+                If None, automatically infers a schema. Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring schema. Only used when response_format
+                is None. Defaults to 100.
+            batch_size (int | None, optional): Number of requests to process
+                per batch. None enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Display progress bar in Jupyter
+                notebooks. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
 
         Returns:
-            pandas.Series: Series
-
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model.
+
+        Example:
+            ```python
+            # With explicit schema
+            from pydantic import BaseModel
+            class Product(BaseModel):
+                name: str
+                price: float
+                in_stock: bool
+
+            descriptions = pd.Series([
+                "iPhone 15 Pro - $999, available now",
+                "Samsung Galaxy S24 - $899, out of stock"
+            ])
+            products = descriptions.ai.parse(
+                "Extract product information",
+                response_format=Product
+            )
+
+            # With automatic schema inference
+            reviews = pd.Series([
+                "Great product! 5 stars. Fast shipping.",
+                "Poor quality. 2 stars. Slow delivery."
+            ])
+            parsed = reviews.ai.parse(
+                "Extract review rating and shipping feedback"
+            )
+            ```
         """
         return self.parse_with_cache(
             instructions=instructions,
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
-    def infer_schema(self,
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> InferredSchema:
         """Infer a structured data schema from Series content using AI.
 
-        This method analyzes a sample of
-        a
-
-
+        This method analyzes a sample of Series values to automatically generate
+        a Pydantic model that captures the relevant information structure. The
+        inferred schema supports both flat and hierarchical (nested) structures,
+        making it suitable for complex data extraction tasks.
 
         Args:
-
-
-
-
-            max_examples (int): Maximum number of
-
-                limit.
+            instructions (str): Plain language description of the extraction goal
+                (e.g., "Extract customer information for CRM system", "Parse
+                event details for calendar integration"). This guides which
+                fields to include and their purpose.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze for pattern detection. The method samples randomly up
+                to this limit. Higher values may improve schema quality but
+                increase inference time. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters for fine-tuning
+                the inference process.
 
         Returns:
-            InferredSchema:
-                -
-                - fields:
-
-                -
-                -
+            InferredSchema: A comprehensive schema object containing:
+                - instructions: Refined extraction objective statement
+                - fields: Hierarchical field specifications with names, types,
+                  descriptions, and nested structures where applicable
+                - inference_prompt: Optimized prompt for consistent extraction
+                - model: Dynamically generated Pydantic model class supporting
+                  both flat and nested structures
+                - task: PreparedTask configured for batch extraction using
+                  the inferred schema
 
         Example:
             ```python
+            # Simple flat structure
             reviews = pd.Series([
-                "Great product
-                "
-                "Average product. Price is fair but nothing special."
+                "5 stars! Great product, fast shipping to NYC.",
+                "2 stars. Product broke, slow delivery to LA."
             ])
-
-            # Infer schema for sentiment analysis
             schema = reviews.ai.infer_schema(
-
+                "Extract review ratings and shipping information"
+            )
+
+            # Hierarchical structure
+            orders = pd.Series([
+                "Order #123: John Doe, 123 Main St, NYC. Items: iPhone ($999), Case ($29)",
+                "Order #456: Jane Smith, 456 Oak Ave, LA. Items: iPad ($799)"
+            ])
+            schema = orders.ai.infer_schema(
+                "Extract order details including customer and items"
             )
+            # Inferred schema may include nested structures like:
+            # - customer: {name: str, address: str, city: str}
+            # - items: [{product: str, price: float}]
 
-            #
-            extracted =
+            # Apply the schema for extraction
+            extracted = orders.ai.task(schema.task)
             ```
 
         Note:
-            The
-
-
-
+            The inference process uses multiple AI iterations to ensure schema
+            validity. Nested structures are automatically detected when the
+            data contains hierarchical relationships. The generated Pydantic
+            model ensures type safety and validation for all extracted data.
         """
         inferer = CONTAINER.resolve(SchemaInferer)
 
         input: SchemaInferenceInput = SchemaInferenceInput(
-            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
+            instructions=instructions,
+            **api_kwargs,
         )
         return inferer.infer_schema(input)
 
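The `parse_with_cache` body above shows how inference composes: when `response_format` is `None`, the inferred schema supplies both the prompt (`schema.inference_prompt`) and the response format (`schema.model`). The same two attributes can be used directly, as this sketch illustrates (hypothetical data):

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

reviews = pd.Series(["5 stars! Fast shipping.", "2 stars. Slow delivery."])

schema = reviews.ai.infer_schema("Extract rating and shipping feedback")
Model = schema.model            # generated Pydantic class
print(schema.inference_prompt)  # reusable extraction prompt

# Equivalent to reviews.ai.parse(...) with the inferred schema
parsed = reviews.ai.responses(schema.inference_prompt, response_format=Model)
```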
@@ -642,8 +689,6 @@ class OpenAIVecDataFrameAccessor:
         instructions: str,
         cache: BatchingMapProxy[str, ResponseFormat],
         response_format: type[ResponseFormat] = str,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
         """Generate a response for each row after serializing it to JSON using a provided cache.
@@ -677,8 +722,7 @@
                 Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
             pandas.Series: Responses aligned with the DataFrame's original index.
@@ -687,8 +731,6 @@
             instructions=instructions,
             cache=cache,
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
@@ -697,8 +739,6 @@
         instructions: str,
         response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
@@ -730,9 +770,8 @@
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
             pandas.Series: Responses aligned with the DataFrame's original index.
@@ -741,8 +780,6 @@
             instructions=instructions,
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
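For the DataFrame accessor, each row is serialized to JSON before the request (per the docstrings above), so the instruction addresses a whole record. A sketch with hypothetical data:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

df = pd.DataFrame({"name": ["iPhone 15", "Galaxy S24"], "price": [999, 899]})

# One response per JSON-serialized row; temperature rides in **api_kwargs
pitches = df.ai.responses(
    "Write a one-line sales pitch for this record",
    batch_size=16,
    temperature=0.7,
)
```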
@@ -750,7 +787,6 @@
         self,
         task: PreparedTask[ResponseFormat],
         cache: BatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
 
@@ -758,9 +794,8 @@
             task (PreparedTask): Prepared task (instructions + response_format + sampling params).
             cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
 
-
-
-            ``seed``) forwarded verbatim. Core routing keys are managed internally.
+        Note:
+            The task's stored API parameters are used. Core routing keys are managed internally.
 
         Returns:
             pandas.Series: Task results aligned with the DataFrame's original index.
@@ -768,7 +803,6 @@
         return _df_rows_to_json_series(self._obj).ai.task_with_cache(
             task=task,
             cache=cache,
-            **api_kwargs,
         )
 
     def task(
@@ -776,7 +810,6 @@
         task: PreparedTask,
         batch_size: int | None = None,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serializing it to JSON.
 
@@ -812,9 +845,8 @@
                 optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
-
-
-            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+        Note:
+            The task's stored API parameters are used. Core batching / routing
             keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
             library and cannot be overridden.
 
@@ -826,99 +858,108 @@
             task=task,
             batch_size=batch_size,
             show_progress=show_progress,
-            **api_kwargs,
         )
 
     def parse_with_cache(
         self,
         instructions: str,
         cache: BatchingMapProxy[str, ResponseFormat],
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse DataFrame rows using an LLM with a provided cache.
+        """Parse DataFrame rows into structured data using an LLM with a provided cache.
 
-        This method
-
-
+        This method processes each DataFrame row (converted to JSON) and extracts
+        structured information using an LLM. External cache control enables
+        deduplication across operations and custom batch management.
 
         Args:
-            instructions (str):
-
-
-
-            for
-
-
-
-
-
-
-
-
+            instructions (str): Plain language description of what information
+                to extract from each row (e.g., "Extract shipping details and
+                order status"). Guides both extraction and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze when inferring
+                schema (only used when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
 
         Returns:
-            pandas.Series: Series
-
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model, indexed like
+                the original DataFrame.
         """
         return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
             instructions=instructions,
             cache=cache,
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
     def parse(
         self,
         instructions: str,
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
         batch_size: int | None = None,
         show_progress: bool = False,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse DataFrame rows using an LLM
+        """Parse DataFrame rows into structured data using an LLM.
 
-
-
-
+        Each row is converted to JSON and processed to extract structured
+        information. When no response format is provided, the method
+        automatically infers an appropriate schema from the data.
 
         Args:
-            instructions (str):
-
-
-
-
-
-
-
-
-
-
+            instructions (str): Plain language description of extraction goals
+                (e.g., "Extract transaction details including amount, date,
+                and merchant"). Guides extraction and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            batch_size (int | None, optional): Rows per API batch. None
+                enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Show progress bar in Jupyter
+                notebooks. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters.
 
         Returns:
-            pandas.Series:
-
+            pandas.Series: Parsed structured data indexed like the original
+                DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'log': [
+                    '2024-01-01 10:00 ERROR Database connection failed',
+                    '2024-01-01 10:05 INFO Service started successfully'
+                ]
+            })
+
+            # With automatic schema inference
+            parsed = df.ai.parse("Extract timestamp, level, and message")
+            # Returns Series with inferred structure like:
+            # {timestamp: str, level: str, message: str}
+            ```
         """
         return self.parse_with_cache(
             instructions=instructions,
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
-    def infer_schema(self,
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> InferredSchema:
         """Infer a structured data schema from DataFrame rows using AI.
 
         This method analyzes a sample of DataFrame rows to automatically infer
@@ -927,7 +968,7 @@
         field types, and potential categorical values.
 
         Args:
-
+            instructions (str): Plain language description of how the extracted
                 structured data will be used (e.g., "Extract operational metrics
                 for dashboard", "Parse customer attributes for segmentation").
                 This guides field relevance and helps exclude irrelevant information.
@@ -937,7 +978,7 @@
 
         Returns:
             InferredSchema: An object containing:
-                -
+                - instructions: Normalized statement of the extraction objective
                 - fields: List of field specifications with names, types, and descriptions
                 - inference_prompt: Reusable prompt for future extractions
                 - model: Dynamically generated Pydantic model for parsing
@@ -956,7 +997,7 @@
 
             # Infer schema for logistics tracking
             schema = df.ai.infer_schema(
-
+                instructions="Extract shipping status and location data for logistics tracking"
             )
 
             # Apply the schema to extract structured data
@@ -964,14 +1005,15 @@
             ```
 
         Note:
-
-
-
-
+            Each row is converted to JSON before analysis. The inference
+            process automatically detects hierarchical relationships and
+            creates appropriate nested structures when present. The generated
+            Pydantic model ensures type safety and validation.
         """
         return _df_rows_to_json_series(self._obj).ai.infer_schema(
-
+            instructions=instructions,
             max_examples=max_examples,
+            **api_kwargs,
         )
 
     def extract(self, column: str) -> pd.DataFrame:
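The context line above introduces `extract(self, column: str)`, which pairs naturally with `parse`: parse rows into model instances, then expand that column into flat columns. A sketch; the expansion behavior of `extract` is assumed from its signature, not shown in this diff:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

df = pd.DataFrame({"log": [
    "2024-01-01 10:00 ERROR Database connection failed",
    "2024-01-01 10:05 INFO Service started successfully",
]})

df["parsed"] = df.ai.parse("Extract timestamp, level, and message")
flat = df.ai.extract("parsed")  # assumed: expands the model fields into columns
```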
@@ -1012,7 +1054,6 @@
         max_examples: int = 500,
         batch_size: int | None = None,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference.
 
@@ -1032,10 +1073,6 @@
                 optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying task execution.
-
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
                 column. The original DataFrame is not modified.
@@ -1067,7 +1104,7 @@
             return self._obj
 
         filled_values: list[FillNaResponse] = missing_rows.ai.task(
-            task=task, batch_size=batch_size, show_progress=show_progress
+            task=task, batch_size=batch_size, show_progress=show_progress
         )
 
         # get deep copy of the DataFrame to avoid modifying the original
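The `fillna` hunks drop `**api_kwargs` entirely, leaving only `max_examples`, `batch_size`, and `show_progress`; the task executed internally now supplies its own API parameters. The target-column argument sits outside the visible hunks, so its position below is assumed:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .ai accessor (import path assumed)

df = pd.DataFrame({
    "brand": ["Apple", "Samsung", None],
    "product": ["iPhone 15", "Galaxy S24", "Pixel 8"],
})

# Returns a new DataFrame; the original is not modified (per the docstring)
filled = df.ai.fillna("brand", batch_size=16, show_progress=True)  # column arg assumed
```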
@@ -1127,8 +1164,6 @@ class AsyncOpenAIVecSeriesAccessor:
         instructions: str,
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
         response_format: type[ResponseFormat] = str,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element using a provided cache (asynchronously).
@@ -1155,13 +1190,11 @@
                 Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            temperature (float | None, optional): Sampling temperature. ``None`` omits the
-                parameter (recommended for reasoning models). Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             **api_kwargs: Additional keyword arguments forwarded verbatim to
-                ``AsyncOpenAI.responses.parse`` (e.g. ``
-                future parameters). Core batching keys
-                text_format) are protected and silently
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
@@ -1175,14 +1208,10 @@
             system_message=instructions,
             response_format=response_format,
             cache=cache,
-            temperature=temperature,
-            top_p=top_p,
+            api_kwargs=api_kwargs,
         )
 
-
-        proxy_params = {"show_progress", "batch_size", "max_concurrency"}
-        filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
-        results = await client.parse(self._obj.tolist(), **filtered_kwargs)
+        results = await client.parse(self._obj.tolist())
         return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
     async def responses(
@@ -1190,8 +1219,6 @@
         instructions: str,
         response_format: type[ResponseFormat] = str,
         batch_size: int | None = None,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         max_concurrency: int = 8,
         show_progress: bool = False,
         **api_kwargs,
@@ -1221,11 +1248,14 @@
             batch_size (int | None, optional): Number of prompts grouped into a single
                 request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
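The async accessor mirrors the sync one but adds `max_concurrency` and must be awaited; `**api_kwargs` is forwarded to `AsyncOpenAI.responses.parse` with the batching keys protected, as documented above. A sketch (the `.aio` accessor name is taken from the `parse` example later in this diff):

```python
import asyncio
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .aio accessor (import path assumed)

animals = pd.Series(["panda", "koala", "elephant"])

async def main() -> pd.Series:
    # temperature passes through **api_kwargs to AsyncOpenAI.responses.parse
    return await animals.aio.responses(
        "translate to French", max_concurrency=4, temperature=0.5
    )

translated = asyncio.run(main())
```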
@@ -1239,14 +1269,13 @@
                 batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
             ),
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
     async def embeddings_with_cache(
         self,
         cache: AsyncBatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
     ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
 
@@ -1274,6 +1303,7 @@
             cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
                 instance for managing API call batching and deduplication.
                 Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -1286,6 +1316,7 @@
             client=CONTAINER.resolve(AsyncOpenAI),
             model_name=CONTAINER.resolve(EmbeddingsModelName).value,
             cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         # Await the async operation
@@ -1298,7 +1329,7 @@
         )
 
     async def embeddings(
-        self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
+        self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False, **api_kwargs
     ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element (asynchronously).
 
@@ -1324,6 +1355,7 @@
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
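Async embeddings gain the same `**api_kwargs` pass-through, again with `dimensions` as the documented example. Sketch:

```python
import asyncio
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .aio accessor (import path assumed)

texts = pd.Series(["cat", "dog"])

async def main() -> pd.Series:
    return await texts.aio.embeddings(batch_size=128, max_concurrency=8, dimensions=512)

vectors = asyncio.run(main())
```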
@@ -1336,13 +1368,13 @@
             cache=AsyncBatchingMapProxy(
                 batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
             ),
+            **api_kwargs,
         )
 
     async def task_with_cache(
         self,
         task: PreparedTask[ResponseFormat],
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache (asynchronously).
 
@@ -1393,11 +1425,9 @@
             system_message=task.instructions,
             response_format=task.response_format,
             cache=cache,
-            temperature=task.temperature,
-            top_p=task.top_p,
+            api_kwargs=task.api_kwargs,
         )
-
-        results = await client.parse(self._obj.tolist(), **api_kwargs)
+        results = await client.parse(self._obj.tolist())
 
         return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
@@ -1407,7 +1437,6 @@
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on every Series element (asynchronously).
 
@@ -1442,9 +1471,8 @@
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
-
-
-            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+        Note:
+            The task's stored API parameters are used. Core batching / routing
             keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
             library and cannot be overridden.
 
@@ -1460,42 +1488,39 @@
             cache=AsyncBatchingMapProxy(
                 batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
             ),
-            **api_kwargs,
         )
 
     async def parse_with_cache(
         self,
         instructions: str,
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse Series values using an LLM with a provided cache (asynchronously).
+        """Parse Series values into structured data using an LLM with a provided cache (asynchronously).
 
-        This method
-
+        This async method provides external cache control while parsing Series
+        content into structured data. Automatic schema inference is performed
+        when no response format is specified.
 
         Args:
-            instructions (str):
-
-
-
-            for
-
-
-
-
-
-
-            `seed`, etc.) are forwarded verbatim to the underlying client.
+            instructions (str): Plain language description of what to extract
+                (e.g., "Extract dates, amounts, and descriptions from receipts").
+                Guides both extraction and schema inference.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for managing concurrent API calls and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum values to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters.
 
         Returns:
-            pandas.Series: Series
-
+            pandas.Series: Series containing parsed structured data aligned
+                with the original index.
 
         Note:
             This is an asynchronous method and must be awaited.
@@ -1503,51 +1528,59 @@
         schema: InferredSchema | None = None
         if response_format is None:
             # Use synchronous schema inference
-            schema = self._obj.ai.infer_schema(
+            schema = self._obj.ai.infer_schema(instructions=instructions, max_examples=max_examples)
 
         return await self.responses_with_cache(
             instructions=schema.inference_prompt if schema else instructions,
             cache=cache,
             response_format=response_format or schema.model,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
     async def parse(
         self,
         instructions: str,
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse Series values using an LLM
+        """Parse Series values into structured data using an LLM (asynchronously).
 
-
-
+        Async version of the parse method, extracting structured information
+        from unstructured text with automatic schema inference when needed.
 
         Args:
-            instructions (str):
-
-
-
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                product names, prices, and categories from descriptions").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum values for schema inference.
                 Defaults to 100.
-            batch_size (int | None):
-                Defaults to None
-            max_concurrency (int): Maximum
-
-
-
-            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+            batch_size (int | None, optional): Requests per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent API requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters.
 
         Returns:
-            pandas.Series:
-
+            pandas.Series: Parsed structured data indexed like the original Series.
+
+        Example:
+            ```python
+            emails = pd.Series([
+                "Meeting tomorrow at 3pm with John about Q4 planning",
+                "Lunch with Sarah on Friday to discuss new project"
+            ])
+
+            # Async extraction with schema inference
+            parsed = await emails.aio.parse(
+                "Extract meeting details including time, person, and topic"
+            )
+            ```
 
         Note:
             This is an asynchronous method and must be awaited.
@@ -1559,8 +1592,6 @@
             ),
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )
 
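One asymmetry worth noting in the async `parse_with_cache` hunk above: schema inference still runs synchronously (`self._obj.ai.infer_schema(...)`) before the awaited responses call, and that inference call does not receive `**api_kwargs`. A usage sketch of the awaited path, following the meeting example from the docstring:

```python
import asyncio
import pandas as pd
from openaivec import pandas_ext  # noqa: F401 -- registers the .aio accessor (import path assumed)

emails = pd.Series([
    "Meeting tomorrow at 3pm with John about Q4 planning",
    "Lunch with Sarah on Friday to discuss new project",
])

async def main() -> pd.Series:
    return await emails.aio.parse(
        "Extract meeting details including time, person, and topic",
        max_concurrency=4,
    )

parsed = asyncio.run(main())
```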
@@ -1577,8 +1608,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1577
1608
|
instructions: str,
|
|
1578
1609
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1579
1610
|
response_format: type[ResponseFormat] = str,
|
|
1580
|
-
temperature: float | None = 0.0,
|
|
1581
|
-
top_p: float = 1.0,
|
|
1582
1611
|
**api_kwargs,
|
|
1583
1612
|
) -> pd.Series:
|
|
1584
1613
|
"""Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
|
|
@@ -1614,8 +1643,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1614
1643
|
Set cache.batch_size=None to enable automatic batch size optimization.
|
|
1615
1644
|
response_format (type[ResponseFormat], optional): Desired Python type of the
|
|
1616
1645
|
responses. Defaults to ``str``.
|
|
1617
|
-
|
|
1618
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1646
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
1619
1647
|
|
|
1620
1648
|
Returns:
|
|
1621
1649
|
pandas.Series: Responses aligned with the DataFrame's original index.
|
|
@@ -1628,8 +1656,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1628
1656
|
instructions=instructions,
|
|
1629
1657
|
cache=cache,
|
|
1630
1658
|
response_format=response_format,
|
|
1631
|
-
temperature=temperature,
|
|
1632
|
-
top_p=top_p,
|
|
1633
1659
|
**api_kwargs,
|
|
1634
1660
|
)
|
|
1635
1661
|
|
|
@@ -1638,8 +1664,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1638
1664
|
instructions: str,
|
|
1639
1665
|
response_format: type[ResponseFormat] = str,
|
|
1640
1666
|
batch_size: int | None = None,
|
|
1641
|
-
temperature: float | None = 0.0,
|
|
1642
|
-
top_p: float = 1.0,
|
|
1643
1667
|
max_concurrency: int = 8,
|
|
1644
1668
|
show_progress: bool = False,
|
|
1645
1669
|
**api_kwargs,
|
|
@@ -1673,8 +1697,7 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1673
1697
|
batch_size (int | None, optional): Number of requests sent in one batch.
|
|
1674
1698
|
Defaults to ``None`` (automatic batch size optimization
|
|
1675
1699
|
based on execution time). Set to a positive integer for fixed batch size.
|
|
1676
|
-
|
|
1677
|
-
top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
|
|
1700
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
1678
1701
|
max_concurrency (int, optional): Maximum number of concurrent
|
|
1679
1702
|
requests. Defaults to ``8``.
|
|
1680
1703
|
show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
|
|
@@ -1691,8 +1714,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1691
1714
|
batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
|
|
1692
1715
|
),
|
|
1693
1716
|
response_format=response_format,
|
|
1694
|
-
temperature=temperature,
|
|
1695
|
-
top_p=top_p,
|
|
1696
1717
|
**api_kwargs,
|
|
1697
1718
|
)
|
|
1698
1719
|
|
|
@@ -1700,7 +1721,6 @@ class AsyncOpenAIVecDataFrameAccessor:
|
|
|
1700
1721
|
self,
|
|
1701
1722
|
task: PreparedTask[ResponseFormat],
|
|
1702
1723
|
cache: AsyncBatchingMapProxy[str, ResponseFormat],
|
|
1703
|
-
**api_kwargs,
|
|
1704
1724
|
) -> pd.Series:
|
|
1705
1725
|
"""Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
|
|
1706
1726
|
|
|
@@ -1710,8 +1730,8 @@ class AsyncOpenAIVecDataFrameAccessor:
             task (PreparedTask): Prepared task (instructions + response_format + sampling params).
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.

-
-
+        Note:
+            The task's stored API parameters are used. Core routing keys are managed internally.

         Returns:
             pandas.Series: Task results aligned with the DataFrame's original index.
@@ -1722,7 +1742,6 @@ class AsyncOpenAIVecDataFrameAccessor:
         return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
             task=task,
             cache=cache,
-            **api_kwargs,
         )

     async def task(
@@ -1731,7 +1750,6 @@ class AsyncOpenAIVecDataFrameAccessor:
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).

@@ -1770,9 +1788,8 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+        Note:
+            The task's stored API parameters are used. Core batching / routing
             keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
             library and cannot be overridden.

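For the prepared-task path the change is stricter: the `**api_kwargs` slot is deleted from both `task` and `task_with_cache`, so per-call API parameters are no longer accepted at all; whatever the `PreparedTask` was built with is what runs. A sketch of the call-site difference, with `my_task` standing in for any `PreparedTask` (its construction is outside this diff):

```python
import asyncio

import pandas as pd
import openaivec.pandas_ext  # noqa: F401  # assumption: importing registers the .aio accessor

async def main() -> None:
    df = pd.DataFrame({"text": ["great battery life", "arrived broken"]})
    my_task = ...  # stand-in: obtain a PreparedTask (e.g. a bundled openaivec.task preset)

    results = await df.aio.task(
        task=my_task,
        batch_size=None,     # None enables automatic batch size optimization
        max_concurrency=8,
        show_progress=False,
        # temperature=0.2,   # accepted (and forwarded) in 0.14.10; in 0.14.13
        #                    # this raises TypeError because **api_kwargs is gone
        #                    # and the task's stored parameters are used instead
    )
    print(results)

asyncio.run(main())
```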
@@ -1789,43 +1806,34 @@ class AsyncOpenAIVecDataFrameAccessor:
             batch_size=batch_size,
             max_concurrency=max_concurrency,
             show_progress=show_progress,
-            **api_kwargs,
         )

     async def parse_with_cache(
         self,
         instructions: str,
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse DataFrame rows using an LLM with
+        """Parse DataFrame rows into structured data using an LLM with cache (asynchronously).

-
-
-        on the provided purpose.
+        Async method for parsing DataFrame rows (as JSON) with external cache
+        control, enabling deduplication across operations and concurrent processing.

         Args:
-            instructions (str):
-
-
-
-
-
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                invoice details including items, quantities, and totals").
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for concurrent API call management.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
                 Defaults to 100.
-            temperature (float | None): Sampling temperature. Defaults to 0.0.
-            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
-
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
-            `seed`, etc.) are forwarded verbatim to the underlying client.
+            **api_kwargs: Additional OpenAI API parameters.

         Returns:
-            pandas.Series:
-                `response_format` or inferred schema model.
+            pandas.Series: Parsed structured data indexed like the original DataFrame.

         Note:
             This is an asynchronous method and must be awaited.
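The `*_with_cache` variants leave cache construction to the caller, which is what makes cross-call deduplication possible: two calls that share one cache pay only once for identical serialized rows. A rough sketch under stated assumptions (the `AsyncBatchingMapProxy` import path is a guess, since the diff only shows the type name in annotations, and its constructor arguments are inferred from the wrapper above that builds one from `batch_size` / `max_concurrency` / `show_progress`):

```python
import asyncio

import pandas as pd
import openaivec.pandas_ext  # noqa: F401  # assumption: importing registers the .aio accessor

# Assumed import path; not confirmed by this diff.
from openaivec._responses import AsyncBatchingMapProxy

async def main() -> None:
    # Constructor kwargs inferred from the non-cache wrapper shown above.
    cache = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8, show_progress=False)

    df_a = pd.DataFrame({"city": ["Paris", "Tokyo"]})
    df_b = pd.DataFrame({"city": ["Tokyo", "Lima"]})  # overlaps df_a on "Tokyo"

    first = await df_a.aio.parse_with_cache("Extract country and continent", cache=cache)
    # The shared cache deduplicates the overlapping serialized row, so this
    # second call reuses the cached result for "Tokyo" instead of re-querying.
    second = await df_b.aio.parse_with_cache("Extract country and continent", cache=cache)
    print(first, second, sep="\n")

asyncio.run(main())
```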
@@ -1835,46 +1843,55 @@ class AsyncOpenAIVecDataFrameAccessor:
             cache=cache,
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )

     async def parse(
         self,
         instructions: str,
-        response_format: ResponseFormat = None,
+        response_format: type[ResponseFormat] | None = None,
         max_examples: int = 100,
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Parse DataFrame rows using an LLM
+        """Parse DataFrame rows into structured data using an LLM (asynchronously).

-
-
-        on the provided purpose.
+        Async version for extracting structured information from DataFrame rows,
+        with automatic schema inference when no format is specified.

         Args:
-            instructions (str):
-
-
-
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                customer details, order items, and payment information").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
                 Defaults to 100.
-            batch_size (int | None):
-                Defaults to None
-            max_concurrency (int): Maximum
-
-
-            temperature (float | None): Sampling temperature. Defaults to 0.0.
-            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+            batch_size (int | None, optional): Rows per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters.

         Returns:
-            pandas.Series:
-
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'raw_data': [
+                    'Customer: John Doe, Order: 2 laptops @ $1200 each',
+                    'Customer: Jane Smith, Order: 5 phones @ $800 each'
+                ]
+            })
+
+            # Async parsing with automatic schema inference
+            parsed = await df.aio.parse(
+                "Extract customer name, product, quantity, and unit price"
+            )
+            ```

         Note:
             This is an asynchronous method and must be awaited.
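The signature fix in these two hunks (`response_format: ResponseFormat = None` becoming `type[ResponseFormat] | None = None`) also clarifies usage: callers pass a model class, not an instance, to skip schema inference. A sketch complementing the docstring example above, with a hypothetical Pydantic model (field names are illustrative, and the accessor-registration import is an assumption):

```python
import asyncio

import pandas as pd
from pydantic import BaseModel
import openaivec.pandas_ext  # noqa: F401  # assumption: importing registers the .aio accessor

# Hypothetical target structure; any Pydantic model class should do.
class OrderLine(BaseModel):
    customer: str
    product: str
    quantity: int
    unit_price: float

async def main() -> None:
    df = pd.DataFrame({
        "raw_data": [
            "Customer: John Doe, Order: 2 laptops @ $1200 each",
            "Customer: Jane Smith, Order: 5 phones @ $800 each",
        ]
    })
    # Passing the class itself (matching the corrected type hint) disables
    # automatic schema inference and parses rows straight into OrderLine.
    parsed = await df.aio.parse(
        "Extract customer name, product, quantity, and unit price",
        response_format=OrderLine,
    )
    print(parsed)

asyncio.run(main())
```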
@@ -1886,8 +1903,6 @@ class AsyncOpenAIVecDataFrameAccessor:
             ),
             response_format=response_format,
             max_examples=max_examples,
-            temperature=temperature,
-            top_p=top_p,
             **api_kwargs,
         )

@@ -1991,7 +2006,6 @@ class AsyncOpenAIVecDataFrameAccessor:
         batch_size: int | None = None,
         max_concurrency: int = 8,
         show_progress: bool = False,
-        **api_kwargs,
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).

@@ -2013,10 +2027,6 @@ class AsyncOpenAIVecDataFrameAccessor:
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
-            ``seed``, etc.) are forwarded verbatim to the underlying task execution.
-
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
                 column. The original DataFrame is not modified.
@@ -2054,7 +2064,10 @@ class AsyncOpenAIVecDataFrameAccessor:
             return self._obj

         filled_values: list[FillNaResponse] = await missing_rows.aio.task(
-            task=task,
+            task=task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
         )

         # get deep copy of the DataFrame to avoid modifying the original
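This last hunk is a behavioral fix rather than a signature cleanup: in 0.14.10, `fillna` accepted `batch_size`, `max_concurrency`, and `show_progress` but never forwarded them to the underlying `task` call, so they were silently ignored; from 0.14.13 they take effect. A usage sketch (the target-column argument shape is assumed from the docstring, not shown in this diff):

```python
import asyncio

import pandas as pd
import openaivec.pandas_ext  # noqa: F401  # assumption: importing registers the .aio accessor

async def main() -> None:
    df = pd.DataFrame({
        "brand": ["Apple", "Apple", "Sony"],
        "product": ["iPhone 15", None, "WH-1000XM5"],
    })
    # These knobs only reach the underlying task call as of 0.14.13; on
    # 0.14.10 they were accepted by fillna but had no effect.
    filled = await df.aio.fillna(
        "product",          # assumed: the target column to fill
        batch_size=None,    # None = automatic batch size optimization
        max_concurrency=8,
        show_progress=True,
    )
    print(filled)

asyncio.run(main())
```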