openaivec 0.14.3__py3-none-any.whl → 0.14.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
openaivec/pandas_ext.py CHANGED
@@ -49,6 +49,8 @@ import pandas as pd
  import tiktoken
  from openai import AsyncOpenAI, OpenAI
 
+ from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
+
  __all__ = [
  "embeddings_model",
  "responses_model",
@@ -182,6 +184,27 @@ class OpenAIVecSeriesAccessor:
  top_p: float = 1.0,
  **api_kwargs,
  ) -> pd.Series:
+ """Call an LLM once for every Series element using a provided cache.
+
+ This is a lower-level method that allows explicit cache management for advanced
+ use cases. Most users should use the standard ``responses`` method instead.
+
+ Args:
+ instructions (str): System prompt prepended to every user message.
+ cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+ batching and deduplication control.
+ response_format (Type[ResponseFormat], optional): Pydantic model or built-in
+ type the assistant should return. Defaults to ``str``.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+ top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+ Additional Keyword Args:
+ Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+ ``seed``, etc.) are forwarded verbatim to the underlying client.
+
+ Returns:
+ pandas.Series: Series whose values are instances of ``response_format``.
+ """
  client: BatchResponses = BatchResponses(
  client=CONTAINER.resolve(OpenAI),
  model_name=CONTAINER.resolve(ResponsesModelName).value,
@@ -195,6 +218,56 @@ class OpenAIVecSeriesAccessor:
  # Forward any extra kwargs to the underlying Responses API.
  return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
 
+ def responses(
+ self,
+ instructions: str,
+ response_format: Type[ResponseFormat] = str,
+ batch_size: int | None = None,
+ temperature: float | None = 0.0,
+ top_p: float = 1.0,
+ show_progress: bool = False,
+ **api_kwargs,
+ ) -> pd.Series:
+ """Call an LLM once for every Series element.
+
+ Example:
+ ```python
+ animals = pd.Series(["cat", "dog", "elephant"])
+ # Basic usage
+ animals.ai.responses("translate to French")
+
+ # With progress bar in Jupyter notebooks
+ large_series = pd.Series(["data"] * 1000)
+ large_series.ai.responses(
+ "analyze this data",
+ batch_size=32,
+ show_progress=True
+ )
+ ```
+
+ Args:
+ instructions (str): System prompt prepended to every user message.
+ response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+ type the assistant should return. Defaults to ``str``.
+ batch_size (int | None, optional): Number of prompts grouped into a single
+ request. Defaults to ``None`` (automatic batch size optimization
+ based on execution time). Set to a positive integer for fixed batch size.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+ top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+
+ Returns:
+ pandas.Series: Series whose values are instances of ``response_format``.
+ """
+ return self.responses_with_cache(
+ instructions=instructions,
+ cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+ response_format=response_format,
+ temperature=temperature,
+ top_p=top_p,
+ **api_kwargs,
+ )
+
  def embeddings_with_cache(
  self,
  cache: BatchingMapProxy[str, np.ndarray],
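These hunks make the public `responses` accessor a thin wrapper over `responses_with_cache`. A minimal usage sketch of the documented API, assuming `openaivec` 0.14.5 is installed, `OPENAI_API_KEY` is set, and that importing `openaivec.pandas_ext` registers the `.ai` accessor (the `Translation` model below is illustrative, not part of the package):

```python
import pandas as pd
from pydantic import BaseModel
from openaivec import pandas_ext  # noqa: F401  (registers the .ai accessor)

class Translation(BaseModel):  # illustrative response_format
    french: str

animals = pd.Series(["cat", "dog", "elephant"])
# Plain-text responses; prompts are grouped into batched API calls.
fr = animals.ai.responses("Translate the animal name to French.")
# Structured responses parsed into the Pydantic model.
parsed = animals.ai.responses("Translate.", response_format=Translation)
```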
@@ -205,15 +278,6 @@ class OpenAIVecSeriesAccessor:
  a pre-configured BatchingMapProxy instance, enabling cache sharing
  across multiple operations or custom batch size management.
 
- Args:
- cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
- instance for managing API call batching and deduplication.
- Set cache.batch_size=None to enable automatic batch size optimization.
-
- Returns:
- pandas.Series: Series whose values are ``np.ndarray`` objects
- (dtype ``float32``).
-
  Example:
  ```python
  from openaivec._proxy import BatchingMapProxy
@@ -225,6 +289,15 @@ class OpenAIVecSeriesAccessor:
  animals = pd.Series(["cat", "dog", "elephant"])
  embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
  ```
+
+ Args:
+ cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+ instance for managing API call batching and deduplication.
+ Set cache.batch_size=None to enable automatic batch size optimization.
+
+ Returns:
+ pandas.Series: Series whose values are ``np.ndarray`` objects
+ (dtype ``float32``).
  """
  client: BatchEmbeddings = BatchEmbeddings(
  client=CONTAINER.resolve(OpenAI),
@@ -238,54 +311,35 @@ class OpenAIVecSeriesAccessor:
  name=self._obj.name,
  )
 
- def responses(
- self,
- instructions: str,
- response_format: Type[ResponseFormat] = str,
- batch_size: int | None = None,
- temperature: float | None = 0.0,
- top_p: float = 1.0,
- show_progress: bool = False,
- **api_kwargs,
- ) -> pd.Series:
- """Call an LLM once for every Series element.
+ def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
+ """Compute OpenAI embeddings for every Series element.
 
  Example:
  ```python
  animals = pd.Series(["cat", "dog", "elephant"])
  # Basic usage
- animals.ai.responses("translate to French")
+ animals.ai.embeddings()
 
- # With progress bar in Jupyter notebooks
- large_series = pd.Series(["data"] * 1000)
- large_series.ai.responses(
- "analyze this data",
- batch_size=32,
+ # With progress bar for large datasets
+ large_texts = pd.Series(["text"] * 5000)
+ embeddings = large_texts.ai.embeddings(
+ batch_size=100,
  show_progress=True
  )
  ```
 
  Args:
- instructions (str): System prompt prepended to every user message.
- response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
- type the assistant should return. Defaults to ``str``.
- batch_size (int | None, optional): Number of prompts grouped into a single
- request. Defaults to ``None`` (automatic batch size optimization
+ batch_size (int | None, optional): Number of inputs grouped into a
+ single request. Defaults to ``None`` (automatic batch size optimization
  based on execution time). Set to a positive integer for fixed batch size.
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
- top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
  show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
  Returns:
- pandas.Series: Series whose values are instances of ``response_format``.
+ pandas.Series: Series whose values are ``np.ndarray`` objects
+ (dtype ``float32``).
  """
- return self.responses_with_cache(
- instructions=instructions,
+ return self.embeddings_with_cache(
  cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
- response_format=response_format,
- temperature=temperature,
- top_p=top_p,
- **api_kwargs,
  )
 
  def task_with_cache(
@@ -300,6 +354,13 @@ class OpenAIVecSeriesAccessor:
  response format, temperature and top_p. A supplied ``BatchingMapProxy`` enables
  cross‑operation deduplicated reuse and external batch size / progress control.
 
+ Example:
+ ```python
+ from openaivec._proxy import BatchingMapProxy
+ shared_cache = BatchingMapProxy(batch_size=64)
+ reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+ ```
+
  Args:
  task (PreparedTask): Prepared task (instructions + response_format + sampling params).
  cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
@@ -311,13 +372,6 @@ class OpenAIVecSeriesAccessor:
 
  Returns:
  pandas.Series: Task results aligned with the original Series index.
-
- Example:
- ```python
- from openaivec._proxy import BatchingMapProxy
- shared_cache = BatchingMapProxy(batch_size=64)
- reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
- ```
  """
  client: BatchResponses = BatchResponses(
  client=CONTAINER.resolve(OpenAI),
@@ -382,36 +436,60 @@ class OpenAIVecSeriesAccessor:
  **api_kwargs,
  )
 
- def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
- """Compute OpenAI embeddings for every Series element.
+ def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+ """Infer a structured data schema from Series content using AI.
+
+ This method analyzes a sample of the Series values to automatically infer
+ a structured schema that can be used for consistent data extraction.
+ The inferred schema includes field names, types, descriptions, and
+ potential enum values based on patterns found in the data.
+
+ Args:
+ purpose (str): Plain language description of how the extracted
+ structured data will be used (e.g., "Extract customer sentiment
+ signals for analytics", "Parse product features for search").
+ This guides field relevance and helps exclude irrelevant information.
+ max_examples (int): Maximum number of examples to analyze from the
+ Series. The method will sample randomly from the Series up to this
+ limit. Defaults to 100.
+
+ Returns:
+ InferredSchema: An object containing:
+ - purpose: Normalized statement of the extraction objective
+ - fields: List of field specifications with names, types, and descriptions
+ - inference_prompt: Reusable prompt for future extractions
+ - model: Dynamically generated Pydantic model for parsing
+ - task: PreparedTask for batch extraction operations
 
  Example:
  ```python
- animals = pd.Series(["cat", "dog", "elephant"])
- # Basic usage
- animals.ai.embeddings()
+ reviews = pd.Series([
+ "Great product! Fast shipping and excellent quality.",
+ "Terrible experience. Item broke after 2 days.",
+ "Average product. Price is fair but nothing special."
+ ])
 
- # With progress bar for large datasets
- large_texts = pd.Series(["text"] * 5000)
- embeddings = large_texts.ai.embeddings(
- batch_size=100,
- show_progress=True
+ # Infer schema for sentiment analysis
+ schema = reviews.ai.infer_schema(
+ purpose="Extract sentiment and product quality indicators"
  )
- ```
 
- Args:
- batch_size (int | None, optional): Number of inputs grouped into a
- single request. Defaults to ``None`` (automatic batch size optimization
- based on execution time). Set to a positive integer for fixed batch size.
- show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+ # Use the inferred schema for batch extraction
+ extracted = reviews.ai.task(schema.task)
+ ```
 
- Returns:
- pandas.Series: Series whose values are ``np.ndarray`` objects
- (dtype ``float32``).
+ Note:
+ The schema inference uses AI to analyze patterns in the data and may
+ require multiple attempts to produce a valid schema. Fields are limited
+ to primitive types (string, integer, float, boolean) with optional
+ enum values for categorical fields.
  """
- return self.embeddings_with_cache(
- cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+ inferer = CONTAINER.resolve(SchemaInferer)
+
+ input: SchemaInferenceInput = SchemaInferenceInput(
+ examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose
  )
+ return inferer.infer_schema(input)
 
  def count_tokens(self) -> pd.Series:
  """Count `tiktoken` tokens per row.
@@ -459,45 +537,97 @@ class OpenAIVecSeriesAccessor:
  extracted.columns = [f"{self._obj.name}_{col}" for col in extracted.columns]
  return extracted
 
+ def auto_extract(
+ self,
+ purpose: str,
+ max_examples: int = 100,
+ batch_size: int | None = None,
+ show_progress: bool = False,
+ **api_kwargs,
+ ) -> pd.DataFrame:
+ """Automatically infer schema and extract structured data in one step.
 
- @pd.api.extensions.register_dataframe_accessor("ai")
- class OpenAIVecDataFrameAccessor:
- """pandas DataFrame accessor (``.ai``) that adds OpenAI helpers."""
+ This convenience method combines schema inference and data extraction into
+ a single operation. It first analyzes a sample of the Series to infer an
+ appropriate schema based on the stated purpose, then immediately applies
+ that schema to extract structured data from all values in the Series.
 
- def __init__(self, df_obj: pd.DataFrame):
- self._obj = df_obj
+ Args:
+ purpose (str): Plain language description of what information to extract
+ and how it will be used (e.g., "Extract product features for search",
+ "Parse customer feedback for sentiment analysis"). This guides both
+ schema inference and field selection.
+ max_examples (int): Maximum number of examples to use for schema inference.
+ A larger sample may produce more accurate schemas but increases
+ inference time. Defaults to 100.
+ batch_size (int | None): Number of requests to process in parallel during
+ extraction. Defaults to None (automatic optimization). Set to a specific
+ value to control API usage and performance.
+ show_progress (bool): Whether to display a progress bar during extraction.
+ Useful for large datasets. Defaults to False.
+ **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+ `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
 
- def extract(self, column: str) -> pd.DataFrame:
- """Flatten one column of Pydantic models/dicts into top‑level columns.
+ Returns:
+ pd.DataFrame: A DataFrame with extracted structured data. Each inferred
+ field becomes a column, with the same index as the original Series.
+ Column names and types are determined by the inferred schema.
 
  Example:
  ```python
- df = pd.DataFrame([
- {"animal": {"name": "cat", "legs": 4}},
- {"animal": {"name": "dog", "legs": 4}},
- {"animal": {"name": "elephant", "legs": 4}},
+ # Extract structured data from product reviews
+ reviews = pd.Series([
+ "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
+ "Decent phone. 128GB storage, camera is okay, screen is bright",
+ "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
  ])
- df.ai.extract("animal")
- ```
- This method returns a DataFrame with the same index as the original,
- where each column corresponds to a key in the dictionaries.
- The source column is dropped.
 
- Args:
- column (str): Column to expand.
+ # One-step extraction
+ extracted = reviews.ai.auto_extract(
+ purpose="Extract product specifications and performance metrics",
+ show_progress=True
+ )
+ # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
 
- Returns:
- pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+ # Extract sentiment and issues from support tickets
+ tickets = pd.Series([
+ "Account locked, can't reset password, very frustrated",
+ "Billing error, charged twice for subscription",
+ "Great support! Issue resolved quickly"
+ ])
+
+ features = tickets.ai.auto_extract(
+ purpose="Extract issue type and customer sentiment for support analytics"
+ )
+ ```
+
+ Note:
+ This method is ideal for exploratory data analysis when you don't have
+ a predefined schema. For production use cases with stable schemas,
+ consider using `infer_schema()` once and reusing the schema with `task()`.
+ The inferred schema is not returned, so if you need to inspect or save it,
+ use `infer_schema()` and `task()` separately.
  """
- if column not in self._obj.columns:
- raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+ schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
 
- return (
- self._obj.pipe(lambda df: df.reset_index(drop=True))
- .pipe(lambda df: df.join(df[column].ai.extract()))
- .pipe(lambda df: df.set_index(self._obj.index))
- .pipe(lambda df: df.drop(columns=[column], axis=1))
- )
+ return pd.DataFrame(
+ {
+ "inferred": self._obj.ai.task(
+ task=schema.task,
+ batch_size=batch_size,
+ show_progress=show_progress,
+ **api_kwargs,
+ ),
+ }
+ ).ai.extract("inferred")
+
+
+ @pd.api.extensions.register_dataframe_accessor("ai")
+ class OpenAIVecDataFrameAccessor:
+ """pandas DataFrame accessor (``.ai``) that adds OpenAI helpers."""
+
+ def __init__(self, df_obj: pd.DataFrame):
+ self._obj = df_obj
 
  def responses_with_cache(
  self,
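The new `Series.ai.auto_extract` chains `infer_schema`, `task`, and `extract` internally. A minimal sketch of the one-step call, under the same installation assumptions as above:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401

tickets = pd.Series([
    "Account locked, can't reset password",
    "Charged twice for one subscription",
])

# Schema inference and extraction in a single call; returns a DataFrame
# with one column per inferred field, indexed like the input Series.
features = tickets.ai.auto_extract(
    purpose="Extract issue type and sentiment for support analytics",
)
```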
@@ -508,25 +638,12 @@ class OpenAIVecDataFrameAccessor:
  top_p: float = 1.0,
  **api_kwargs,
  ) -> pd.Series:
- """Generate a response for each row after serialising it to JSON using a provided cache.
+ """Generate a response for each row after serializing it to JSON using a provided cache.
 
  This method allows external control over caching behavior by accepting
  a pre-configured BatchingMapProxy instance, enabling cache sharing
  across multiple operations or custom batch size management.
 
- Args:
- instructions (str): System prompt for the assistant.
- cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
- instance for managing API call batching and deduplication.
- Set cache.batch_size=None to enable automatic batch size optimization.
- response_format (Type[ResponseFormat], optional): Desired Python type of the
- responses. Defaults to ``str``.
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
- top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
- Returns:
- pandas.Series: Responses aligned with the DataFrame's original index.
-
  Example:
  ```python
  from openaivec._proxy import BatchingMapProxy
@@ -544,6 +661,19 @@ class OpenAIVecDataFrameAccessor:
  cache=shared_cache
  )
  ```
+
+ Args:
+ instructions (str): System prompt for the assistant.
+ cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+ instance for managing API call batching and deduplication.
+ Set cache.batch_size=None to enable automatic batch size optimization.
+ response_format (Type[ResponseFormat], optional): Desired Python type of the
+ responses. Defaults to ``str``.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+ top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+ Returns:
+ pandas.Series: Responses aligned with the DataFrame's original index.
  """
  return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
  instructions=instructions,
@@ -564,7 +694,7 @@ class OpenAIVecDataFrameAccessor:
  show_progress: bool = False,
  **api_kwargs,
  ) -> pd.Series:
- """Generate a response for each row after serialising it to JSON.
+ """Generate a response for each row after serializing it to JSON.
 
  Example:
  ```python
@@ -592,7 +722,7 @@ class OpenAIVecDataFrameAccessor:
  batch_size (int | None, optional): Number of requests sent in one batch.
  Defaults to ``None`` (automatic batch size optimization
  based on execution time). Set to a positive integer for fixed batch size.
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
  top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
  show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
@@ -608,18 +738,43 @@ class OpenAIVecDataFrameAccessor:
  **api_kwargs,
  )
 
- def task(
+ def task_with_cache(
  self,
- task: PreparedTask,
- batch_size: int | None = None,
- show_progress: bool = False,
+ task: PreparedTask[ResponseFormat],
+ cache: BatchingMapProxy[str, ResponseFormat],
  **api_kwargs,
  ) -> pd.Series:
- """Execute a prepared task on each DataFrame row after serialising it to JSON.
+ """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
 
- Example:
- ```python
- from openaivec._model import PreparedTask
+ Args:
+ task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+ cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+
+ Additional Keyword Args:
+ Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+ ``seed``) forwarded verbatim. Core routing keys are managed internally.
+
+ Returns:
+ pandas.Series: Task results aligned with the DataFrame's original index.
+ """
+ return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+ task=task,
+ cache=cache,
+ **api_kwargs,
+ )
+
+ def task(
+ self,
+ task: PreparedTask,
+ batch_size: int | None = None,
+ show_progress: bool = False,
+ **api_kwargs,
+ ) -> pd.Series:
+ """Execute a prepared task on each DataFrame row after serializing it to JSON.
+
+ Example:
+ ```python
+ from openaivec._model import PreparedTask
 
  # Assume you have a prepared task for data analysis
  analysis_task = PreparedTask(...)
@@ -666,29 +821,92 @@ class OpenAIVecDataFrameAccessor:
  **api_kwargs,
  )
 
- def task_with_cache(
- self,
- task: PreparedTask[ResponseFormat],
- cache: BatchingMapProxy[str, ResponseFormat],
- **api_kwargs,
- ) -> pd.Series:
- """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+ def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+ """Infer a structured data schema from DataFrame rows using AI.
+
+ This method analyzes a sample of DataFrame rows to automatically infer
+ a structured schema that can be used for consistent data extraction.
+ Each row is converted to JSON format and analyzed to identify patterns,
+ field types, and potential categorical values.
 
  Args:
- task (PreparedTask): Prepared task (instructions + response_format + sampling params).
- cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+ purpose (str): Plain language description of how the extracted
+ structured data will be used (e.g., "Extract operational metrics
+ for dashboard", "Parse customer attributes for segmentation").
+ This guides field relevance and helps exclude irrelevant information.
+ max_examples (int): Maximum number of rows to analyze from the
+ DataFrame. The method will sample randomly up to this limit.
+ Defaults to 100.
 
- Additional Keyword Args:
- Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
- ``seed``) forwarded verbatim. Core routing keys are managed internally.
+ Returns:
+ InferredSchema: An object containing:
+ - purpose: Normalized statement of the extraction objective
+ - fields: List of field specifications with names, types, and descriptions
+ - inference_prompt: Reusable prompt for future extractions
+ - model: Dynamically generated Pydantic model for parsing
+ - task: PreparedTask for batch extraction operations
+
+ Example:
+ ```python
+ df = pd.DataFrame({
+ 'text': [
+ "Order #123: Shipped to NYC, arriving Tuesday",
+ "Order #456: Delayed due to weather, new ETA Friday",
+ "Order #789: Delivered to customer in LA"
+ ],
+ 'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
+ })
+
+ # Infer schema for logistics tracking
+ schema = df.ai.infer_schema(
+ purpose="Extract shipping status and location data for logistics tracking"
+ )
+
+ # Apply the schema to extract structured data
+ extracted_df = df.ai.task(schema.task)
+ ```
+
+ Note:
+ The DataFrame rows are internally converted to JSON format before
+ analysis. The inferred schema is flat (no nested structures) and
+ uses only primitive types to ensure compatibility with pandas and
+ Spark operations.
+ """
+ return _df_rows_to_json_series(self._obj).ai.infer_schema(
+ purpose=purpose,
+ max_examples=max_examples,
+ )
+
+ def extract(self, column: str) -> pd.DataFrame:
+ """Flatten one column of Pydantic models/dicts into top‑level columns.
+
+ Example:
+ ```python
+ df = pd.DataFrame([
+ {"animal": {"name": "cat", "legs": 4}},
+ {"animal": {"name": "dog", "legs": 4}},
+ {"animal": {"name": "elephant", "legs": 4}},
+ ])
+ df.ai.extract("animal")
+ ```
+ This method returns a DataFrame with the same index as the original,
+ where each column corresponds to a key in the dictionaries.
+ The source column is dropped.
+
+ Args:
+ column (str): Column to expand.
 
  Returns:
- pandas.Series: Task results aligned with the DataFrame's original index.
+ pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
  """
- return _df_rows_to_json_series(self._obj).ai.task_with_cache(
- task=task,
- cache=cache,
- **api_kwargs,
+ if column not in self._obj.columns:
+ raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+ return (
+ self._obj.pipe(lambda df: df.reset_index(drop=True))
+ .pipe(lambda df: df.join(df[column].ai.extract()))
+ .pipe(lambda df: df.set_index(self._obj.index))
+ .pipe(lambda df: df.drop(columns=[column], axis=1))
  )
 
  def fillna(
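The relocated `DataFrame.ai.extract` flattens a column of dicts (or Pydantic models) into top-level columns. A small sketch mirroring the docstring example:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401

df = pd.DataFrame({
    "id": [1, 2],
    "animal": [{"name": "cat", "legs": 4}, {"name": "dog", "legs": 4}],
})
flat = df.ai.extract("animal")  # adds 'name' and 'legs' columns; drops 'animal'
```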
@@ -769,6 +987,100 @@ class OpenAIVecDataFrameAccessor:
 
  return df
 
+ def auto_extract(
+ self,
+ purpose: str,
+ max_examples: int = 100,
+ batch_size: int | None = None,
+ show_progress: bool = False,
+ **api_kwargs,
+ ) -> pd.DataFrame:
+ """Automatically infer schema and add extracted fields to the DataFrame.
+
+ This convenience method combines schema inference and data extraction to
+ automatically add new columns to the existing DataFrame. It analyzes a
+ sample of the DataFrame rows to infer an appropriate schema based on the
+ stated purpose, then extracts structured data and joins it with the
+ original DataFrame.
+
+ Args:
+ purpose (str): Plain language description of what information to extract
+ and how it will be used (e.g., "Extract customer sentiment metrics",
+ "Parse product attributes for analytics"). This guides both schema
+ inference and field selection.
+ max_examples (int): Maximum number of rows to use for schema inference.
+ A larger sample may produce more accurate schemas but increases
+ inference time. Defaults to 100.
+ batch_size (int | None): Number of requests to process in parallel during
+ extraction. Defaults to None (automatic optimization). Set to a specific
+ value to control API usage and performance.
+ show_progress (bool): Whether to display a progress bar during extraction.
+ Useful for large datasets. Defaults to False.
+ **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+ `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+ Returns:
+ pd.DataFrame: The original DataFrame with new columns added from the
+ inferred structured data. Each inferred field becomes a new column.
+ The original columns and index are preserved.
+
+ Example:
+ ```python
+ # Add sentiment and issue type to support tickets
+ df = pd.DataFrame({
+ 'ticket_id': [1, 2, 3],
+ 'description': [
+ "Can't login, password reset not working",
+ "Billing error, charged twice last month",
+ "Great service, issue resolved quickly!"
+ ],
+ 'date': ['2024-01-01', '2024-01-02', '2024-01-03']
+ })
+
+ # Add inferred fields to existing DataFrame
+ enriched_df = df.ai.auto_extract(
+ purpose="Extract issue type and sentiment for support dashboard",
+ show_progress=True
+ )
+ # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
+
+ # Add product specifications to inventory data
+ inventory = pd.DataFrame({
+ 'sku': ['A001', 'B002', 'C003'],
+ 'description': [
+ "Laptop 16GB RAM, 512GB SSD, Intel i7",
+ "Phone 128GB, 5G, dual camera",
+ "Tablet 10-inch, WiFi only, 64GB"
+ ]
+ })
+
+ enriched_inventory = inventory.ai.auto_extract(
+ purpose="Extract technical specifications for inventory system"
+ )
+ ```
+
+ Note:
+ This method is ideal for enriching existing DataFrames with additional
+ structured fields extracted from text columns. The schema is inferred
+ from the entire DataFrame content (converted to JSON format). For
+ production use cases with stable schemas, consider using `infer_schema()`
+ once and reusing the schema with `task()`.
+ """
+ # Infer schema from DataFrame rows
+ schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+ # Extract structured data using the inferred schema
+ inferred_series = self._obj.ai.task(
+ task=schema.task,
+ batch_size=batch_size,
+ show_progress=show_progress,
+ **api_kwargs,
+ )
+
+ return self._obj.assign(
+ inferred=inferred_series,
+ ).ai.extract("inferred")
+
  def similarity(self, col1: str, col2: str) -> pd.Series:
  """Compute cosine similarity between two columns containing embedding vectors.
 
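Unlike the Series variant, `DataFrame.ai.auto_extract` enriches in place via `assign` + `extract`, keeping the original columns. A minimal sketch under the same assumptions as the earlier examples:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401

inventory = pd.DataFrame({
    "sku": ["A001", "B002"],
    "description": ["Laptop, 16GB RAM, 512GB SSD", "Phone, 128GB, dual camera"],
})

# Original columns are preserved; inferred fields are appended as new columns.
enriched = inventory.ai.auto_extract(
    purpose="Extract technical specifications for inventory search",
)
```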
@@ -776,15 +1088,6 @@ class OpenAIVecDataFrameAccessor:
  two columns of the DataFrame. The vectors should be numpy arrays or
  array-like objects that support dot product operations.
 
- Args:
- col1 (str): Name of the first column containing embedding vectors.
- col2 (str): Name of the second column containing embedding vectors.
-
- Returns:
- pandas.Series: Series containing cosine similarity scores between
- corresponding vectors in col1 and col2, with values ranging
- from -1 to 1, where 1 indicates identical direction.
-
  Example:
  ```python
  df = pd.DataFrame({
@@ -793,6 +1096,15 @@ class OpenAIVecDataFrameAccessor:
  })
  similarities = df.ai.similarity('vec1', 'vec2')
  ```
+
+ Args:
+ col1 (str): Name of the first column containing embedding vectors.
+ col2 (str): Name of the second column containing embedding vectors.
+
+ Returns:
+ pandas.Series: Series containing cosine similarity scores between
+ corresponding vectors in col1 and col2, with values ranging
+ from -1 to 1, where 1 indicates identical direction.
+ """
  return self._obj.apply(
  lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
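A short sketch combining the embeddings and similarity accessors documented above (embeddings return one `np.ndarray` per row, so the dot-product formula applies directly):

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401

df = pd.DataFrame({"a": ["cat", "dog"], "b": ["kitten", "puppy"]})
df["vec1"] = df["a"].ai.embeddings()  # one float32 np.ndarray per row
df["vec2"] = df["b"].ai.embeddings()
df["cosine"] = df.ai.similarity("vec1", "vec2")  # values in [-1, 1]
```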
@@ -823,6 +1135,16 @@ class AsyncOpenAIVecSeriesAccessor:
  across multiple operations or custom batch size management. The concurrency
  is controlled by the cache instance itself.
 
+ Example:
+ ```python
+ result = await series.aio.responses_with_cache(
+ "classify",
+ cache=shared,
+ max_output_tokens=256,
+ frequency_penalty=0.2,
+ )
+ ```
+
  Args:
  instructions (str): System prompt prepended to every user message.
  cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
@@ -841,16 +1163,6 @@ class AsyncOpenAIVecSeriesAccessor:
  Returns:
  pandas.Series: Series whose values are instances of ``response_format``.
 
- Example:
- ```python
- result = await series.aio.responses_with_cache(
- "classify",
- cache=shared,
- max_output_tokens=256,
- frequency_penalty=0.2,
- )
- ```
-
  Note:
  This is an asynchronous method and must be awaited.
  """
@@ -866,122 +1178,6 @@ class AsyncOpenAIVecSeriesAccessor:
  results = await client.parse(self._obj.tolist(), **api_kwargs)
  return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
- async def embeddings_with_cache(
- self,
- cache: AsyncBatchingMapProxy[str, np.ndarray],
- ) -> pd.Series:
- """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
-
- This method allows external control over caching behavior by accepting
- a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
- across multiple operations or custom batch size management. The concurrency
- is controlled by the cache instance itself.
-
- Args:
- cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
- instance for managing API call batching and deduplication.
- Set cache.batch_size=None to enable automatic batch size optimization.
-
- Returns:
- pandas.Series: Series whose values are ``np.ndarray`` objects
- (dtype ``float32``).
-
- Example:
- ```python
- from openaivec._proxy import AsyncBatchingMapProxy
- import numpy as np
-
- # Create a shared cache with custom batch size and concurrency
- shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
- batch_size=64, max_concurrency=4
- )
-
- animals = pd.Series(["cat", "dog", "elephant"])
- # Must be awaited
- embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
- ```
-
- Note:
- This is an asynchronous method and must be awaited.
- """
- client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
- client=CONTAINER.resolve(AsyncOpenAI),
- model_name=CONTAINER.resolve(EmbeddingsModelName).value,
- cache=cache,
- )
-
- # Await the async operation
- results = await client.create(self._obj.tolist())
-
- return pd.Series(
- results,
- index=self._obj.index,
- name=self._obj.name,
- )
-
- async def task_with_cache(
- self,
- task: PreparedTask[ResponseFormat],
- cache: AsyncBatchingMapProxy[str, ResponseFormat],
- **api_kwargs,
- ) -> pd.Series:
- """Execute a prepared task on every Series element using a provided cache (asynchronously).
-
- This method allows external control over caching behavior by accepting
- a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
- across multiple operations or custom batch size management. The concurrency
- is controlled by the cache instance itself.
-
- Args:
- task (PreparedTask): A pre-configured task containing instructions,
- response format, and other parameters for processing the inputs.
- cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
- instance for managing API call batching and deduplication.
- Set cache.batch_size=None to enable automatic batch size optimization.
-
- Additional Keyword Args:
- Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
- ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
- keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
- library and cannot be overridden.
-
- Returns:
- pandas.Series: Series whose values are instances of the task's
- response format, aligned with the original Series index.
-
- Example:
- ```python
- from openaivec._model import PreparedTask
- from openaivec._proxy import AsyncBatchingMapProxy
-
- # Create a shared cache with custom batch size and concurrency
- shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
-
- # Assume you have a prepared task for sentiment analysis
- sentiment_task = PreparedTask(...)
-
- reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
- # Must be awaited
- results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
- ```
-
- Note:
- This is an asynchronous method and must be awaited.
- """
- client = AsyncBatchResponses(
- client=CONTAINER.resolve(AsyncOpenAI),
- model_name=CONTAINER.resolve(ResponsesModelName).value,
- system_message=task.instructions,
- response_format=task.response_format,
- cache=cache,
- temperature=task.temperature,
- top_p=task.top_p,
- )
- # Await the async operation
- results = await client.parse(self._obj.tolist(), **api_kwargs)
-
- return pd.Series(results, index=self._obj.index, name=self._obj.name)
-
  async def responses(
  self,
  instructions: str,
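A hedged sketch of the async `.aio.responses` accessor documented in the surrounding hunks (uses `asyncio.run`, which suits scripts; in a notebook you would `await` directly):

```python
import asyncio
import pandas as pd
from openaivec import pandas_ext  # noqa: F401  (registers the .aio accessor)

async def main() -> pd.Series:
    animals = pd.Series(["cat", "dog", "elephant"])
    # Batches are dispatched concurrently, bounded by max_concurrency.
    return await animals.aio.responses(
        "Translate the animal name to French.",
        batch_size=32,
        max_concurrency=4,
    )

translated = asyncio.run(main())
```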
@@ -1018,27 +1214,80 @@ class AsyncOpenAIVecSeriesAccessor:
  batch_size (int | None, optional): Number of prompts grouped into a single
  request. Defaults to ``None`` (automatic batch size optimization
  based on execution time). Set to a positive integer for fixed batch size.
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
  top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
  max_concurrency (int, optional): Maximum number of concurrent
  requests. Defaults to ``8``.
  show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
 
  Returns:
- pandas.Series: Series whose values are instances of ``response_format``.
+ pandas.Series: Series whose values are instances of ``response_format``.
+
+ Note:
+ This is an asynchronous method and must be awaited.
+ """
+ return await self.responses_with_cache(
+ instructions=instructions,
+ cache=AsyncBatchingMapProxy(
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+ ),
+ response_format=response_format,
+ temperature=temperature,
+ top_p=top_p,
+ **api_kwargs,
+ )
+
+ async def embeddings_with_cache(
+ self,
+ cache: AsyncBatchingMapProxy[str, np.ndarray],
+ ) -> pd.Series:
+ """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+ This method allows external control over caching behavior by accepting
+ a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+ across multiple operations or custom batch size management. The concurrency
+ is controlled by the cache instance itself.
+
+ Example:
+ ```python
+ from openaivec._proxy import AsyncBatchingMapProxy
+ import numpy as np
+
+ # Create a shared cache with custom batch size and concurrency
+ shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+ batch_size=64, max_concurrency=4
+ )
+
+ animals = pd.Series(["cat", "dog", "elephant"])
+ # Must be awaited
+ embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+ ```
+
+ Args:
+ cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+ instance for managing API call batching and deduplication.
+ Set cache.batch_size=None to enable automatic batch size optimization.
+
+ Returns:
+ pandas.Series: Series whose values are ``np.ndarray`` objects
+ (dtype ``float32``).
 
  Note:
  This is an asynchronous method and must be awaited.
  """
- return await self.responses_with_cache(
- instructions=instructions,
- cache=AsyncBatchingMapProxy(
- batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
- ),
- response_format=response_format,
- temperature=temperature,
- top_p=top_p,
- **api_kwargs,
+ client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+ client=CONTAINER.resolve(AsyncOpenAI),
+ model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+ cache=cache,
+ )
+
+ # Await the async operation
+ results = await client.create(self._obj.tolist())
+
+ return pd.Series(
+ results,
+ index=self._obj.index,
+ name=self._obj.name,
  )
 
  async def embeddings(
@@ -1082,6 +1331,69 @@ class AsyncOpenAIVecSeriesAccessor:
  ),
  )
 
+ async def task_with_cache(
+ self,
+ task: PreparedTask[ResponseFormat],
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
+ **api_kwargs,
+ ) -> pd.Series:
+ """Execute a prepared task on every Series element using a provided cache (asynchronously).
+
+ This method allows external control over caching behavior by accepting
+ a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+ across multiple operations or custom batch size management. The concurrency
+ is controlled by the cache instance itself.
+
+ Args:
+ task (PreparedTask): A pre-configured task containing instructions,
+ response format, and other parameters for processing the inputs.
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+ instance for managing API call batching and deduplication.
+ Set cache.batch_size=None to enable automatic batch size optimization.
+
+ Example:
+ ```python
+ from openaivec._model import PreparedTask
+ from openaivec._proxy import AsyncBatchingMapProxy
+
+ # Create a shared cache with custom batch size and concurrency
+ shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+ # Assume you have a prepared task for sentiment analysis
+ sentiment_task = PreparedTask(...)
+
+ reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+ # Must be awaited
+ results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+ ```
+
+ Additional Keyword Args:
+ Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+ ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+ keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+ library and cannot be overridden.
+
+ Returns:
+ pandas.Series: Series whose values are instances of the task's
+ response format, aligned with the original Series index.
+
+ Note:
+ This is an asynchronous method and must be awaited.
+ """
+ client = AsyncBatchResponses(
+ client=CONTAINER.resolve(AsyncOpenAI),
+ model_name=CONTAINER.resolve(ResponsesModelName).value,
+ system_message=task.instructions,
+ response_format=task.response_format,
+ cache=cache,
+ temperature=task.temperature,
+ top_p=task.top_p,
+ )
+ # Await the async operation
+ results = await client.parse(self._obj.tolist(), **api_kwargs)
+
+ return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
  async def task(
  self,
  task: PreparedTask,
@@ -1144,6 +1456,96 @@ class AsyncOpenAIVecSeriesAccessor:
  **api_kwargs,
  )
 
+ async def auto_extract(
+ self,
+ purpose: str,
+ max_examples: int = 100,
+ batch_size: int | None = None,
+ max_concurrency: int = 8,
+ show_progress: bool = False,
+ **api_kwargs,
+ ) -> pd.DataFrame:
+ """Automatically infer schema and extract structured data in one step (asynchronously).
+
+ This convenience method combines schema inference and data extraction into
+ a single operation. It first analyzes a sample of the Series to infer an
+ appropriate schema based on the stated purpose, then immediately applies
+ that schema to extract structured data from all values in the Series.
+
+ Args:
+ purpose (str): Plain language description of what information to extract
+ and how it will be used (e.g., "Extract product features for search",
+ "Parse customer feedback for sentiment analysis"). This guides both
+ schema inference and field selection.
+ max_examples (int): Maximum number of examples to use for schema inference.
+ A larger sample may produce more accurate schemas but increases
+ inference time. Defaults to 100.
+ batch_size (int | None): Number of requests to process in parallel during
+ extraction. Defaults to None (automatic optimization). Set to a specific
+ value to control API usage and performance.
+ max_concurrency (int): Maximum number of concurrent requests during
+ extraction. Defaults to 8.
+ show_progress (bool): Whether to display a progress bar during extraction.
+ Useful for large datasets. Defaults to False.
+ **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+ `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+ Returns:
+ pd.DataFrame: A DataFrame with extracted structured data. Each inferred
+ field becomes a column, with the same index as the original Series.
+ Column names and types are determined by the inferred schema.
+
+ Example:
+ ```python
+ # Extract structured data from product reviews
+ reviews = pd.Series([
+ "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
+ "Decent phone. 128GB storage, camera is okay, screen is bright",
+ "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
+ ])
+
+ # One-step extraction (must be awaited)
+ extracted = await reviews.aio.auto_extract(
+ purpose="Extract product specifications and performance metrics",
+ max_concurrency=4,
+ show_progress=True
+ )
+ # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
+
+ # Extract sentiment and issues from support tickets
+ tickets = pd.Series([
+ "Account locked, can't reset password, very frustrated",
+ "Billing error, charged twice for subscription",
+ "Great support! Issue resolved quickly"
+ ])
+
+ features = await tickets.aio.auto_extract(
+ purpose="Extract issue type and customer sentiment for support analytics",
+ batch_size=32
+ )
+ ```
+
+ Note:
+ This is an asynchronous method and must be awaited. This method is ideal
+ for exploratory data analysis when you don't have a predefined schema.
+ For production use cases with stable schemas, consider using the synchronous
+ `infer_schema()` once and reusing the schema with `task()`. The inferred
+ schema is not returned, so if you need to inspect or save it, use
+ `infer_schema()` and `task()` separately.
+ """
+ # Use synchronous infer_schema since it's not async
+ schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+ inferred_series = await self._obj.aio.task(
+ task=schema.task,
+ batch_size=batch_size,
+ max_concurrency=max_concurrency,
+ show_progress=show_progress,
+ **api_kwargs,
+ )
+
+ return pd.DataFrame({"inferred": inferred_series}).ai.extract("inferred")
+
 
  @pd.api.extensions.register_dataframe_accessor("aio")
  class AsyncOpenAIVecDataFrameAccessor:
@@ -1161,26 +1563,13 @@ class AsyncOpenAIVecDataFrameAccessor:
  top_p: float = 1.0,
  **api_kwargs,
  ) -> pd.Series:
- """Generate a response for each row after serialising it to JSON using a provided cache (asynchronously).
+ """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
 
  This method allows external control over caching behavior by accepting
  a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
  across multiple operations or custom batch size management. The concurrency
  is controlled by the cache instance itself.
 
- Args:
- instructions (str): System prompt for the assistant.
- cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
- instance for managing API call batching and deduplication.
- Set cache.batch_size=None to enable automatic batch size optimization.
- response_format (Type[ResponseFormat], optional): Desired Python type of the
- responses. Defaults to ``str``.
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
- top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
- Returns:
- pandas.Series: Responses aligned with the DataFrame's original index.
-
  Example:
  ```python
  from openaivec._proxy import AsyncBatchingMapProxy
@@ -1200,6 +1589,19 @@ class AsyncOpenAIVecDataFrameAccessor:
  )
  ```
 
+ Args:
+ instructions (str): System prompt for the assistant.
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+ instance for managing API call batching and deduplication.
+ Set cache.batch_size=None to enable automatic batch size optimization.
+ response_format (Type[ResponseFormat], optional): Desired Python type of the
+ responses. Defaults to ``str``.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+ top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+ Returns:
+ pandas.Series: Responses aligned with the DataFrame's original index.
+
  Note:
  This is an asynchronous method and must be awaited.
  """
@@ -1224,7 +1626,7 @@ class AsyncOpenAIVecDataFrameAccessor:
  show_progress: bool = False,
  **api_kwargs,
  ) -> pd.Series:
- """Generate a response for each row after serialising it to JSON (asynchronously).
+ """Generate a response for each row after serializing it to JSON (asynchronously).
 
  Example:
  ```python
@@ -1253,7 +1655,7 @@ class AsyncOpenAIVecDataFrameAccessor:
  batch_size (int | None, optional): Number of requests sent in one batch.
  Defaults to ``None`` (automatic batch size optimization
  based on execution time). Set to a positive integer for fixed batch size.
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+ temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
  top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
  max_concurrency (int, optional): Maximum number of concurrent
  requests. Defaults to ``8``.
@@ -1276,6 +1678,35 @@ class AsyncOpenAIVecDataFrameAccessor:
  **api_kwargs,
  )
 
+ async def task_with_cache(
+ self,
+ task: PreparedTask[ResponseFormat],
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
+ **api_kwargs,
+ ) -> pd.Series:
+ """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
+
+ After serializing each row to JSON, this method executes the prepared task.
+
+ Args:
+ task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
+
+ Additional Keyword Args:
+ Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
+
+ Returns:
+ pandas.Series: Task results aligned with the DataFrame's original index.
+
+ Note:
+ This is an asynchronous method and must be awaited.
+ """
+ return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
+ task=task,
+ cache=cache,
+ **api_kwargs,
+ )
+
  async def task(
  self,
  task: PreparedTask,
@@ -1284,7 +1715,7 @@ class AsyncOpenAIVecDataFrameAccessor:
  show_progress: bool = False,
  **api_kwargs,
  ) -> pd.Series:
- """Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).
+ """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).
 
  Example:
  ```python
@@ -1343,40 +1774,24 @@ class AsyncOpenAIVecDataFrameAccessor:
  **api_kwargs,
  )
 
- async def task_with_cache(
- self,
- task: PreparedTask[ResponseFormat],
- cache: AsyncBatchingMapProxy[str, ResponseFormat],
- **api_kwargs,
- ) -> pd.Series:
- """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache (async).
-
- Args:
- task (PreparedTask): Prepared task (instructions + response_format + sampling params).
- cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
-
- Additional Keyword Args:
- Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
-
- Returns:
- pandas.Series: Task results aligned with the DataFrame's original index.
-
- Note:
- This is an asynchronous method and must be awaited.
- """
- return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
- task=task,
- cache=cache,
- **api_kwargs,
- )
-
  async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
- """
- Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
+ """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
 
  This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
  but with support for asynchronous functions.
 
+ Example:
+ ```python
+ async def process_data(df):
+ # Simulate an asynchronous computation
+ await asyncio.sleep(1)
+ return df.dropna()
+
+ df = pd.DataFrame({"col": [1, 2, None, 4]})
+ # Must be awaited
+ result = await df.aio.pipe(process_data)
+ ```
+
  Args:
  func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
  as input and returns either a result or an awaitable result.
@@ -1538,3 +1953,103 @@ class AsyncOpenAIVecDataFrameAccessor:
  df.at[actual_index, target_column_name] = result.output
 
  return df
+
+ async def auto_extract(
+ self,
+ purpose: str,
+ max_examples: int = 100,
+ batch_size: int | None = None,
+ max_concurrency: int = 8,
+ show_progress: bool = False,
+ **api_kwargs,
+ ) -> pd.DataFrame:
+ """Automatically infer schema and add extracted fields to the DataFrame (asynchronously).
+
+ This convenience method combines schema inference and data extraction to
+ automatically add new columns to the existing DataFrame. It analyzes a
+ sample of the DataFrame rows to infer an appropriate schema based on the
+ stated purpose, then extracts structured data and joins it with the
+ original DataFrame.
+
+ Args:
+ purpose (str): Plain language description of what information to extract
+ and how it will be used (e.g., "Extract customer sentiment metrics",
+ "Parse product attributes for analytics"). This guides both schema
+ inference and field selection.
+ max_examples (int): Maximum number of rows to use for schema inference.
+ A larger sample may produce more accurate schemas but increases
+ inference time. Defaults to 100.
+ batch_size (int | None): Number of requests to process in parallel during
+ extraction. Defaults to None (automatic optimization). Set to a specific
+ value to control API usage and performance.
+ max_concurrency (int): Maximum number of concurrent requests during
+ extraction. Defaults to 8.
+ show_progress (bool): Whether to display a progress bar during extraction.
+ Useful for large datasets. Defaults to False.
+ **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+ `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+ Returns:
+ pd.DataFrame: The original DataFrame with new columns added from the
+ inferred structured data. Each inferred field becomes a new column.
+ The original columns and index are preserved.
+
+ Example:
+ ```python
+ # Add sentiment and issue type to support tickets
+ df = pd.DataFrame({
+ 'ticket_id': [1, 2, 3],
+ 'description': [
+ "Can't login, password reset not working",
+ "Billing error, charged twice last month",
+ "Great service, issue resolved quickly!"
+ ],
+ 'date': ['2024-01-01', '2024-01-02', '2024-01-03']
+ })
+
+ # Add inferred fields to existing DataFrame (must be awaited)
+ enriched_df = await df.aio.auto_extract(
+ purpose="Extract issue type and sentiment for support dashboard",
+ max_concurrency=4,
+ show_progress=True
+ )
+ # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
+
+ # Add product specifications to inventory data
+ inventory = pd.DataFrame({
+ 'sku': ['A001', 'B002', 'C003'],
+ 'description': [
+ "Laptop 16GB RAM, 512GB SSD, Intel i7",
+ "Phone 128GB, 5G, dual camera",
+ "Tablet 10-inch, WiFi only, 64GB"
+ ]
+ })
+
+ enriched_inventory = await inventory.aio.auto_extract(
+ purpose="Extract technical specifications for inventory system",
+ batch_size=32
+ )
+ ```
+
+ Note:
+ This is an asynchronous method and must be awaited. This method is ideal
+ for enriching existing DataFrames with additional structured fields
+ extracted from text columns. The schema is inferred synchronously from
+ the DataFrame content. For production use cases with stable schemas,
+ consider using `infer_schema()` once and reusing the schema with `task()`.
+ """
+ # Infer schema from DataFrame rows (synchronous)
+ schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+ # Extract structured data using the inferred schema (asynchronous)
+ inferred_series = await self._obj.aio.task(
+ task=schema.task,
+ batch_size=batch_size,
+ max_concurrency=max_concurrency,
+ show_progress=show_progress,
+ **api_kwargs,
+ )
+
+ return self._obj.assign(
+ inferred=inferred_series,
+ ).ai.extract("inferred")
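Closing the release, a hedged sketch of the new async `DataFrame.aio.auto_extract` shown in this final hunk (script-style `asyncio.run`; in a notebook you would `await` directly):

```python
import asyncio
import pandas as pd
from openaivec import pandas_ext  # noqa: F401

async def main() -> pd.DataFrame:
    df = pd.DataFrame({"description": ["Laptop, 16GB RAM", "Phone, 128GB"]})
    # Schema inference runs synchronously; extraction is awaited and
    # bounded by max_concurrency, mirroring the sync .ai.auto_extract.
    return await df.aio.auto_extract(
        purpose="Extract hardware specifications",
        max_concurrency=4,
    )

enriched = asyncio.run(main())
```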