openaivec 0.14.9__py3-none-any.whl → 0.14.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/pandas_ext.py CHANGED
@@ -216,8 +216,12 @@ class OpenAIVecSeriesAccessor:
216
216
  top_p=top_p,
217
217
  )
218
218
 
219
- # Forward any extra kwargs to the underlying Responses API.
220
- return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
219
+ # Forward any extra kwargs to the underlying Responses API, excluding proxy-specific ones.
220
+ proxy_params = {"show_progress", "batch_size"}
221
+ filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
222
+ return pd.Series(
223
+ client.parse(self._obj.tolist(), **filtered_kwargs), index=self._obj.index, name=self._obj.name
224
+ )
221
225
 
222
226
  def responses(
223
227
  self,
@@ -437,7 +441,94 @@ class OpenAIVecSeriesAccessor:
437
441
  **api_kwargs,
438
442
  )
439
443
 
440
- def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
444
+ def parse_with_cache(
445
+ self,
446
+ instructions: str,
447
+ cache: BatchingMapProxy[str, ResponseFormat],
448
+ response_format: ResponseFormat = None,
449
+ max_examples: int = 100,
450
+ temperature: float | None = 0.0,
451
+ top_p: float = 1.0,
452
+ **api_kwargs,
453
+ ) -> pd.Series:
454
+ """Parse Series values using an LLM with a provided cache.
455
+ This method allows you to parse the Series content into structured data
456
+ using an LLM, optionally inferring a schema based on the provided instructions.
457
+ Args:
458
+ instructions (str): System prompt for the LLM.
459
+ cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
460
+ batching and deduplication control.
461
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
462
+ for structured output. If None, schema is inferred.
463
+ max_examples (int): Maximum number of examples to use for schema inference.
464
+ Defaults to 100.
465
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
466
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
467
+ Additional Keyword Args:
468
+ Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
469
+ `seed`, etc.) are forwarded verbatim to the underlying client.
470
+ Returns:
471
+ pandas.Series: Series with parsed structured data as instances of
472
+ `response_format` or inferred schema model.
473
+ """
474
+
475
+ schema: InferredSchema | None = None
476
+ if response_format is None:
477
+ schema = self.infer_schema(purpose=instructions, max_examples=max_examples, **api_kwargs)
478
+
479
+ return self.responses_with_cache(
480
+ instructions=schema.inference_prompt if schema else instructions,
481
+ cache=cache,
482
+ response_format=response_format or schema.model,
483
+ temperature=temperature,
484
+ top_p=top_p,
485
+ **api_kwargs,
486
+ )
487
+
488
+ def parse(
489
+ self,
490
+ instructions: str,
491
+ response_format: ResponseFormat = None,
492
+ max_examples: int = 100,
493
+ batch_size: int | None = None,
494
+ show_progress: bool = False,
495
+ temperature: float | None = 0.0,
496
+ top_p: float = 1.0,
497
+ **api_kwargs,
498
+ ) -> pd.Series:
499
+ """Parse Series values using an LLM with optional schema inference.
500
+
501
+ This method allows you to parse the Series content into structured data
502
+ using an LLM, optionally inferring a schema based on the provided instructions.
503
+
504
+ Args:
505
+ instructions (str): System prompt for the LLM.
506
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
507
+ for structured output. If None, schema is inferred.
508
+ max_examples (int): Maximum number of examples to use for schema inference.
509
+ Defaults to 100.
510
+ batch_size (int | None): Number of requests to process in parallel.
511
+ Defaults to None (automatic optimization).
512
+ show_progress (bool): Whether to display a progress bar during processing.
513
+ Defaults to False.
514
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
515
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
516
+
517
+ Returns:
518
+ pandas.Series: Series with parsed structured data as instances of
519
+ `response_format` or inferred schema model.
520
+ """
521
+ return self.parse_with_cache(
522
+ instructions=instructions,
523
+ cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
524
+ response_format=response_format,
525
+ max_examples=max_examples,
526
+ temperature=temperature,
527
+ top_p=top_p,
528
+ **api_kwargs,
529
+ )
530
+
531
+ def infer_schema(self, purpose: str, max_examples: int = 100, **api_kwargs) -> InferredSchema:
441
532
  """Infer a structured data schema from Series content using AI.
442
533
 
443
534
  This method analyzes a sample of the Series values to automatically infer
@@ -488,7 +579,7 @@ class OpenAIVecSeriesAccessor:
488
579
  inferer = CONTAINER.resolve(SchemaInferer)
489
580
 
490
581
  input: SchemaInferenceInput = SchemaInferenceInput(
491
- examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose
582
+ examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose, **api_kwargs
492
583
  )
493
584
  return inferer.infer_schema(input)
494
585
 
@@ -538,90 +629,6 @@ class OpenAIVecSeriesAccessor:
538
629
  extracted.columns = [f"{self._obj.name}_{col}" for col in extracted.columns]
539
630
  return extracted
540
631
 
541
- def auto_extract(
542
- self,
543
- purpose: str,
544
- max_examples: int = 100,
545
- batch_size: int | None = None,
546
- show_progress: bool = False,
547
- **api_kwargs,
548
- ) -> pd.DataFrame:
549
- """Automatically infer schema and extract structured data in one step.
550
-
551
- This convenience method combines schema inference and data extraction into
552
- a single operation. It first analyzes a sample of the Series to infer an
553
- appropriate schema based on the stated purpose, then immediately applies
554
- that schema to extract structured data from all values in the Series.
555
-
556
- Args:
557
- purpose (str): Plain language description of what information to extract
558
- and how it will be used (e.g., "Extract product features for search",
559
- "Parse customer feedback for sentiment analysis"). This guides both
560
- schema inference and field selection.
561
- max_examples (int): Maximum number of examples to use for schema inference.
562
- A larger sample may produce more accurate schemas but increases
563
- inference time. Defaults to 100.
564
- batch_size (int | None): Number of requests to process in parallel during
565
- extraction. Defaults to None (automatic optimization). Set to a specific
566
- value to control API usage and performance.
567
- show_progress (bool): Whether to display a progress bar during extraction.
568
- Useful for large datasets. Defaults to False.
569
- **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
570
- `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
571
-
572
- Returns:
573
- pd.DataFrame: A DataFrame with extracted structured data. Each inferred
574
- field becomes a column, with the same index as the original Series.
575
- Column names and types are determined by the inferred schema.
576
-
577
- Example:
578
- ```python
579
- # Extract structured data from product reviews
580
- reviews = pd.Series([
581
- "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
582
- "Decent phone. 128GB storage, camera is okay, screen is bright",
583
- "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
584
- ])
585
-
586
- # One-step extraction
587
- extracted = reviews.ai.auto_extract(
588
- purpose="Extract product specifications and performance metrics",
589
- show_progress=True
590
- )
591
- # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
592
-
593
- # Extract sentiment and issues from support tickets
594
- tickets = pd.Series([
595
- "Account locked, can't reset password, very frustrated",
596
- "Billing error, charged twice for subscription",
597
- "Great support! Issue resolved quickly"
598
- ])
599
-
600
- features = tickets.ai.auto_extract(
601
- purpose="Extract issue type and customer sentiment for support analytics"
602
- )
603
- ```
604
-
605
- Note:
606
- This method is ideal for exploratory data analysis when you don't have
607
- a predefined schema. For production use cases with stable schemas,
608
- consider using `infer_schema()` once and reusing the schema with `task()`.
609
- The inferred schema is not returned, so if you need to inspect or save it,
610
- use `infer_schema()` and `task()` separately.
611
- """
612
- schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
613
-
614
- return pd.DataFrame(
615
- {
616
- "inferred": self._obj.ai.task(
617
- task=schema.task,
618
- batch_size=batch_size,
619
- show_progress=show_progress,
620
- **api_kwargs,
621
- ),
622
- }
623
- ).ai.extract("inferred")
624
-
625
632
 
626
633
  @pd.api.extensions.register_dataframe_accessor("ai")
627
634
  class OpenAIVecDataFrameAccessor:
@@ -822,6 +829,95 @@ class OpenAIVecDataFrameAccessor:
822
829
  **api_kwargs,
823
830
  )
824
831
 
832
+ def parse_with_cache(
833
+ self,
834
+ instructions: str,
835
+ cache: BatchingMapProxy[str, ResponseFormat],
836
+ response_format: ResponseFormat = None,
837
+ max_examples: int = 100,
838
+ temperature: float | None = 0.0,
839
+ top_p: float = 1.0,
840
+ **api_kwargs,
841
+ ) -> pd.Series:
842
+ """Parse DataFrame rows using an LLM with a provided cache.
843
+
844
+ This method allows you to parse each DataFrame row (serialized as JSON)
845
+ into structured data using an LLM, optionally inferring a schema based
846
+ on the provided instructions.
847
+
848
+ Args:
849
+ instructions (str): System prompt for the LLM.
850
+ cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
851
+ batching and deduplication control.
852
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
853
+ for structured output. If None, schema is inferred.
854
+ max_examples (int): Maximum number of examples to use for schema inference.
855
+ Defaults to 100.
856
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
857
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
858
+
859
+ Additional Keyword Args:
860
+ Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
861
+ `seed`, etc.) are forwarded verbatim to the underlying client.
862
+
863
+ Returns:
864
+ pandas.Series: Series with parsed structured data as instances of
865
+ `response_format` or inferred schema model.
866
+ """
867
+ return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
868
+ instructions=instructions,
869
+ cache=cache,
870
+ response_format=response_format,
871
+ max_examples=max_examples,
872
+ temperature=temperature,
873
+ top_p=top_p,
874
+ **api_kwargs,
875
+ )
876
+
877
+ def parse(
878
+ self,
879
+ instructions: str,
880
+ response_format: ResponseFormat = None,
881
+ max_examples: int = 100,
882
+ batch_size: int | None = None,
883
+ show_progress: bool = False,
884
+ temperature: float | None = 0.0,
885
+ top_p: float = 1.0,
886
+ **api_kwargs,
887
+ ) -> pd.Series:
888
+ """Parse DataFrame rows using an LLM with optional schema inference.
889
+
890
+ This method allows you to parse each DataFrame row (serialized as JSON)
891
+ into structured data using an LLM, optionally inferring a schema based
892
+ on the provided instructions.
893
+
894
+ Args:
895
+ instructions (str): System prompt for the LLM.
896
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
897
+ for structured output. If None, schema is inferred.
898
+ max_examples (int): Maximum number of examples to use for schema inference.
899
+ Defaults to 100.
900
+ batch_size (int | None): Number of requests to process in parallel.
901
+ Defaults to None (automatic optimization).
902
+ show_progress (bool): Whether to display a progress bar during processing.
903
+ Defaults to False.
904
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
905
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
906
+
907
+ Returns:
908
+ pandas.Series: Series with parsed structured data as instances of
909
+ `response_format` or inferred schema model.
910
+ """
911
+ return self.parse_with_cache(
912
+ instructions=instructions,
913
+ cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
914
+ response_format=response_format,
915
+ max_examples=max_examples,
916
+ temperature=temperature,
917
+ top_p=top_p,
918
+ **api_kwargs,
919
+ )
920
+
825
921
  def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
826
922
  """Infer a structured data schema from DataFrame rows using AI.
827
923
 
@@ -988,100 +1084,6 @@ class OpenAIVecDataFrameAccessor:
988
1084
 
989
1085
  return df
990
1086
 
991
- def auto_extract(
992
- self,
993
- purpose: str,
994
- max_examples: int = 100,
995
- batch_size: int | None = None,
996
- show_progress: bool = False,
997
- **api_kwargs,
998
- ) -> pd.DataFrame:
999
- """Automatically infer schema and add extracted fields to the DataFrame.
1000
-
1001
- This convenience method combines schema inference and data extraction to
1002
- automatically add new columns to the existing DataFrame. It analyzes a
1003
- sample of the DataFrame rows to infer an appropriate schema based on the
1004
- stated purpose, then extracts structured data and joins it with the
1005
- original DataFrame.
1006
-
1007
- Args:
1008
- purpose (str): Plain language description of what information to extract
1009
- and how it will be used (e.g., "Extract customer sentiment metrics",
1010
- "Parse product attributes for analytics"). This guides both schema
1011
- inference and field selection.
1012
- max_examples (int): Maximum number of rows to use for schema inference.
1013
- A larger sample may produce more accurate schemas but increases
1014
- inference time. Defaults to 100.
1015
- batch_size (int | None): Number of requests to process in parallel during
1016
- extraction. Defaults to None (automatic optimization). Set to a specific
1017
- value to control API usage and performance.
1018
- show_progress (bool): Whether to display a progress bar during extraction.
1019
- Useful for large datasets. Defaults to False.
1020
- **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
1021
- `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
1022
-
1023
- Returns:
1024
- pd.DataFrame: The original DataFrame with new columns added from the
1025
- inferred structured data. Each inferred field becomes a new column.
1026
- The original columns and index are preserved.
1027
-
1028
- Example:
1029
- ```python
1030
- # Add sentiment and issue type to support tickets
1031
- df = pd.DataFrame({
1032
- 'ticket_id': [1, 2, 3],
1033
- 'description': [
1034
- "Can't login, password reset not working",
1035
- "Billing error, charged twice last month",
1036
- "Great service, issue resolved quickly!"
1037
- ],
1038
- 'date': ['2024-01-01', '2024-01-02', '2024-01-03']
1039
- })
1040
-
1041
- # Add inferred fields to existing DataFrame
1042
- enriched_df = df.ai.auto_extract(
1043
- purpose="Extract issue type and sentiment for support dashboard",
1044
- show_progress=True
1045
- )
1046
- # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
1047
-
1048
- # Add product specifications to inventory data
1049
- inventory = pd.DataFrame({
1050
- 'sku': ['A001', 'B002', 'C003'],
1051
- 'description': [
1052
- "Laptop 16GB RAM, 512GB SSD, Intel i7",
1053
- "Phone 128GB, 5G, dual camera",
1054
- "Tablet 10-inch, WiFi only, 64GB"
1055
- ]
1056
- })
1057
-
1058
- enriched_inventory = inventory.ai.auto_extract(
1059
- purpose="Extract technical specifications for inventory system"
1060
- )
1061
- ```
1062
-
1063
- Note:
1064
- This method is ideal for enriching existing DataFrames with additional
1065
- structured fields extracted from text columns. The schema is inferred
1066
- from the entire DataFrame content (converted to JSON format). For
1067
- production use cases with stable schemas, consider using `infer_schema()`
1068
- once and reusing the schema with `task()`.
1069
- """
1070
- # Infer schema from DataFrame rows
1071
- schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
1072
-
1073
- # Extract structured data using the inferred schema
1074
- inferred_series = self._obj.ai.task(
1075
- task=schema.task,
1076
- batch_size=batch_size,
1077
- show_progress=show_progress,
1078
- **api_kwargs,
1079
- )
1080
-
1081
- return self._obj.assign(
1082
- inferred=inferred_series,
1083
- ).ai.extract("inferred")
1084
-
1085
1087
  def similarity(self, col1: str, col2: str) -> pd.Series:
1086
1088
  """Compute cosine similarity between two columns containing embedding vectors.
1087
1089
 
@@ -1176,7 +1178,11 @@ class AsyncOpenAIVecSeriesAccessor:
1176
1178
  temperature=temperature,
1177
1179
  top_p=top_p,
1178
1180
  )
1179
- results = await client.parse(self._obj.tolist(), **api_kwargs)
1181
+
1182
+ # Forward any extra kwargs to the underlying Responses API, excluding proxy-specific ones.
1183
+ proxy_params = {"show_progress", "batch_size", "max_concurrency"}
1184
+ filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
1185
+ results = await client.parse(self._obj.tolist(), **filtered_kwargs)
1180
1186
  return pd.Series(results, index=self._obj.index, name=self._obj.name)
1181
1187
 
1182
1188
  async def responses(
@@ -1457,96 +1463,107 @@ class AsyncOpenAIVecSeriesAccessor:
1457
1463
  **api_kwargs,
1458
1464
  )
1459
1465
 
1460
- async def auto_extract(
1466
+ async def parse_with_cache(
1461
1467
  self,
1462
- purpose: str,
1468
+ instructions: str,
1469
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1470
+ response_format: ResponseFormat = None,
1463
1471
  max_examples: int = 100,
1464
- batch_size: int | None = None,
1465
- max_concurrency: int = 8,
1466
- show_progress: bool = False,
1472
+ temperature: float | None = 0.0,
1473
+ top_p: float = 1.0,
1467
1474
  **api_kwargs,
1468
- ) -> pd.DataFrame:
1469
- """Automatically infer schema and extract structured data in one step (asynchronously).
1475
+ ) -> pd.Series:
1476
+ """Parse Series values using an LLM with a provided cache (asynchronously).
1470
1477
 
1471
- This convenience method combines schema inference and data extraction into
1472
- a single operation. It first analyzes a sample of the Series to infer an
1473
- appropriate schema based on the stated purpose, then immediately applies
1474
- that schema to extract structured data from all values in the Series.
1478
+ This method allows you to parse the Series content into structured data
1479
+ using an LLM, optionally inferring a schema based on the provided instructions.
1475
1480
 
1476
1481
  Args:
1477
- purpose (str): Plain language description of what information to extract
1478
- and how it will be used (e.g., "Extract product features for search",
1479
- "Parse customer feedback for sentiment analysis"). This guides both
1480
- schema inference and field selection.
1482
+ instructions (str): System prompt for the LLM.
1483
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
1484
+ batching and deduplication control.
1485
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
1486
+ for structured output. If None, schema is inferred.
1481
1487
  max_examples (int): Maximum number of examples to use for schema inference.
1482
- A larger sample may produce more accurate schemas but increases
1483
- inference time. Defaults to 100.
1484
- batch_size (int | None): Number of requests to process in parallel during
1485
- extraction. Defaults to None (automatic optimization). Set to a specific
1486
- value to control API usage and performance.
1487
- max_concurrency (int): Maximum number of concurrent requests during
1488
- extraction. Defaults to 8.
1489
- show_progress (bool): Whether to display a progress bar during extraction.
1490
- Useful for large datasets. Defaults to False.
1491
- **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
1492
- `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
1488
+ Defaults to 100.
1489
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
1490
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
1491
+
1492
+ Additional Keyword Args:
1493
+ Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
1494
+ `seed`, etc.) are forwarded verbatim to the underlying client.
1493
1495
 
1494
1496
  Returns:
1495
- pd.DataFrame: A DataFrame with extracted structured data. Each inferred
1496
- field becomes a column, with the same index as the original Series.
1497
- Column names and types are determined by the inferred schema.
1497
+ pandas.Series: Series with parsed structured data as instances of
1498
+ `response_format` or inferred schema model.
1498
1499
 
1499
- Example:
1500
- ```python
1501
- # Extract structured data from product reviews
1502
- reviews = pd.Series([
1503
- "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
1504
- "Decent phone. 128GB storage, camera is okay, screen is bright",
1505
- "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
1506
- ])
1500
+ Note:
1501
+ This is an asynchronous method and must be awaited.
1502
+ """
1503
+ schema: InferredSchema | None = None
1504
+ if response_format is None:
1505
+ # Use synchronous schema inference
1506
+ schema = self._obj.ai.infer_schema(purpose=instructions, max_examples=max_examples)
1507
1507
 
1508
- # One-step extraction (must be awaited)
1509
- extracted = await reviews.aio.auto_extract(
1510
- purpose="Extract product specifications and performance metrics",
1511
- max_concurrency=4,
1512
- show_progress=True
1513
- )
1514
- # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
1508
+ return await self.responses_with_cache(
1509
+ instructions=schema.inference_prompt if schema else instructions,
1510
+ cache=cache,
1511
+ response_format=response_format or schema.model,
1512
+ temperature=temperature,
1513
+ top_p=top_p,
1514
+ **api_kwargs,
1515
+ )
1515
1516
 
1516
- # Extract sentiment and issues from support tickets
1517
- tickets = pd.Series([
1518
- "Account locked, can't reset password, very frustrated",
1519
- "Billing error, charged twice for subscription",
1520
- "Great support! Issue resolved quickly"
1521
- ])
1517
+ async def parse(
1518
+ self,
1519
+ instructions: str,
1520
+ response_format: ResponseFormat = None,
1521
+ max_examples: int = 100,
1522
+ batch_size: int | None = None,
1523
+ max_concurrency: int = 8,
1524
+ show_progress: bool = False,
1525
+ temperature: float | None = 0.0,
1526
+ top_p: float = 1.0,
1527
+ **api_kwargs,
1528
+ ) -> pd.Series:
1529
+ """Parse Series values using an LLM with optional schema inference (asynchronously).
1522
1530
 
1523
- features = await tickets.aio.auto_extract(
1524
- purpose="Extract issue type and customer sentiment for support analytics",
1525
- batch_size=32
1526
- )
1527
- ```
1531
+ This method allows you to parse the Series content into structured data
1532
+ using an LLM, optionally inferring a schema based on the provided instructions.
1533
+
1534
+ Args:
1535
+ instructions (str): System prompt for the LLM.
1536
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
1537
+ for structured output. If None, schema is inferred.
1538
+ max_examples (int): Maximum number of examples to use for schema inference.
1539
+ Defaults to 100.
1540
+ batch_size (int | None): Number of requests to process in parallel.
1541
+ Defaults to None (automatic optimization).
1542
+ max_concurrency (int): Maximum number of concurrent requests. Defaults to 8.
1543
+ show_progress (bool): Whether to display a progress bar during processing.
1544
+ Defaults to False.
1545
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
1546
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
1547
+
1548
+ Returns:
1549
+ pandas.Series: Series with parsed structured data as instances of
1550
+ `response_format` or inferred schema model.
1528
1551
 
1529
1552
  Note:
1530
- This is an asynchronous method and must be awaited. This method is ideal
1531
- for exploratory data analysis when you don't have a predefined schema.
1532
- For production use cases with stable schemas, consider using the synchronous
1533
- `infer_schema()` once and reusing the schema with `task()`. The inferred
1534
- schema is not returned, so if you need to inspect or save it, use
1535
- `infer_schema()` and `task()` separately.
1553
+ This is an asynchronous method and must be awaited.
1536
1554
  """
1537
- # Use synchronous infer_schema since it's not async
1538
- schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
1539
-
1540
- inferred_series = await self._obj.aio.task(
1541
- task=schema.task,
1542
- batch_size=batch_size,
1543
- max_concurrency=max_concurrency,
1544
- show_progress=show_progress,
1555
+ return await self.parse_with_cache(
1556
+ instructions=instructions,
1557
+ cache=AsyncBatchingMapProxy(
1558
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1559
+ ),
1560
+ response_format=response_format,
1561
+ max_examples=max_examples,
1562
+ temperature=temperature,
1563
+ top_p=top_p,
1545
1564
  **api_kwargs,
1546
1565
  )
1547
1566
 
1548
- return pd.DataFrame({"inferred": inferred_series}).ai.extract("inferred")
1549
-
1550
1567
 
1551
1568
  @pd.api.extensions.register_dataframe_accessor("aio")
1552
1569
  class AsyncOpenAIVecDataFrameAccessor:
@@ -1775,6 +1792,105 @@ class AsyncOpenAIVecDataFrameAccessor:
1775
1792
  **api_kwargs,
1776
1793
  )
1777
1794
 
1795
+ async def parse_with_cache(
1796
+ self,
1797
+ instructions: str,
1798
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1799
+ response_format: ResponseFormat = None,
1800
+ max_examples: int = 100,
1801
+ temperature: float | None = 0.0,
1802
+ top_p: float = 1.0,
1803
+ **api_kwargs,
1804
+ ) -> pd.Series:
1805
+ """Parse DataFrame rows using an LLM with a provided cache (asynchronously).
1806
+
1807
+ This method allows you to parse each DataFrame row (serialized as JSON)
1808
+ into structured data using an LLM, optionally inferring a schema based
1809
+ on the provided instructions.
1810
+
1811
+ Args:
1812
+ instructions (str): System prompt for the LLM.
1813
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
1814
+ batching and deduplication control.
1815
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
1816
+ for structured output. If None, schema is inferred.
1817
+ max_examples (int): Maximum number of examples to use for schema inference.
1818
+ Defaults to 100.
1819
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
1820
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
1821
+
1822
+ Additional Keyword Args:
1823
+ Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
1824
+ `seed`, etc.) are forwarded verbatim to the underlying client.
1825
+
1826
+ Returns:
1827
+ pandas.Series: Series with parsed structured data as instances of
1828
+ `response_format` or inferred schema model.
1829
+
1830
+ Note:
1831
+ This is an asynchronous method and must be awaited.
1832
+ """
1833
+ return await _df_rows_to_json_series(self._obj).aio.parse_with_cache(
1834
+ instructions=instructions,
1835
+ cache=cache,
1836
+ response_format=response_format,
1837
+ max_examples=max_examples,
1838
+ temperature=temperature,
1839
+ top_p=top_p,
1840
+ **api_kwargs,
1841
+ )
1842
+
1843
+ async def parse(
1844
+ self,
1845
+ instructions: str,
1846
+ response_format: ResponseFormat = None,
1847
+ max_examples: int = 100,
1848
+ batch_size: int | None = None,
1849
+ max_concurrency: int = 8,
1850
+ show_progress: bool = False,
1851
+ temperature: float | None = 0.0,
1852
+ top_p: float = 1.0,
1853
+ **api_kwargs,
1854
+ ) -> pd.Series:
1855
+ """Parse DataFrame rows using an LLM with optional schema inference (asynchronously).
1856
+
1857
+ This method allows you to parse each DataFrame row (serialized as JSON)
1858
+ into structured data using an LLM, optionally inferring a schema based
1859
+ on the provided instructions.
1860
+
1861
+ Args:
1862
+ instructions (str): System prompt for the LLM.
1863
+ response_format (type[BaseModel] | None): Pydantic model or built-in type
1864
+ for structured output. If None, schema is inferred.
1865
+ max_examples (int): Maximum number of examples to use for schema inference.
1866
+ Defaults to 100.
1867
+ batch_size (int | None): Number of requests to process in parallel.
1868
+ Defaults to None (automatic optimization).
1869
+ max_concurrency (int): Maximum number of concurrent requests. Defaults to 8.
1870
+ show_progress (bool): Whether to display a progress bar during processing.
1871
+ Defaults to False.
1872
+ temperature (float | None): Sampling temperature. Defaults to 0.0.
1873
+ top_p (float): Nucleus sampling parameter. Defaults to 1.0.
1874
+
1875
+ Returns:
1876
+ pandas.Series: Series with parsed structured data as instances of
1877
+ `response_format` or inferred schema model.
1878
+
1879
+ Note:
1880
+ This is an asynchronous method and must be awaited.
1881
+ """
1882
+ return await self.parse_with_cache(
1883
+ instructions=instructions,
1884
+ cache=AsyncBatchingMapProxy(
1885
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1886
+ ),
1887
+ response_format=response_format,
1888
+ max_examples=max_examples,
1889
+ temperature=temperature,
1890
+ top_p=top_p,
1891
+ **api_kwargs,
1892
+ )
1893
+
1778
1894
  async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
1779
1895
  """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
1780
1896
 
@@ -1954,103 +2070,3 @@ class AsyncOpenAIVecDataFrameAccessor:
1954
2070
  df.at[actual_index, target_column_name] = result.output
1955
2071
 
1956
2072
  return df
1957
-
1958
- async def auto_extract(
1959
- self,
1960
- purpose: str,
1961
- max_examples: int = 100,
1962
- batch_size: int | None = None,
1963
- max_concurrency: int = 8,
1964
- show_progress: bool = False,
1965
- **api_kwargs,
1966
- ) -> pd.DataFrame:
1967
- """Automatically infer schema and add extracted fields to the DataFrame (asynchronously).
1968
-
1969
- This convenience method combines schema inference and data extraction to
1970
- automatically add new columns to the existing DataFrame. It analyzes a
1971
- sample of the DataFrame rows to infer an appropriate schema based on the
1972
- stated purpose, then extracts structured data and joins it with the
1973
- original DataFrame.
1974
-
1975
- Args:
1976
- purpose (str): Plain language description of what information to extract
1977
- and how it will be used (e.g., "Extract customer sentiment metrics",
1978
- "Parse product attributes for analytics"). This guides both schema
1979
- inference and field selection.
1980
- max_examples (int): Maximum number of rows to use for schema inference.
1981
- A larger sample may produce more accurate schemas but increases
1982
- inference time. Defaults to 100.
1983
- batch_size (int | None): Number of requests to process in parallel during
1984
- extraction. Defaults to None (automatic optimization). Set to a specific
1985
- value to control API usage and performance.
1986
- max_concurrency (int): Maximum number of concurrent requests during
1987
- extraction. Defaults to 8.
1988
- show_progress (bool): Whether to display a progress bar during extraction.
1989
- Useful for large datasets. Defaults to False.
1990
- **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
1991
- `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
1992
-
1993
- Returns:
1994
- pd.DataFrame: The original DataFrame with new columns added from the
1995
- inferred structured data. Each inferred field becomes a new column.
1996
- The original columns and index are preserved.
1997
-
1998
- Example:
1999
- ```python
2000
- # Add sentiment and issue type to support tickets
2001
- df = pd.DataFrame({
2002
- 'ticket_id': [1, 2, 3],
2003
- 'description': [
2004
- "Can't login, password reset not working",
2005
- "Billing error, charged twice last month",
2006
- "Great service, issue resolved quickly!"
2007
- ],
2008
- 'date': ['2024-01-01', '2024-01-02', '2024-01-03']
2009
- })
2010
-
2011
- # Add inferred fields to existing DataFrame (must be awaited)
2012
- enriched_df = await df.aio.auto_extract(
2013
- purpose="Extract issue type and sentiment for support dashboard",
2014
- max_concurrency=4,
2015
- show_progress=True
2016
- )
2017
- # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
2018
-
2019
- # Add product specifications to inventory data
2020
- inventory = pd.DataFrame({
2021
- 'sku': ['A001', 'B002', 'C003'],
2022
- 'description': [
2023
- "Laptop 16GB RAM, 512GB SSD, Intel i7",
2024
- "Phone 128GB, 5G, dual camera",
2025
- "Tablet 10-inch, WiFi only, 64GB"
2026
- ]
2027
- })
2028
-
2029
- enriched_inventory = await inventory.aio.auto_extract(
2030
- purpose="Extract technical specifications for inventory system",
2031
- batch_size=32
2032
- )
2033
- ```
2034
-
2035
- Note:
2036
- This is an asynchronous method and must be awaited. This method is ideal
2037
- for enriching existing DataFrames with additional structured fields
2038
- extracted from text columns. The schema is inferred synchronously from
2039
- the DataFrame content. For production use cases with stable schemas,
2040
- consider using `infer_schema()` once and reusing the schema with `task()`.
2041
- """
2042
- # Infer schema from DataFrame rows (synchronous)
2043
- schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
2044
-
2045
- # Extract structured data using the inferred schema (asynchronous)
2046
- inferred_series = await self._obj.aio.task(
2047
- task=schema.task,
2048
- batch_size=batch_size,
2049
- max_concurrency=max_concurrency,
2050
- show_progress=show_progress,
2051
- **api_kwargs,
2052
- )
2053
-
2054
- return self._obj.assign(
2055
- inferred=inferred_series,
2056
- ).ai.extract("inferred")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openaivec
3
- Version: 0.14.9
3
+ Version: 0.14.10
4
4
  Summary: Generative mutation for tabular calculation
5
5
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
6
6
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -12,7 +12,7 @@ openaivec/_responses.py,sha256=lVJRa_Uc7hQJnYJRgumqwBbu6GToZqsLFS6tIAFO1Fc,24014
12
12
  openaivec/_schema.py,sha256=RKjDPqet1TlReYibah0R0NIvCV1VWN5SZxiaBeV0gCY,15492
13
13
  openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
14
14
  openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
15
- openaivec/pandas_ext.py,sha256=rCkh8g9eqHn0gUG8j_-jdppQt_Yq_1Wg6FmsCEcpv3k,85985
15
+ openaivec/pandas_ext.py,sha256=_MdiZWokius62zI_sTp_nd-33fMNlnRHbyqso0eF_Hw,85406
16
16
  openaivec/spark.py,sha256=Dbuhlk8Z89Fwk3fbWp1Ud9uTpfNyfjZOIx8ARJMnQf0,25371
17
17
  openaivec/task/__init__.py,sha256=lrgoc9UIox7XnxZ96dQRl88a-8QfuZRFBHshxctpMB8,6178
18
18
  openaivec/task/customer_support/__init__.py,sha256=KWfGyXPdZyfGdRH17x7hPpJJ1N2EP9PPhZx0fvBAwSI,884
@@ -31,7 +31,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=Np-yY0d4Kr5WEjGjq4tNFHDNarBLajJr
31
31
  openaivec/task/nlp/translation.py,sha256=VYgiXtr2TL1tbqZkBpyVAy4ahrgd8UO4ZjhIL6xMdkI,6609
32
32
  openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
33
33
  openaivec/task/table/fillna.py,sha256=g_CpLnLzK1C5rCiVq15L3X0kywJK6CtSrKRYxQFuhn8,6606
34
- openaivec-0.14.9.dist-info/METADATA,sha256=C7UqwVFLIVYiMJdRdUMTuUbhamacXoM2EHfS1nIxROQ,27566
35
- openaivec-0.14.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
- openaivec-0.14.9.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
37
- openaivec-0.14.9.dist-info/RECORD,,
34
+ openaivec-0.14.10.dist-info/METADATA,sha256=BXQWevriu4qabbZM1paMO1PV_i8zmFPqiodTMwzeJnQ,27567
35
+ openaivec-0.14.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
+ openaivec-0.14.10.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
37
+ openaivec-0.14.10.dist-info/RECORD,,