openaivec 0.14.8__py3-none-any.whl → 0.14.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_dynamic.py +350 -0
- openaivec/_schema.py +101 -278
- openaivec/pandas_ext.py +370 -354
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/METADATA +1 -1
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/RECORD +7 -6
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/WHEEL +0 -0
- {openaivec-0.14.8.dist-info → openaivec-0.14.10.dist-info}/licenses/LICENSE +0 -0
openaivec/pandas_ext.py
CHANGED
```diff
@@ -216,8 +216,12 @@ class OpenAIVecSeriesAccessor:
             top_p=top_p,
         )
 
-        # Forward any extra kwargs to the underlying Responses API.
-        return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)
+        # Forward any extra kwargs to the underlying Responses API, excluding proxy-specific ones.
+        proxy_params = {"show_progress", "batch_size"}
+        filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
+        return pd.Series(
+            client.parse(self._obj.tolist(), **filtered_kwargs), index=self._obj.index, name=self._obj.name
+        )
 
     def responses(
         self,
```
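The filtering above closes a small but real leak: the accessor methods accept proxy-only options such as `show_progress` and `batch_size` in the same `**api_kwargs` bag as Responses API parameters, and before this change those options were passed straight through to `client.parse`. A minimal, self-contained sketch of the pattern (the names here are illustrative, not the library's):

```python
# Proxy-only options that configure batching/progress, never the OpenAI API.
PROXY_PARAMS = {"show_progress", "batch_size"}


def filter_api_kwargs(api_kwargs: dict) -> dict:
    """Drop proxy-specific options before forwarding kwargs to the API."""
    return {k: v for k, v in api_kwargs.items() if k not in PROXY_PARAMS}


# Proxy options are stripped; genuine API parameters like `seed` pass through.
assert filter_api_kwargs({"seed": 42, "show_progress": True}) == {"seed": 42}
```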
```diff
@@ -437,7 +441,94 @@ class OpenAIVecSeriesAccessor:
             **api_kwargs,
         )
 
-    def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with a provided cache.
+        This method allows you to parse the Series content into structured data
+        using an LLM, optionally inferring a schema based on the provided purpose.
+        Args:
+            instructions (str): System prompt for the LLM.
+            cache (BatchingMapProxy[str, BaseModel]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
+            `seed`, etc.) are forwarded verbatim to the underlying client.
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
+        """
+
+        schema: InferredSchema | None = None
+        if response_format is None:
+            schema = self.infer_schema(purpose=instructions, max_examples=max_examples, **api_kwargs)
+
+        return self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with optional schema inference.
+
+        This method allows you to parse the Series content into structured data
+        using an LLM, optionally inferring a schema based on the provided purpose.
+
+        Args:
+            instructions (str): System prompt for the LLM.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel.
+                Defaults to None (automatic optimization).
+            show_progress (bool): Whether to display a progress bar during processing.
+                Defaults to False.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, purpose: str, max_examples: int = 100, **api_kwargs) -> InferredSchema:
         """Infer a structured data schema from Series content using AI.
 
         This method analyzes a sample of the Series values to automatically infer
```
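For orientation, here is a hedged usage sketch of the new `Series.ai.parse` described above. The `Review` model and sample data are hypothetical, and a configured OpenAI API key is assumed; importing `openaivec.pandas_ext` is what registers the `.ai` accessor.

```python
import pandas as pd
from pydantic import BaseModel

import openaivec.pandas_ext  # noqa: F401 -- registers the .ai accessor


class Review(BaseModel):  # hypothetical response format
    product: str
    sentiment: str


reviews = pd.Series(["Great laptop, very fast", "Phone battery dies quickly"])

# Explicit schema: every element is parsed into a Review instance.
parsed = reviews.ai.parse(
    instructions="Extract the product and sentiment of each review.",
    response_format=Review,
    show_progress=True,  # consumed by the batching proxy, not sent to the API
)

# Without response_format, parse first infers a schema from the data.
inferred = reviews.ai.parse(instructions="Extract review attributes for analytics")
```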
```diff
@@ -488,7 +579,7 @@ class OpenAIVecSeriesAccessor:
         inferer = CONTAINER.resolve(SchemaInferer)
 
         input: SchemaInferenceInput = SchemaInferenceInput(
-            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose, **api_kwargs
         )
         return inferer.infer_schema(input)
 
```
```diff
@@ -538,90 +629,6 @@ class OpenAIVecSeriesAccessor:
         extracted.columns = [f"{self._obj.name}_{col}" for col in extracted.columns]
         return extracted
 
-    def auto_extract(
-        self,
-        purpose: str,
-        max_examples: int = 100,
-        batch_size: int | None = None,
-        show_progress: bool = False,
-        **api_kwargs,
-    ) -> pd.DataFrame:
-        """Automatically infer schema and extract structured data in one step.
-
-        This convenience method combines schema inference and data extraction into
-        a single operation. It first analyzes a sample of the Series to infer an
-        appropriate schema based on the stated purpose, then immediately applies
-        that schema to extract structured data from all values in the Series.
-
-        Args:
-            purpose (str): Plain language description of what information to extract
-                and how it will be used (e.g., "Extract product features for search",
-                "Parse customer feedback for sentiment analysis"). This guides both
-                schema inference and field selection.
-            max_examples (int): Maximum number of examples to use for schema inference.
-                A larger sample may produce more accurate schemas but increases
-                inference time. Defaults to 100.
-            batch_size (int | None): Number of requests to process in parallel during
-                extraction. Defaults to None (automatic optimization). Set to a specific
-                value to control API usage and performance.
-            show_progress (bool): Whether to display a progress bar during extraction.
-                Useful for large datasets. Defaults to False.
-            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
-                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
-
-        Returns:
-            pd.DataFrame: A DataFrame with extracted structured data. Each inferred
-                field becomes a column, with the same index as the original Series.
-                Column names and types are determined by the inferred schema.
-
-        Example:
-            ```python
-            # Extract structured data from product reviews
-            reviews = pd.Series([
-                "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
-                "Decent phone. 128GB storage, camera is okay, screen is bright",
-                "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
-            ])
-
-            # One-step extraction
-            extracted = reviews.ai.auto_extract(
-                purpose="Extract product specifications and performance metrics",
-                show_progress=True
-            )
-            # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
-
-            # Extract sentiment and issues from support tickets
-            tickets = pd.Series([
-                "Account locked, can't reset password, very frustrated",
-                "Billing error, charged twice for subscription",
-                "Great support! Issue resolved quickly"
-            ])
-
-            features = tickets.ai.auto_extract(
-                purpose="Extract issue type and customer sentiment for support analytics"
-            )
-            ```
-
-        Note:
-            This method is ideal for exploratory data analysis when you don't have
-            a predefined schema. For production use cases with stable schemas,
-            consider using `infer_schema()` once and reusing the schema with `task()`.
-            The inferred schema is not returned, so if you need to inspect or save it,
-            use `infer_schema()` and `task()` separately.
-        """
-        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
-
-        return pd.DataFrame(
-            {
-                "inferred": self._obj.ai.task(
-                    task=schema.task,
-                    batch_size=batch_size,
-                    show_progress=show_progress,
-                    **api_kwargs,
-                ),
-            }
-        ).ai.extract("inferred")
-
 
 @pd.api.extensions.register_dataframe_accessor("ai")
 class OpenAIVecDataFrameAccessor:
```
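`auto_extract` is gone as of 0.14.10. Its own Note already pointed at the stable replacement, and the removed body shows the exact recipe, so migration is mechanical. A hedged sketch, reusing the removed example's data:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- registers the .ai accessor

reviews = pd.Series(["Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours"])

# Equivalent of the removed reviews.ai.auto_extract(purpose=...): infer the
# schema once (now inspectable and reusable), then run the task and expand it.
schema = reviews.ai.infer_schema(purpose="Extract product specifications")
extracted = pd.DataFrame(
    {"inferred": reviews.ai.task(task=schema.task, show_progress=True)}
).ai.extract("inferred")

# Alternatively, the new one-step API: parse() infers a schema when
# response_format is omitted and returns a Series of model instances.
parsed = reviews.ai.parse(instructions="Extract product specifications")
```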
```diff
@@ -822,6 +829,95 @@ class OpenAIVecDataFrameAccessor:
             **api_kwargs,
         )
 
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows using an LLM with a provided cache.
+
+        This method allows you to parse each DataFrame row (serialized as JSON)
+        into structured data using an LLM, optionally inferring a schema based
+        on the provided purpose.
+
+        Args:
+            instructions (str): System prompt for the LLM.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
+            `seed`, etc.) are forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
+        """
+        return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows using an LLM with optional schema inference.
+
+        This method allows you to parse each DataFrame row (serialized as JSON)
+        into structured data using an LLM, optionally inferring a schema based
+        on the provided purpose.
+
+        Args:
+            instructions (str): System prompt for the LLM.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel.
+                Defaults to None (automatic optimization).
+            show_progress (bool): Whether to display a progress bar during processing.
+                Defaults to False.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
     def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
         """Infer a structured data schema from DataFrame rows using AI.
 
```
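The DataFrame variant mirrors the Series API but serializes each row to JSON (via the internal `_df_rows_to_json_series` helper) before parsing, so the model sees all columns of a row at once. A hedged sketch with hypothetical data and model:

```python
import pandas as pd
from pydantic import BaseModel

import openaivec.pandas_ext  # noqa: F401 -- registers the .ai accessor


class TicketLabel(BaseModel):  # hypothetical response format
    issue_type: str
    sentiment: str


tickets = pd.DataFrame(
    {
        "ticket_id": [1, 2],
        "description": ["Can't login, password reset not working", "Charged twice last month"],
    }
)

# Each row (ticket_id + description, serialized as JSON) is parsed as one unit.
labels = tickets.ai.parse(
    instructions="Classify each support ticket's issue type and sentiment.",
    response_format=TicketLabel,
    batch_size=32,
)
```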
```diff
@@ -988,100 +1084,6 @@ class OpenAIVecDataFrameAccessor:
 
         return df
 
-    def auto_extract(
-        self,
-        purpose: str,
-        max_examples: int = 100,
-        batch_size: int | None = None,
-        show_progress: bool = False,
-        **api_kwargs,
-    ) -> pd.DataFrame:
-        """Automatically infer schema and add extracted fields to the DataFrame.
-
-        This convenience method combines schema inference and data extraction to
-        automatically add new columns to the existing DataFrame. It analyzes a
-        sample of the DataFrame rows to infer an appropriate schema based on the
-        stated purpose, then extracts structured data and joins it with the
-        original DataFrame.
-
-        Args:
-            purpose (str): Plain language description of what information to extract
-                and how it will be used (e.g., "Extract customer sentiment metrics",
-                "Parse product attributes for analytics"). This guides both schema
-                inference and field selection.
-            max_examples (int): Maximum number of rows to use for schema inference.
-                A larger sample may produce more accurate schemas but increases
-                inference time. Defaults to 100.
-            batch_size (int | None): Number of requests to process in parallel during
-                extraction. Defaults to None (automatic optimization). Set to a specific
-                value to control API usage and performance.
-            show_progress (bool): Whether to display a progress bar during extraction.
-                Useful for large datasets. Defaults to False.
-            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
-                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
-
-        Returns:
-            pd.DataFrame: The original DataFrame with new columns added from the
-                inferred structured data. Each inferred field becomes a new column.
-                The original columns and index are preserved.
-
-        Example:
-            ```python
-            # Add sentiment and issue type to support tickets
-            df = pd.DataFrame({
-                'ticket_id': [1, 2, 3],
-                'description': [
-                    "Can't login, password reset not working",
-                    "Billing error, charged twice last month",
-                    "Great service, issue resolved quickly!"
-                ],
-                'date': ['2024-01-01', '2024-01-02', '2024-01-03']
-            })
-
-            # Add inferred fields to existing DataFrame
-            enriched_df = df.ai.auto_extract(
-                purpose="Extract issue type and sentiment for support dashboard",
-                show_progress=True
-            )
-            # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
-
-            # Add product specifications to inventory data
-            inventory = pd.DataFrame({
-                'sku': ['A001', 'B002', 'C003'],
-                'description': [
-                    "Laptop 16GB RAM, 512GB SSD, Intel i7",
-                    "Phone 128GB, 5G, dual camera",
-                    "Tablet 10-inch, WiFi only, 64GB"
-                ]
-            })
-
-            enriched_inventory = inventory.ai.auto_extract(
-                purpose="Extract technical specifications for inventory system"
-            )
-            ```
-
-        Note:
-            This method is ideal for enriching existing DataFrames with additional
-            structured fields extracted from text columns. The schema is inferred
-            from the entire DataFrame content (converted to JSON format). For
-            production use cases with stable schemas, consider using `infer_schema()`
-            once and reusing the schema with `task()`.
-        """
-        # Infer schema from DataFrame rows
-        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
-
-        # Extract structured data using the inferred schema
-        inferred_series = self._obj.ai.task(
-            task=schema.task,
-            batch_size=batch_size,
-            show_progress=show_progress,
-            **api_kwargs,
-        )
-
-        return self._obj.assign(
-            inferred=inferred_series,
-        ).ai.extract("inferred")
-
     def similarity(self, col1: str, col2: str) -> pd.Series:
         """Compute cosine similarity between two columns containing embedding vectors.
 
```
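As with the Series accessor, the DataFrame `auto_extract` is removed; its body above chained two calls that remain available, so the same enrichment can be spelled out directly. A hedged equivalent:

```python
import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- registers the .ai accessor

df = pd.DataFrame({"description": ["Laptop 16GB RAM, 512GB SSD, Intel i7"]})

# What the removed df.ai.auto_extract(purpose=...) did internally, now explicit:
schema = df.ai.infer_schema(purpose="Extract technical specifications")
enriched = df.assign(
    inferred=df.ai.task(task=schema.task, show_progress=True)
).ai.extract("inferred")
```

Keeping the `schema` object around is the upside: it can be inspected, persisted, and reused across runs, which `auto_extract` never exposed.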
```diff
@@ -1176,7 +1178,11 @@ class AsyncOpenAIVecSeriesAccessor:
             temperature=temperature,
             top_p=top_p,
         )
-        results = await client.parse(self._obj.tolist(), **api_kwargs)
+
+        # Forward any extra kwargs to the underlying Responses API, excluding proxy-specific ones.
+        proxy_params = {"show_progress", "batch_size", "max_concurrency"}
+        filtered_kwargs = {k: v for k, v in api_kwargs.items() if k not in proxy_params}
+        results = await client.parse(self._obj.tolist(), **filtered_kwargs)
         return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
     async def responses(
```
```diff
@@ -1457,96 +1463,107 @@ class AsyncOpenAIVecSeriesAccessor:
             **api_kwargs,
         )
 
-    async def auto_extract(
+    async def parse_with_cache(
         self,
-        purpose: str,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: ResponseFormat = None,
         max_examples: int = 100,
-        batch_size: int | None = None,
-        max_concurrency: int = 8,
-        show_progress: bool = False,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
         **api_kwargs,
-    ) -> pd.DataFrame:
-        """Automatically infer schema and extract structured data in one step (asynchronously).
+    ) -> pd.Series:
+        """Parse Series values using an LLM with a provided cache (asynchronously).
 
-        This convenience method combines schema inference and data extraction into
-        a single operation. It first analyzes a sample of the Series to infer an
-        appropriate schema based on the stated purpose, then immediately applies
-        that schema to extract structured data from all values in the Series.
+        This method allows you to parse the Series content into structured data
+        using an LLM, optionally inferring a schema based on the provided purpose.
 
         Args:
-            purpose (str): Plain language description of what information to extract
-                and how it will be used (e.g., "Extract product features for search",
-                "Parse customer feedback for sentiment analysis"). This guides both
-                schema inference and field selection.
+            instructions (str): System prompt for the LLM.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
             max_examples (int): Maximum number of examples to use for schema inference.
-                A larger sample may produce more accurate schemas but increases
-                inference time. Defaults to 100.
-            batch_size (int | None): Number of requests to process in parallel during
-                extraction. Defaults to None (automatic optimization). Set to a specific
-                value to control API usage and performance.
-            max_concurrency (int): Maximum number of concurrent requests during
-                extraction. Defaults to 8.
-            show_progress (bool): Whether to display a progress bar during extraction.
-                Useful for large datasets. Defaults to False.
-            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
-                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+                Defaults to 100.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
+            `seed`, etc.) are forwarded verbatim to the underlying client.
 
         Returns:
-            pd.DataFrame: A DataFrame with extracted structured data. Each inferred
-                field becomes a column, with the same index as the original Series.
-                Column names and types are determined by the inferred schema.
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
 
-        Example:
-            ```python
-            # Extract structured data from product reviews
-            reviews = pd.Series([
-                "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
-                "Decent phone. 128GB storage, camera is okay, screen is bright",
-                "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
-            ])
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        schema: InferredSchema | None = None
+        if response_format is None:
+            # Use synchronous schema inference
+            schema = self._obj.ai.infer_schema(purpose=instructions, max_examples=max_examples)
 
-            # One-step extraction
-            extracted = await reviews.aio.auto_extract(
-                purpose="Extract product specifications and performance metrics",
-                max_concurrency=4,
-                show_progress=True
-            )
-            # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
+        return await self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
 
-            # Extract sentiment and issues from support tickets
-            tickets = pd.Series([
-                "Account locked, can't reset password, very frustrated",
-                "Billing error, charged twice for subscription",
-                "Great support! Issue resolved quickly"
-            ])
+    async def parse(
+        self,
+        instructions: str,
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with optional schema inference (asynchronously).
 
-            features = await tickets.aio.auto_extract(
-                purpose="Extract issue type and customer sentiment for support analytics",
-                batch_size=32
-            )
-            ```
+        This method allows you to parse the Series content into structured data
+        using an LLM, optionally inferring a schema based on the provided purpose.
+
+        Args:
+            instructions (str): System prompt for the LLM.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel.
+                Defaults to None (automatic optimization).
+            max_concurrency (int): Maximum number of concurrent requests. Defaults to 8.
+            show_progress (bool): Whether to display a progress bar during processing.
+                Defaults to False.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
 
         Note:
-            This is an asynchronous method and must be awaited. This method is ideal
-            for exploratory data analysis when you don't have a predefined schema.
-            For production use cases with stable schemas, consider using the synchronous
-            `infer_schema()` once and reusing the schema with `task()`. The inferred
-            schema is not returned, so if you need to inspect or save it, use
-            `infer_schema()` and `task()` separately.
+            This is an asynchronous method and must be awaited.
         """
-        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
-
-        # Extract structured data using the inferred schema
-        inferred_series = await self._obj.aio.task(
-            task=schema.task,
-            batch_size=batch_size,
-            max_concurrency=max_concurrency,
-            show_progress=show_progress,
+        return await self.parse_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            max_examples=max_examples,
+            temperature=temperature,
+            top_p=top_p,
             **api_kwargs,
         )
 
-        return pd.DataFrame({"inferred": inferred_series}).ai.extract("inferred")
-
 
 @pd.api.extensions.register_dataframe_accessor("aio")
 class AsyncOpenAIVecDataFrameAccessor:
```
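A hedged sketch of the async variant follows. Batched requests run with bounded concurrency; note from the hunk above that schema inference itself still happens synchronously when `response_format` is omitted. Data is hypothetical.

```python
import asyncio

import pandas as pd

import openaivec.pandas_ext  # noqa: F401 -- registers the .aio accessor


async def main() -> pd.Series:
    reviews = pd.Series(["Great laptop, very fast", "Phone battery dies quickly"])
    # batch_size, max_concurrency and show_progress configure the async proxy
    # and are filtered out before kwargs reach the Responses API.
    return await reviews.aio.parse(
        instructions="Extract review attributes for analytics",
        max_concurrency=4,
        show_progress=True,
    )


parsed = asyncio.run(main())
```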
```diff
@@ -1775,6 +1792,105 @@ class AsyncOpenAIVecDataFrameAccessor:
             **api_kwargs,
         )
 
+    async def parse_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows using an LLM with a provided cache (asynchronously).
+
+        This method allows you to parse each DataFrame row (serialized as JSON)
+        into structured data using an LLM, optionally inferring a schema based
+        on the provided purpose.
+
+        Args:
+            instructions (str): System prompt for the LLM.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. `frequency_penalty`, `presence_penalty`,
+            `seed`, etc.) are forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
+    async def parse(
+        self,
+        instructions: str,
+        response_format: ResponseFormat = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows using an LLM with optional schema inference (asynchronously).
+
+        This method allows you to parse each DataFrame row (serialized as JSON)
+        into structured data using an LLM, optionally inferring a schema based
+        on the provided purpose.
+
+        Args:
+            instructions (str): System prompt for the LLM.
+            response_format (type[BaseModel] | None): Pydantic model or built-in type
+                for structured output. If None, schema is inferred.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel.
+                Defaults to None (automatic optimization).
+            max_concurrency (int): Maximum number of concurrent requests. Defaults to 8.
+            show_progress (bool): Whether to display a progress bar during processing.
+                Defaults to False.
+            temperature (float | None): Sampling temperature. Defaults to 0.0.
+            top_p (float): Nucleus sampling parameter. Defaults to 1.0.
+
+        Returns:
+            pandas.Series: Series with parsed structured data as instances of
+                `response_format` or inferred schema model.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await self.parse_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            max_examples=max_examples,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
     async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
         """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
 
```
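And the async DataFrame counterpart, again hedged and with hypothetical data: rows are serialized to JSON and parsed under the concurrency limits of the async proxy.

```python
import asyncio

import pandas as pd
from pydantic import BaseModel

import openaivec.pandas_ext  # noqa: F401 -- registers the .aio accessor


class Spec(BaseModel):  # hypothetical response format
    ram_gb: int | None
    storage_gb: int | None


async def main() -> pd.Series:
    inventory = pd.DataFrame(
        {
            "sku": ["A001", "B002"],
            "description": ["Laptop 16GB RAM, 512GB SSD", "Phone 128GB, 5G"],
        }
    )
    return await inventory.aio.parse(
        instructions="Extract technical specifications from each row.",
        response_format=Spec,
        batch_size=32,
        max_concurrency=4,
    )


specs = asyncio.run(main())
```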
```diff
@@ -1954,103 +2070,3 @@ class AsyncOpenAIVecDataFrameAccessor:
             df.at[actual_index, target_column_name] = result.output
 
         return df
-
-    async def auto_extract(
-        self,
-        purpose: str,
-        max_examples: int = 100,
-        batch_size: int | None = None,
-        max_concurrency: int = 8,
-        show_progress: bool = False,
-        **api_kwargs,
-    ) -> pd.DataFrame:
-        """Automatically infer schema and add extracted fields to the DataFrame (asynchronously).
-
-        This convenience method combines schema inference and data extraction to
-        automatically add new columns to the existing DataFrame. It analyzes a
-        sample of the DataFrame rows to infer an appropriate schema based on the
-        stated purpose, then extracts structured data and joins it with the
-        original DataFrame.
-
-        Args:
-            purpose (str): Plain language description of what information to extract
-                and how it will be used (e.g., "Extract customer sentiment metrics",
-                "Parse product attributes for analytics"). This guides both schema
-                inference and field selection.
-            max_examples (int): Maximum number of rows to use for schema inference.
-                A larger sample may produce more accurate schemas but increases
-                inference time. Defaults to 100.
-            batch_size (int | None): Number of requests to process in parallel during
-                extraction. Defaults to None (automatic optimization). Set to a specific
-                value to control API usage and performance.
-            max_concurrency (int): Maximum number of concurrent requests during
-                extraction. Defaults to 8.
-            show_progress (bool): Whether to display a progress bar during extraction.
-                Useful for large datasets. Defaults to False.
-            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
-                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
-
-        Returns:
-            pd.DataFrame: The original DataFrame with new columns added from the
-                inferred structured data. Each inferred field becomes a new column.
-                The original columns and index are preserved.
-
-        Example:
-            ```python
-            # Add sentiment and issue type to support tickets
-            df = pd.DataFrame({
-                'ticket_id': [1, 2, 3],
-                'description': [
-                    "Can't login, password reset not working",
-                    "Billing error, charged twice last month",
-                    "Great service, issue resolved quickly!"
-                ],
-                'date': ['2024-01-01', '2024-01-02', '2024-01-03']
-            })
-
-            # Add inferred fields to existing DataFrame (must be awaited)
-            enriched_df = await df.aio.auto_extract(
-                purpose="Extract issue type and sentiment for support dashboard",
-                max_concurrency=4,
-                show_progress=True
-            )
-            # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
-
-            # Add product specifications to inventory data
-            inventory = pd.DataFrame({
-                'sku': ['A001', 'B002', 'C003'],
-                'description': [
-                    "Laptop 16GB RAM, 512GB SSD, Intel i7",
-                    "Phone 128GB, 5G, dual camera",
-                    "Tablet 10-inch, WiFi only, 64GB"
-                ]
-            })
-
-            enriched_inventory = await inventory.aio.auto_extract(
-                purpose="Extract technical specifications for inventory system",
-                batch_size=32
-            )
-            ```
-
-        Note:
-            This is an asynchronous method and must be awaited. This method is ideal
-            for enriching existing DataFrames with additional structured fields
-            extracted from text columns. The schema is inferred synchronously from
-            the DataFrame content. For production use cases with stable schemas,
-            consider using `infer_schema()` once and reusing the schema with `task()`.
-        """
-        # Infer schema from DataFrame rows (synchronous)
-        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
-
-        # Extract structured data using the inferred schema (asynchronous)
-        inferred_series = await self._obj.aio.task(
-            task=schema.task,
-            batch_size=batch_size,
-            max_concurrency=max_concurrency,
-            show_progress=show_progress,
-            **api_kwargs,
-        )
-
-        return self._obj.assign(
-            inferred=inferred_series,
-        ).ai.extract("inferred")
```