openaivec 0.14.4.tar.gz → 0.14.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openaivec-0.14.4 → openaivec-0.14.6}/PKG-INFO +1 -1
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_provider.py +15 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/pandas_ext.py +481 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.env.example +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.github/copilot-instructions.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.github/workflows/python-mkdocs.yml +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.github/workflows/python-package.yml +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.github/workflows/python-test.yml +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.github/workflows/python-update.yml +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/.gitignore +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/CODE_OF_CONDUCT.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/LICENSE +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/README.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/SECURITY.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/SUPPORT.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/main.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/pandas_ext.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/spark.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/task.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/api/tasks/nlp/translation.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/index.md +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/docs/robots.txt +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/mkdocs.yml +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/pyproject.toml +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/__init__.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_di.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_embeddings.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_log.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_model.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_optimize.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_prompt.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_proxy.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_responses.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_schema.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_serialize.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_util.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/spark.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/__init__.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/__init__.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/__init__.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/nlp/translation.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/table/__init__.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/task/table/fillna.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/__init__.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_di.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_embeddings.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_optimize.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_pandas_ext.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_prompt.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_provider.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_proxy.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_proxy_suggester.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_responses.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_schema.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_serialize.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_serialize_pydantic_v2_compliance.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_spark.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_task.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/tests/test_util.py +0 -0
- {openaivec-0.14.4 → openaivec-0.14.6}/uv.lock +0 -0
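
The substantive changes are confined to `src/openaivec/_provider.py` (+15) and `src/openaivec/pandas_ext.py` (+481): 0.14.6 registers a `SchemaInferer` in the provider container and adds `infer_schema` / `auto_extract` methods to the pandas accessors. The sketch below assembles the usage shown in the added docstrings into one end-to-end flow; the import line and any client/model configuration are assumptions not contained in this diff, so treat it as an orientation aid rather than verified release documentation.

```python
# Minimal sketch of the API surface added in 0.14.6, assembled from the
# docstrings in the diff below. Assumes the `ai` accessor is registered by
# importing openaivec.pandas_ext and that OpenAI credentials are already configured.
import pandas as pd

from openaivec import pandas_ext  # noqa: F401  -- assumed import path

reviews = pd.Series([
    "Great product! Fast shipping and excellent quality.",
    "Terrible experience. Item broke after 2 days.",
    "Average product. Price is fair but nothing special.",
])

# Two-step flow: infer the schema once, inspect or persist it, then reuse it.
schema = reviews.ai.infer_schema(purpose="Extract sentiment and product quality indicators")
extracted = reviews.ai.task(schema.task)

# One-step convenience flow: infer and extract in a single call.
extracted = reviews.ai.auto_extract(
    purpose="Extract sentiment and product quality indicators",
    show_progress=True,
)
```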
{openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/_provider.py

@@ -13,6 +13,7 @@ from openaivec._model import (
     OpenAIAPIKey,
     ResponsesModelName,
 )
+from openaivec._schema import SchemaInferer
 from openaivec._util import TextChunker
 
 __all__ = []
@@ -142,6 +143,13 @@ CONTAINER.register(OpenAI, provide_openai_client)
 CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
 CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"))
 CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
+CONTAINER.register(
+    SchemaInferer,
+    lambda: SchemaInferer(
+        client=CONTAINER.resolve(OpenAI),
+        model_name=CONTAINER.resolve(ResponsesModelName).value,
+    ),
+)
 
 
 def reset_environment_registrations():
@@ -160,3 +168,10 @@ def reset_environment_registrations():
     )
     CONTAINER.register(OpenAI, provide_openai_client)
     CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
+    CONTAINER.register(
+        SchemaInferer,
+        lambda: SchemaInferer(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+        ),
+    )
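
The registrations above follow the container's register/resolve pattern: a zero-argument factory is stored against the `SchemaInferer` type (both at import time and again inside `reset_environment_registrations()`), and `pandas_ext.py` later calls `CONTAINER.resolve(SchemaInferer)` to obtain a configured instance. The sketch below illustrates that pattern only; it is not the actual `openaivec._di` implementation, and the stand-in client and model name are placeholders.

```python
# Illustrative register/resolve container -- a sketch of the pattern used above,
# not the real openaivec._di implementation.
from typing import Callable, Dict, Type, TypeVar

T = TypeVar("T")


class Container:
    def __init__(self) -> None:
        self._factories: Dict[type, Callable[[], object]] = {}

    def register(self, key: Type[T], factory: Callable[[], T]) -> None:
        # Store a zero-argument factory; construction is deferred until resolve().
        self._factories[key] = factory

    def resolve(self, key: Type[T]) -> T:
        return self._factories[key]()  # type: ignore[return-value]


class SchemaInferer:
    # Stand-in for openaivec._schema.SchemaInferer; constructor shape taken from the diff.
    def __init__(self, client: object, model_name: str) -> None:
        self.client = client
        self.model_name = model_name


CONTAINER = Container()
# Mirrors the added registration: the lambda defers building the inferer (and its
# dependencies) until SchemaInferer is first requested.
CONTAINER.register(SchemaInferer, lambda: SchemaInferer(client=object(), model_name="<model>"))
inferer = CONTAINER.resolve(SchemaInferer)
```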
{openaivec-0.14.4 → openaivec-0.14.6}/src/openaivec/pandas_ext.py

@@ -49,6 +49,8 @@ import pandas as pd
 import tiktoken
 from openai import AsyncOpenAI, OpenAI
 
+from openaivec._schema import InferredSchema, SchemaInferenceInput, SchemaInferer
+
 __all__ = [
     "embeddings_model",
     "responses_model",
@@ -434,6 +436,61 @@ class OpenAIVecSeriesAccessor:
             **api_kwargs,
         )
 
+    def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+        """Infer a structured data schema from Series content using AI.
+
+        This method analyzes a sample of the Series values to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        The inferred schema includes field names, types, descriptions, and
+        potential enum values based on patterns found in the data.
+
+        Args:
+            purpose (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract customer sentiment
+                signals for analytics", "Parse product features for search").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of examples to analyze from the
+                Series. The method will sample randomly from the Series up to this
+                limit. Defaults to 100.
+
+        Returns:
+            InferredSchema: An object containing:
+                - purpose: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
+
+        Example:
+            ```python
+            reviews = pd.Series([
+                "Great product! Fast shipping and excellent quality.",
+                "Terrible experience. Item broke after 2 days.",
+                "Average product. Price is fair but nothing special."
+            ])
+
+            # Infer schema for sentiment analysis
+            schema = reviews.ai.infer_schema(
+                purpose="Extract sentiment and product quality indicators"
+            )
+
+            # Use the inferred schema for batch extraction
+            extracted = reviews.ai.task(schema.task)
+            ```
+
+        Note:
+            The schema inference uses AI to analyze patterns in the data and may
+            require multiple attempts to produce a valid schema. Fields are limited
+            to primitive types (string, integer, float, boolean) with optional
+            enum values for categorical fields.
+        """
+        inferer = CONTAINER.resolve(SchemaInferer)
+
+        input: SchemaInferenceInput = SchemaInferenceInput(
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(), purpose=purpose
+        )
+        return inferer.infer_schema(input)
+
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
 
@@ -480,6 +537,90 @@ class OpenAIVecSeriesAccessor:
         extracted.columns = [f"{self._obj.name}_{col}" for col in extracted.columns]
         return extracted
 
+    def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and extract structured data in one step.
+
+        This convenience method combines schema inference and data extraction into
+        a single operation. It first analyzes a sample of the Series to infer an
+        appropriate schema based on the stated purpose, then immediately applies
+        that schema to extract structured data from all values in the Series.
+
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract product features for search",
+                "Parse customer feedback for sentiment analysis"). This guides both
+                schema inference and field selection.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: A DataFrame with extracted structured data. Each inferred
+                field becomes a column, with the same index as the original Series.
+                Column names and types are determined by the inferred schema.
+
+        Example:
+            ```python
+            # Extract structured data from product reviews
+            reviews = pd.Series([
+                "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
+                "Decent phone. 128GB storage, camera is okay, screen is bright",
+                "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
+            ])
+
+            # One-step extraction
+            extracted = reviews.ai.auto_extract(
+                purpose="Extract product specifications and performance metrics",
+                show_progress=True
+            )
+            # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
+
+            # Extract sentiment and issues from support tickets
+            tickets = pd.Series([
+                "Account locked, can't reset password, very frustrated",
+                "Billing error, charged twice for subscription",
+                "Great support! Issue resolved quickly"
+            ])
+
+            features = tickets.ai.auto_extract(
+                purpose="Extract issue type and customer sentiment for support analytics"
+            )
+            ```
+
+        Note:
+            This method is ideal for exploratory data analysis when you don't have
+            a predefined schema. For production use cases with stable schemas,
+            consider using `infer_schema()` once and reusing the schema with `task()`.
+            The inferred schema is not returned, so if you need to inspect or save it,
+            use `infer_schema()` and `task()` separately.
+        """
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        return pd.DataFrame(
+            {
+                "inferred": self._obj.ai.task(
+                    task=schema.task,
+                    batch_size=batch_size,
+                    show_progress=show_progress,
+                    **api_kwargs,
+                ),
+            }
+        ).ai.extract("inferred")
+
 
 @pd.api.extensions.register_dataframe_accessor("ai")
 class OpenAIVecDataFrameAccessor:
@@ -680,6 +821,62 @@ class OpenAIVecDataFrameAccessor:
             **api_kwargs,
         )
 
+    def infer_schema(self, purpose: str, max_examples: int = 100) -> InferredSchema:
+        """Infer a structured data schema from DataFrame rows using AI.
+
+        This method analyzes a sample of DataFrame rows to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        Each row is converted to JSON format and analyzed to identify patterns,
+        field types, and potential categorical values.
+
+        Args:
+            purpose (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract operational metrics
+                for dashboard", "Parse customer attributes for segmentation").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of rows to analyze from the
+                DataFrame. The method will sample randomly up to this limit.
+                Defaults to 100.
+
+        Returns:
+            InferredSchema: An object containing:
+                - purpose: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'text': [
+                    "Order #123: Shipped to NYC, arriving Tuesday",
+                    "Order #456: Delayed due to weather, new ETA Friday",
+                    "Order #789: Delivered to customer in LA"
+                ],
+                'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Infer schema for logistics tracking
+            schema = df.ai.infer_schema(
+                purpose="Extract shipping status and location data for logistics tracking"
+            )
+
+            # Apply the schema to extract structured data
+            extracted_df = df.ai.task(schema.task)
+            ```
+
+        Note:
+            The DataFrame rows are internally converted to JSON format before
+            analysis. The inferred schema is flat (no nested structures) and
+            uses only primitive types to ensure compatibility with pandas and
+            Spark operations.
+        """
+        return _df_rows_to_json_series(self._obj).ai.infer_schema(
+            purpose=purpose,
+            max_examples=max_examples,
+        )
+
     def extract(self, column: str) -> pd.DataFrame:
         """Flatten one column of Pydantic models/dicts into top‑level columns.
 
@@ -790,6 +987,100 @@ class OpenAIVecDataFrameAccessor:
 
         return df
 
+    def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and add extracted fields to the DataFrame.
+
+        This convenience method combines schema inference and data extraction to
+        automatically add new columns to the existing DataFrame. It analyzes a
+        sample of the DataFrame rows to infer an appropriate schema based on the
+        stated purpose, then extracts structured data and joins it with the
+        original DataFrame.
+
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract customer sentiment metrics",
+                "Parse product attributes for analytics"). This guides both schema
+                inference and field selection.
+            max_examples (int): Maximum number of rows to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: The original DataFrame with new columns added from the
+                inferred structured data. Each inferred field becomes a new column.
+                The original columns and index are preserved.
+
+        Example:
+            ```python
+            # Add sentiment and issue type to support tickets
+            df = pd.DataFrame({
+                'ticket_id': [1, 2, 3],
+                'description': [
+                    "Can't login, password reset not working",
+                    "Billing error, charged twice last month",
+                    "Great service, issue resolved quickly!"
+                ],
+                'date': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Add inferred fields to existing DataFrame
+            enriched_df = df.ai.auto_extract(
+                purpose="Extract issue type and sentiment for support dashboard",
+                show_progress=True
+            )
+            # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
+
+            # Add product specifications to inventory data
+            inventory = pd.DataFrame({
+                'sku': ['A001', 'B002', 'C003'],
+                'description': [
+                    "Laptop 16GB RAM, 512GB SSD, Intel i7",
+                    "Phone 128GB, 5G, dual camera",
+                    "Tablet 10-inch, WiFi only, 64GB"
+                ]
+            })
+
+            enriched_inventory = inventory.ai.auto_extract(
+                purpose="Extract technical specifications for inventory system"
+            )
+            ```
+
+        Note:
+            This method is ideal for enriching existing DataFrames with additional
+            structured fields extracted from text columns. The schema is inferred
+            from the entire DataFrame content (converted to JSON format). For
+            production use cases with stable schemas, consider using `infer_schema()`
+            once and reusing the schema with `task()`.
+        """
+        # Infer schema from DataFrame rows
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        # Extract structured data using the inferred schema
+        inferred_series = self._obj.ai.task(
+            task=schema.task,
+            batch_size=batch_size,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+        return self._obj.assign(
+            inferred=inferred_series,
+        ).ai.extract("inferred")
+
     def similarity(self, col1: str, col2: str) -> pd.Series:
         """Compute cosine similarity between two columns containing embedding vectors.
 
@@ -1165,6 +1456,96 @@ class AsyncOpenAIVecSeriesAccessor:
             **api_kwargs,
         )
 
+    async def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and extract structured data in one step (asynchronously).
+
+        This convenience method combines schema inference and data extraction into
+        a single operation. It first analyzes a sample of the Series to infer an
+        appropriate schema based on the stated purpose, then immediately applies
+        that schema to extract structured data from all values in the Series.
+
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract product features for search",
+                "Parse customer feedback for sentiment analysis"). This guides both
+                schema inference and field selection.
+            max_examples (int): Maximum number of examples to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            max_concurrency (int): Maximum number of concurrent requests during
+                extraction. Defaults to 8.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: A DataFrame with extracted structured data. Each inferred
+                field becomes a column, with the same index as the original Series.
+                Column names and types are determined by the inferred schema.
+
+        Example:
+            ```python
+            # Extract structured data from product reviews
+            reviews = pd.Series([
+                "Great laptop! 16GB RAM, fast SSD, battery lasts 10 hours",
+                "Decent phone. 128GB storage, camera is okay, screen is bright",
+                "Gaming desktop with RTX 4090, 32GB RAM, runs everything smoothly"
+            ])
+
+            # One-step extraction (must be awaited)
+            extracted = await reviews.aio.auto_extract(
+                purpose="Extract product specifications and performance metrics",
+                max_concurrency=4,
+                show_progress=True
+            )
+            # Result: DataFrame with columns like 'ram', 'storage', 'battery_life', etc.
+
+            # Extract sentiment and issues from support tickets
+            tickets = pd.Series([
+                "Account locked, can't reset password, very frustrated",
+                "Billing error, charged twice for subscription",
+                "Great support! Issue resolved quickly"
+            ])
+
+            features = await tickets.aio.auto_extract(
+                purpose="Extract issue type and customer sentiment for support analytics",
+                batch_size=32
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited. This method is ideal
+            for exploratory data analysis when you don't have a predefined schema.
+            For production use cases with stable schemas, consider using the synchronous
+            `infer_schema()` once and reusing the schema with `task()`. The inferred
+            schema is not returned, so if you need to inspect or save it, use
+            `infer_schema()` and `task()` separately.
+        """
+        # Use synchronous infer_schema since it's not async
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        inferred_series = await self._obj.aio.task(
+            task=schema.task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+        return pd.DataFrame({"inferred": inferred_series}).ai.extract("inferred")
+
 
 @pd.api.extensions.register_dataframe_accessor("aio")
 class AsyncOpenAIVecDataFrameAccessor:
@@ -1572,3 +1953,103 @@ class AsyncOpenAIVecDataFrameAccessor:
             df.at[actual_index, target_column_name] = result.output
 
         return df
+
+    async def auto_extract(
+        self,
+        purpose: str,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.DataFrame:
+        """Automatically infer schema and add extracted fields to the DataFrame (asynchronously).
+
+        This convenience method combines schema inference and data extraction to
+        automatically add new columns to the existing DataFrame. It analyzes a
+        sample of the DataFrame rows to infer an appropriate schema based on the
+        stated purpose, then extracts structured data and joins it with the
+        original DataFrame.
+
+        Args:
+            purpose (str): Plain language description of what information to extract
+                and how it will be used (e.g., "Extract customer sentiment metrics",
+                "Parse product attributes for analytics"). This guides both schema
+                inference and field selection.
+            max_examples (int): Maximum number of rows to use for schema inference.
+                A larger sample may produce more accurate schemas but increases
+                inference time. Defaults to 100.
+            batch_size (int | None): Number of requests to process in parallel during
+                extraction. Defaults to None (automatic optimization). Set to a specific
+                value to control API usage and performance.
+            max_concurrency (int): Maximum number of concurrent requests during
+                extraction. Defaults to 8.
+            show_progress (bool): Whether to display a progress bar during extraction.
+                Useful for large datasets. Defaults to False.
+            **api_kwargs: Additional OpenAI API parameters (e.g., `temperature`, `top_p`,
+                `frequency_penalty`, `presence_penalty`, `seed`) forwarded to the task execution.
+
+        Returns:
+            pd.DataFrame: The original DataFrame with new columns added from the
+                inferred structured data. Each inferred field becomes a new column.
+                The original columns and index are preserved.
+
+        Example:
+            ```python
+            # Add sentiment and issue type to support tickets
+            df = pd.DataFrame({
+                'ticket_id': [1, 2, 3],
+                'description': [
+                    "Can't login, password reset not working",
+                    "Billing error, charged twice last month",
+                    "Great service, issue resolved quickly!"
+                ],
+                'date': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Add inferred fields to existing DataFrame (must be awaited)
+            enriched_df = await df.aio.auto_extract(
+                purpose="Extract issue type and sentiment for support dashboard",
+                max_concurrency=4,
+                show_progress=True
+            )
+            # Result: Original df with new columns like 'issue_type', 'sentiment', etc.
+
+            # Add product specifications to inventory data
+            inventory = pd.DataFrame({
+                'sku': ['A001', 'B002', 'C003'],
+                'description': [
+                    "Laptop 16GB RAM, 512GB SSD, Intel i7",
+                    "Phone 128GB, 5G, dual camera",
+                    "Tablet 10-inch, WiFi only, 64GB"
+                ]
+            })
+
+            enriched_inventory = await inventory.aio.auto_extract(
+                purpose="Extract technical specifications for inventory system",
+                batch_size=32
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited. This method is ideal
+            for enriching existing DataFrames with additional structured fields
+            extracted from text columns. The schema is inferred synchronously from
+            the DataFrame content. For production use cases with stable schemas,
+            consider using `infer_schema()` once and reusing the schema with `task()`.
+        """
+        # Infer schema from DataFrame rows (synchronous)
+        schema = self._obj.ai.infer_schema(purpose=purpose, max_examples=max_examples)
+
+        # Extract structured data using the inferred schema (asynchronous)
+        inferred_series = await self._obj.aio.task(
+            task=schema.task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+        return self._obj.assign(
+            inferred=inferred_series,
+        ).ai.extract("inferred")
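
The async variants mirror the synchronous ones: schema inference itself stays synchronous (it calls `self._obj.ai.infer_schema`), while the extraction step runs through the `aio` task machinery bounded by `max_concurrency`. A minimal sketch of driving the new async accessor from a script is shown below; the `asyncio.run` wrapper and the import path are assumptions layered on top of the docstring examples above.

```python
# Sketch: calling the async accessor added in 0.14.6 from a plain script.
# Assumes importing openaivec.pandas_ext registers the `aio` accessor and that
# OpenAI credentials are configured in the environment.
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401  -- assumed import path


async def main() -> pd.DataFrame:
    tickets = pd.Series([
        "Account locked, can't reset password, very frustrated",
        "Billing error, charged twice for subscription",
        "Great support! Issue resolved quickly",
    ])
    # Schema inference runs synchronously inside auto_extract; only the per-row
    # extraction is awaited, with concurrency capped by max_concurrency.
    return await tickets.aio.auto_extract(
        purpose="Extract issue type and customer sentiment for support analytics",
        max_concurrency=4,
        show_progress=True,
    )


if __name__ == "__main__":
    result = asyncio.run(main())
    print(result.head())
```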