openaivec 0.10.0__py3-none-any.whl → 1.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/__init__.py +13 -4
- openaivec/_cache/__init__.py +12 -0
- openaivec/_cache/optimize.py +109 -0
- openaivec/_cache/proxy.py +806 -0
- openaivec/_di.py +326 -0
- openaivec/_embeddings.py +203 -0
- openaivec/{log.py → _log.py} +2 -2
- openaivec/_model.py +113 -0
- openaivec/{prompt.py → _prompt.py} +95 -28
- openaivec/_provider.py +207 -0
- openaivec/_responses.py +511 -0
- openaivec/_schema/__init__.py +9 -0
- openaivec/_schema/infer.py +340 -0
- openaivec/_schema/spec.py +350 -0
- openaivec/_serialize.py +234 -0
- openaivec/{util.py → _util.py} +25 -85
- openaivec/pandas_ext.py +1635 -425
- openaivec/spark.py +604 -335
- openaivec/task/__init__.py +27 -29
- openaivec/task/customer_support/__init__.py +9 -15
- openaivec/task/customer_support/customer_sentiment.py +51 -41
- openaivec/task/customer_support/inquiry_classification.py +86 -61
- openaivec/task/customer_support/inquiry_summary.py +44 -45
- openaivec/task/customer_support/intent_analysis.py +56 -41
- openaivec/task/customer_support/response_suggestion.py +49 -43
- openaivec/task/customer_support/urgency_analysis.py +76 -71
- openaivec/task/nlp/__init__.py +4 -4
- openaivec/task/nlp/dependency_parsing.py +19 -20
- openaivec/task/nlp/keyword_extraction.py +22 -24
- openaivec/task/nlp/morphological_analysis.py +25 -25
- openaivec/task/nlp/named_entity_recognition.py +26 -28
- openaivec/task/nlp/sentiment_analysis.py +29 -21
- openaivec/task/nlp/translation.py +24 -30
- openaivec/task/table/__init__.py +3 -0
- openaivec/task/table/fillna.py +183 -0
- openaivec-1.0.10.dist-info/METADATA +399 -0
- openaivec-1.0.10.dist-info/RECORD +39 -0
- {openaivec-0.10.0.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
- openaivec/embeddings.py +0 -172
- openaivec/responses.py +0 -392
- openaivec/serialize.py +0 -225
- openaivec/task/model.py +0 -84
- openaivec-0.10.0.dist-info/METADATA +0 -546
- openaivec-0.10.0.dist-info/RECORD +0 -29
- {openaivec-0.10.0.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
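The headline change visible in the `pandas_ext.py` diff below is the move from module-level client globals to a dependency-injection container, with renamed setup helpers. A minimal migration sketch, inferred from the removed and added lines in this diff (the exact name of the removed synchronous registration helper is truncated in the extracted hunks, so the "0.10.0 style" lines are approximate):

```python
# Migration sketch inferred from this diff; the commented-out 0.10.0 calls
# are approximate because the old helper names are partially truncated below.
from openai import OpenAI
from openaivec import pandas_ext

client = OpenAI(api_key="your-api-key")

# 0.10.0 style (removed): module-level globals, e.g. use_async(), use_azure_openai(),
# and a responses_model()/embeddings_model() pair of setters.
# pandas_ext.use_async(async_client)
# pandas_ext.responses_model("gpt-4o-mini")

# 1.0.10 style: clients and model names are registered in a DI container.
pandas_ext.set_client(client)
pandas_ext.set_responses_model("gpt-4.1-mini")
pandas_ext.set_embeddings_model("text-embedding-3-small")
print(pandas_ext.get_responses_model())  # "gpt-4.1-mini"
```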
openaivec/pandas_ext.py
CHANGED
@@ -2,29 +2,40 @@
 
 ## Setup
 ```python
-from openai import OpenAI
+from openai import OpenAI, AzureOpenAI, AsyncOpenAI, AsyncAzureOpenAI
 from openaivec import pandas_ext
 
-#
-#
-#
-
-
-#
-
-
-
-
-
-
-
-
-
-#
-
-
+# Option 1: Use environment variables (automatic detection)
+# Set OPENAI_API_KEY or Azure OpenAI environment variables
+# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_BASE_URL, AZURE_OPENAI_API_VERSION)
+# No explicit setup needed - clients are automatically created
+
+# Option 2: Register an existing OpenAI client instance
+client = OpenAI(api_key="your-api-key")
+pandas_ext.set_client(client)
+
+# Option 3: Register an Azure OpenAI client instance
+azure_client = AzureOpenAI(
+    api_key="your-azure-key",
+    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+    api_version="preview"
+)
+pandas_ext.set_client(azure_client)
+
+# Option 4: Register an async Azure OpenAI client instance
+async_azure_client = AsyncAzureOpenAI(
+    api_key="your-azure-key",
+    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+    api_version="preview"
+)
+pandas_ext.set_async_client(async_azure_client)
+
+# Set up model names (optional, defaults shown)
+pandas_ext.set_responses_model("gpt-4.1-mini")
+pandas_ext.set_embeddings_model("text-embedding-3-small")
+
+# Inspect current configuration
+configured_model = pandas_ext.get_responses_model()
 ```
 
 This module provides `.ai` and `.aio` accessors for pandas Series and DataFrames
@@ -33,202 +44,144 @@ to easily interact with OpenAI APIs for tasks like generating responses or embed
 
 import inspect
 import json
-import os
 import logging
-from
+from collections.abc import Awaitable, Callable
+from typing import TypeVar
 
 import numpy as np
 import pandas as pd
-from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
-from pydantic import BaseModel
 import tiktoken
+from openai import AsyncOpenAI, OpenAI
+from pydantic import BaseModel
 
-from .
-from .
-from .
+from openaivec._cache import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec._embeddings import AsyncBatchEmbeddings, BatchEmbeddings
+from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+from openaivec._provider import CONTAINER, _check_azure_v1_api_url
+from openaivec._responses import AsyncBatchResponses, BatchResponses
+from openaivec._schema import SchemaInferenceInput, SchemaInferenceOutput, SchemaInferer
+from openaivec.task.table import FillNaResponse, fillna
 
 __all__ = [
-    "
-    "
-    "
-    "
-    "
-    "
+    "get_async_client",
+    "get_client",
+    "get_embeddings_model",
+    "get_responses_model",
+    "set_async_client",
+    "set_client",
+    "set_embeddings_model",
+    "set_responses_model",
 ]
 
 _LOGGER = logging.getLogger(__name__)
 
 
-
+# ---------------------------------------------------------------------------
+# Internal helpers (not exported)
+# ---------------------------------------------------------------------------
+def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
+    """Return a Series of JSON strings (UTF-8, no ASCII escaping) representing DataFrame rows.
 
-
-
-
-
-
-
+    Each element is the JSON serialisation of the corresponding row as a dict. Index and
+    name are preserved so downstream operations retain alignment. This consolidates the
+    previously duplicated inline pipeline used by responses*/task* DataFrame helpers.
+    """
+    return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+        lambda x: json.dumps(x, ensure_ascii=False)
+    )
 
 
-
-def _wakeup() -> None:
-    pass
+T = TypeVar("T")  # For pipe function return type
 
 
-def
-    """Register a custom OpenAI
+def set_client(client: OpenAI) -> None:
+    """Register a custom OpenAI-compatible client for pandas helpers.
 
     Args:
-        client (OpenAI): A pre
-            `openai.AzureOpenAI` instance.
-            The same instance is reused by every helper in this module.
+        client (OpenAI): A pre-configured `openai.OpenAI` or
+            `openai.AzureOpenAI` instance reused by every helper in this module.
     """
-
-
+    if client.__class__.__name__ == "AzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(OpenAI, lambda: client)
 
-def use_async(client: AsyncOpenAI) -> None:
-    """Register a custom asynchronous OpenAI‑compatible client.
 
-
-
-
-
+def get_client() -> OpenAI:
+    """Get the currently registered OpenAI-compatible client.
+
+    Returns:
+        OpenAI: The registered `openai.OpenAI` or `openai.AzureOpenAI` instance.
     """
-
-    _ASYNC_CLIENT = client
+    return CONTAINER.resolve(OpenAI)
 
 
-def
-    """
+def set_async_client(client: AsyncOpenAI) -> None:
+    """Register a custom asynchronous OpenAI-compatible client.
 
     Args:
-
-        `openai.
+        client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
+            `openai.AsyncAzureOpenAI` instance reused by every helper in this module.
     """
-
-
-    _ASYNC_CLIENT = AsyncOpenAI(api_key=api_key)
+    if client.__class__.__name__ == "AsyncAzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(AsyncOpenAI, lambda: client)
 
-def use_azure_openai(api_key: str, endpoint: str, api_version: str) -> None:
-    """Create and register an `openai.AzureOpenAI` client.
 
-
-
-
-
-
+def get_async_client() -> AsyncOpenAI:
+    """Get the currently registered asynchronous OpenAI-compatible client.
+
+    Returns:
+        AsyncOpenAI: The registered `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance.
     """
-
-    _CLIENT = AzureOpenAI(
-        api_key=api_key,
-        azure_endpoint=endpoint,
-        api_version=api_version,
-    )
-    _ASYNC_CLIENT = AsyncAzureOpenAI(
-        api_key=api_key,
-        azure_endpoint=endpoint,
-        api_version=api_version,
-    )
+    return CONTAINER.resolve(AsyncOpenAI)
 
 
-def
+def set_responses_model(name: str) -> None:
     """Override the model used for text responses.
 
     Args:
-        name (str):
-            (for example, ``gpt-
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
+            (for example, ``gpt-4.1-mini``).
     """
-
-    _RESPONSES_MODEL_NAME = name
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
 
-    try:
-        _TIKTOKEN_ENCODING = tiktoken.encoding_for_model(name)
 
-
-
-
-
-        )
-
+def get_responses_model() -> str:
+    """Get the currently registered model name for text responses.
+
+    Returns:
+        str: The model name (for example, ``gpt-4.1-mini``).
+    """
+    return CONTAINER.resolve(ResponsesModelName).value
 
 
-def
+def set_embeddings_model(name: str) -> None:
     """Override the model used for text embeddings.
 
     Args:
-        name (str):
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name,
+            e.g. ``text-embedding-3-small``.
     """
-
-    _EMBEDDINGS_MODEL_NAME = name
-
-
-def _get_openai_client() -> OpenAI:
-    global _CLIENT
-    if _CLIENT is not None:
-        return _CLIENT
-
-    if "OPENAI_API_KEY" in os.environ:
-        _CLIENT = OpenAI()
-        return _CLIENT
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
 
-    aoai_param_names = [
-        "AZURE_OPENAI_API_KEY",
-        "AZURE_OPENAI_ENDPOINT",
-        "AZURE_OPENAI_API_VERSION",
-    ]
 
-
-
-            api_key=os.environ["AZURE_OPENAI_API_KEY"],
-            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-        )
-
-        return _CLIENT
-
-    raise ValueError(
-        "No OpenAI API key found. Please set the OPENAI_API_KEY environment variable or provide Azure OpenAI parameters."
-        "If using Azure OpenAI, ensure AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION are set."
-        "If using OpenAI, ensure OPENAI_API_KEY is set."
-    )
+def get_embeddings_model() -> str:
+    """Get the currently registered model name for text embeddings.
 
-
-
-
-
-        return _ASYNC_CLIENT
-
-    if "OPENAI_API_KEY" in os.environ:
-        _ASYNC_CLIENT = AsyncOpenAI()
-        return _ASYNC_CLIENT
-
-    aoai_param_names = [
-        "AZURE_OPENAI_API_KEY",
-        "AZURE_OPENAI_ENDPOINT",
-        "AZURE_OPENAI_API_VERSION",
-    ]
-    if all(param in os.environ for param in aoai_param_names):
-        _ASYNC_CLIENT = AsyncAzureOpenAI(
-            api_key=os.environ["AZURE_OPENAI_API_KEY"],
-            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-        )
-        return _ASYNC_CLIENT
-
-    raise ValueError(
-        "No OpenAI API key found. Please set the OPENAI_API_KEY environment variable or provide Azure OpenAI parameters."
-        "If using Azure OpenAI, ensure AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION are set."
-        "If using OpenAI, ensure OPENAI_API_KEY is set."
-    )
+    Returns:
+        str: The model name (for example, ``text-embedding-3-small``).
+    """
+    return CONTAINER.resolve(EmbeddingsModelName).value
 
 
 def _extract_value(x, series_name):
     """Return a homogeneous ``dict`` representation of any Series value.
 
     Args:
-        x: Single element taken from the Series.
-
+        x (Any): Single element taken from the Series.
+        series_name (str): Name of the Series (used for logging).
 
     Returns:
         dict: A dictionary representation or an empty ``dict`` if ``x`` cannot
@@ -241,7 +194,9 @@ def _extract_value(x, series_name):
     elif isinstance(x, dict):
         return x
 
-    _LOGGER.warning(
+    _LOGGER.warning(
+        f"The value '{x}' in the series '{series_name}' is not a dict or BaseModel. Returning an empty dict."
+    )
     return {}
 
 
@@ -252,126 +207,463 @@ class OpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Arbitrary OpenAI Responses API parameters (e.g. ``temperature``,
+                ``top_p``, ``frequency_penalty``, ``presence_penalty``, ``seed``, etc.) are
+                forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
     def responses(
         self,
         instructions: str,
-        response_format:
-        batch_size: int =
-
-
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+
+            # With custom temperature
+            animals.ai.responses(
+                "translate creatively",
+                temperature=0.8
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4o-mini`.
 
         Args:
             instructions (str): System prompt prepended to every user message.
-            response_format (
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
-
-
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
-
-
-
-            system_message=instructions,
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
-
-            top_p=top_p,
+            **api_kwargs,
         )
 
-
-
-
-
-
-
-    def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
-        """Execute a prepared task on every Series element.
+    def embeddings_with_cache(
+        self,
+        cache: BatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache.
 
-        This method
-
-
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
 
         Example:
             ```python
-            from openaivec.
-
-
-
-
-
-
+            from openaivec._cache import BatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy[str, np.ndarray](batch_size=64)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
-
-
-
-
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional keyword arguments to pass to the OpenAI API.
 
         Returns:
-            pandas.Series: Series whose values are
-
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+            (dtype ``float32``).
         """
-        client =
-            client=
-            model_name=
-
+        client: BatchEmbeddings = BatchEmbeddings(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         return pd.Series(
-            client.
+            client.create(self._obj.tolist()),
             index=self._obj.index,
             name=self._obj.name,
         )
 
-    def embeddings(self, batch_size: int =
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = True, **api_kwargs) -> pd.Series:
         """Compute OpenAI embeddings for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.embeddings()
+
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
+                show_progress=True
+            )
             ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
             (dtype ``float32``).
         """
-
-
-
+        return self.embeddings_with_cache(
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
        )
 
-
-
-
-
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache.
+
+        This mirrors ``responses_with_cache`` but uses the task's stored instructions
+        and response format. A supplied ``BatchingMapProxy`` enables cross‑operation
+        deduplicated reuse and external batch size / progress control.
+
+        Example:
+            ```python
+            from openaivec._cache import BatchingMapProxy
+            shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys (``model``, system instructions, user input) are managed
+            internally and cannot be overridden.
+
+        Returns:
+            pandas.Series: Task results aligned with the original Series index.
+        """
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Basic usage
+            results = reviews.ai.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = large_reviews.ai.task(
+                sentiment_task,
+                batch_size=50,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message,
+            user ``input``) are managed by the library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's response format.
+        """
+        return self.task_with_cache(
+            task=task,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
+        )
+
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with a provided cache.
+
+        This method allows external control over caching behavior while parsing
+        Series content into structured data. If no response format is provided,
+        the method automatically infers an appropriate schema by analyzing the
+        data patterns.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract customer information including name
+                and contact details"). This guides both the extraction process
+                and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat] | None, optional): Target structure
+                for the parsed data. Can be a Pydantic model class, built-in type
+                (str, int, float, bool, list, dict), or None. If None, the method
+                infers an appropriate schema based on the instructions and data.
+                Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring the schema. Only used when response_format
+                is None. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.) forwarded to
+                the underlying API calls.
+
+        Returns:
+            pandas.Series: Series containing parsed structured data. Each value
+                is an instance of the specified response_format or the inferred
+                schema model, aligned with the original Series index.
+        """
+
+        schema: SchemaInferenceOutput | None = None
+        if response_format is None:
+            schema = self.infer_schema(instructions=instructions, max_examples=max_examples, **api_kwargs)
+
+        return self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM.
+
+        This method extracts structured information from unstructured text in
+        the Series. When no response format is provided, it automatically
+        infers an appropriate schema by analyzing patterns in the data.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract product details including price,
+                category, and availability"). This guides both the extraction
+                process and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for the parsed data. Can be a Pydantic model class,
+                built-in type (str, int, float, bool, list, dict), or None.
+                If None, automatically infers a schema. Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring schema. Only used when response_format
+                is None. Defaults to 100.
+            batch_size (int | None, optional): Number of requests to process
+                per batch. None enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Display progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model.
+
+        Example:
+            ```python
+            # With explicit schema
+            from pydantic import BaseModel
+            class Product(BaseModel):
+                name: str
+                price: float
+                in_stock: bool
+
+            descriptions = pd.Series([
+                "iPhone 15 Pro - $999, available now",
+                "Samsung Galaxy S24 - $899, out of stock"
+            ])
+            products = descriptions.ai.parse(
+                "Extract product information",
+                response_format=Product
+            )
+
+            # With automatic schema inference
+            reviews = pd.Series([
+                "Great product! 5 stars. Fast shipping.",
+                "Poor quality. 2 stars. Slow delivery."
+            ])
+            parsed = reviews.ai.parse(
+                "Extract review rating and shipping feedback"
+            )
+            ```
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from Series content using AI.
+
+        This method analyzes a sample of Series values to automatically generate
+        a Pydantic model that captures the relevant information structure. The
+        inferred schema supports both flat and hierarchical (nested) structures,
+        making it suitable for complex data extraction tasks.
+
+        Args:
+            instructions (str): Plain language description of the extraction goal
+                (e.g., "Extract customer information for CRM system", "Parse
+                event details for calendar integration"). This guides which
+                fields to include and their purpose.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze for pattern detection. The method samples randomly up
+                to this limit. Higher values may improve schema quality but
+                increase inference time. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters for fine-tuning
+                the inference process.
+
+        Returns:
+            InferredSchema: A comprehensive schema object containing:
+                - instructions: Refined extraction objective statement
+                - fields: Hierarchical field specifications with names, types,
+                    descriptions, and nested structures where applicable
+                - inference_prompt: Optimized prompt for consistent extraction
+                - model: Dynamically generated Pydantic model class supporting
+                    both flat and nested structures
+                - task: PreparedTask configured for batch extraction using
+                    the inferred schema
+
+        Example:
+            ```python
+            # Simple flat structure
+            reviews = pd.Series([
+                "5 stars! Great product, fast shipping to NYC.",
+                "2 stars. Product broke, slow delivery to LA."
+            ])
+            schema = reviews.ai.infer_schema(
+                "Extract review ratings and shipping information"
+            )
+
+            # Hierarchical structure
+            orders = pd.Series([
+                "Order #123: John Doe, 123 Main St, NYC. Items: iPhone ($999), Case ($29)",
+                "Order #456: Jane Smith, 456 Oak Ave, LA. Items: iPad ($799)"
+            ])
+            schema = orders.ai.infer_schema(
+                "Extract order details including customer and items"
+            )
+            # Inferred schema may include nested structures like:
+            # - customer: {name: str, address: str, city: str}
+            # - items: [{product: str, price: float}]
+
+            # Apply the schema for extraction
+            extracted = orders.ai.task(schema.task)
+            ```
+
+        Note:
+            The inference process uses multiple AI iterations to ensure schema
+            validity. Nested structures are automatically detected when the
+            data contains hierarchical relationships. The generated Pydantic
+            model ensures type safety and validation for all extracted data.
+        """
+        inferer = CONTAINER.resolve(SchemaInferer)
+
+        input: SchemaInferenceInput = SchemaInferenceInput(
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
+            instructions=instructions,
+            **api_kwargs,
         )
+        return inferer.infer_schema(input)
 
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
@@ -382,12 +674,13 @@ class OpenAIVecSeriesAccessor:
             animals.ai.count_tokens()
             ```
         This method uses the `tiktoken` library to count tokens based on the
-        model name
+        model name configured via `set_responses_model`.
 
         Returns:
            pandas.Series: Token counts for each element.
         """
-
+        encoding: tiktoken.Encoding = CONTAINER.resolve(tiktoken.Encoding)
+        return self._obj.map(encoding.encode).map(len).rename("num_tokens")
 
     def extract(self) -> pd.DataFrame:
         """Expand a Series of Pydantic models/dicts into columns.
@@ -426,47 +719,65 @@ class OpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
-    def
-
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Generate a response for each row after serializing it to JSON using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
 
         Example:
             ```python
+            from openaivec._cache import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
             df = pd.DataFrame([
-                {"
-                {"
-                {"
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
             ])
-            df.ai.
+            result = df.ai.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
             ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
 
         Args:
-
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
-            pandas.
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
-
-
-
-
-
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
+        return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            **api_kwargs,
         )
 
     def responses(
         self,
         instructions: str,
-        response_format:
-        batch_size: int =
-
-
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON.
 
         Example:
             ```python
@@ -475,105 +786,482 @@ class OpenAIVecDataFrameAccessor:
                 {"name": "dog", "legs": 4},
                 {"name": "elephant", "legs": 4},
             ])
+            # Basic usage
             df.ai.responses("what is the animal's name?")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            large_df.ai.responses(
+                "generate a name for this ID",
+                batch_size=20,
+                show_progress=True
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4o-mini`.
 
         Args:
             instructions (str): System prompt for the assistant.
-            response_format (
+            response_format (type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
-
-
-
-
-
-
-
-
-
-
-
-
-            response_format=response_format,
-            batch_size=batch_size,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        )
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+        """
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            **api_kwargs,
        )
 
-    def
-
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys are managed internally.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+        """
+        return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
 
-
-
-
-
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON.
 
         Example:
             ```python
-            from openaivec.
-
+            from openaivec._model import PreparedTask
+
             # Assume you have a prepared task for data analysis
             analysis_task = PreparedTask(...)
-
+
             df = pd.DataFrame([
                 {"name": "cat", "legs": 4},
                 {"name": "dog", "legs": 4},
                 {"name": "elephant", "legs": 4},
             ])
+            # Basic usage
             results = df.ai.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = large_df.ai.task(
+                analysis_task,
+                batch_size=50,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
+            are managed by the library and cannot be overridden.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
             response format, aligned with the DataFrame's original index.
         """
-        return self._obj.
-
-
-
-
-        )
+        return _df_rows_to_json_series(self._obj).ai.task(
+            task=task,
+            batch_size=batch_size,
+            show_progress=show_progress,
+            **api_kwargs,
        )
 
-    def
-
-
-
-
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM with a provided cache.
 
+        This method processes each DataFrame row (converted to JSON) and extracts
+        structured information using an LLM. External cache control enables
+        deduplication across operations and custom batch management.
 
-
-
-
+        Args:
+            instructions (str): Plain language description of what information
+                to extract from each row (e.g., "Extract shipping details and
+                order status"). Guides both extraction and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze when inferring
+                schema (only used when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model, indexed like
+                the original DataFrame.
+        """
+        return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM.
+
+        Each row is converted to JSON and processed to extract structured
+        information. When no response format is provided, the method
+        automatically infers an appropriate schema from the data.
+
+        Args:
+            instructions (str): Plain language description of extraction goals
+                (e.g., "Extract transaction details including amount, date,
+                and merchant"). Guides extraction and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            batch_size (int | None, optional): Rows per API batch. None
+                enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Show progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original
+                DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'log': [
+                    '2024-01-01 10:00 ERROR Database connection failed',
+                    '2024-01-01 10:05 INFO Service started successfully'
+                ]
+            })
+
+            # With automatic schema inference
+            parsed = df.ai.parse("Extract timestamp, level, and message")
+            # Returns Series with inferred structure like:
+            # {timestamp: str, level: str, message: str}
+            ```
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from DataFrame rows using AI.
+
+        This method analyzes a sample of DataFrame rows to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        Each row is converted to JSON format and analyzed to identify patterns,
+        field types, and potential categorical values.
+
+        Args:
+            instructions (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract operational metrics
+                for dashboard", "Parse customer attributes for segmentation").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of rows to analyze from the
+                DataFrame. The method will sample randomly up to this limit.
+                Defaults to 100.
+
+        Returns:
+            InferredSchema: An object containing:
+                - instructions: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'text': [
+                    "Order #123: Shipped to NYC, arriving Tuesday",
+                    "Order #456: Delayed due to weather, new ETA Friday",
+                    "Order #789: Delivered to customer in LA"
+                ],
+                'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Infer schema for logistics tracking
+            schema = df.ai.infer_schema(
+                instructions="Extract shipping status and location data for logistics tracking"
+            )
+
+            # Apply the schema to extract structured data
+            extracted_df = df.ai.task(schema.task)
+            ```
+
+        Note:
+            Each row is converted to JSON before analysis. The inference
+            process automatically detects hierarchical relationships and
+            creates appropriate nested structures when present. The generated
+            Pydantic model ensures type safety and validation.
+        """
+        return _df_rows_to_json_series(self._obj).ai.infer_schema(
+            instructions=instructions,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def extract(self, column: str) -> pd.DataFrame:
+        """Flatten one column of Pydantic models/dicts into top‑level columns.
+
+        Example:
+            ```python
+            df = pd.DataFrame([
+                {"animal": {"name": "cat", "legs": 4}},
+                {"animal": {"name": "dog", "legs": 4}},
+                {"animal": {"name": "elephant", "legs": 4}},
+            ])
+            df.ai.extract("animal")
+            ```
+        This method returns a DataFrame with the same index as the original,
+        where each column corresponds to a key in the dictionaries.
+        The source column is dropped.
+
+        Args:
+            column (str): Column to expand.
+
+        Returns:
+            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+        """
+        if column not in self._obj.columns:
+            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+        return (
+            self._obj.pipe(lambda df: df.reset_index(drop=True))
+            .pipe(lambda df: df.join(df[column].ai.extract()))
+            .pipe(lambda df: df.set_index(self._obj.index))
+            .pipe(lambda df: df.drop(columns=[column], axis=1))
+        )
+
+    def fillna(
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+    ) -> pd.DataFrame:
+        """Fill missing values in a DataFrame column using AI-powered inference.
+
+        This method uses machine learning to intelligently fill missing (NaN) values
+        in a specified column by analyzing patterns from non-missing rows in the DataFrame.
+        It creates a prepared task that provides examples of similar rows to help the AI
+        model predict appropriate values for the missing entries.
+
+        Args:
+            target_column_name (str): The name of the column containing missing values
+                that need to be filled.
+            max_examples (int, optional): The maximum number of example rows to use
+                for context when predicting missing values. Higher values may improve
+                accuracy but increase API costs and processing time. Defaults to 500.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+
+        Returns:
+            pandas.DataFrame: A new DataFrame with missing values filled in the target
+                column. The original DataFrame is not modified.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'name': ['Alice', 'Bob', None, 'David'],
+                'age': [25, 30, 35, None],
+                'city': ['Tokyo', 'Osaka', 'Kyoto', 'Tokyo']
+            })
+
+            # Fill missing values in the 'name' column
+            filled_df = df.ai.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = large_df.ai.fillna('name', batch_size=32, show_progress=True)
+            ```
+
+        Note:
+            If the target column has no missing values, the original DataFrame
+            is returned unchanged.
+        """
+
+        task: PreparedTask = fillna(self._obj, target_column_name, max_examples)
+        missing_rows = self._obj[self._obj[target_column_name].isna()]
+        if missing_rows.empty:
+            return self._obj
+
+        filled_values: list[FillNaResponse] = missing_rows.ai.task(
+            task=task, batch_size=batch_size, show_progress=show_progress
+        )
|
1149
|
+
|
|
1150
|
+
# get deep copy of the DataFrame to avoid modifying the original
|
|
1151
|
+
df = self._obj.copy()
|
|
1152
|
+
|
|
1153
|
+
# Get the actual indices of missing rows to map the results correctly
|
|
1154
|
+
missing_indices = missing_rows.index.tolist()
|
|
1155
|
+
|
|
1156
|
+
for i, result in enumerate(filled_values):
|
|
1157
|
+
if result.output is not None:
|
|
1158
|
+
# Use the actual index from the original DataFrame, not the relative index from result
|
|
1159
|
+
actual_index = missing_indices[i]
|
|
1160
|
+
df.at[actual_index, target_column_name] = result.output
|
|
1161
|
+
|
|
1162
|
+
return df
|
|
1163
|
+
|
|
1164
|
+
def similarity(self, col1: str, col2: str) -> pd.Series:
|
|
1165
|
+
"""Compute cosine similarity between two columns containing embedding vectors.
|
|
1166
|
+
|
|
1167
|
+
This method calculates the cosine similarity between vectors stored in
|
|
1168
|
+
two columns of the DataFrame. The vectors should be numpy arrays or
|
|
1169
|
+
array-like objects that support dot product operations.
|
|
1170
|
+
|
|
1171
|
+
Example:
|
|
1172
|
+
```python
|
|
1173
|
+
df = pd.DataFrame({
|
|
1174
|
+
'vec1': [np.array([1, 0, 0]), np.array([0, 1, 0])],
|
|
1175
|
+
'vec2': [np.array([1, 0, 0]), np.array([1, 1, 0])]
|
|
1176
|
+
})
|
|
1177
|
+
similarities = df.ai.similarity('vec1', 'vec2')
|
|
1178
|
+
```
|
|
1179
|
+
|
|
1180
|
+
Args:
|
|
1181
|
+
col1 (str): Name of the first column containing embedding vectors.
|
|
1182
|
+
col2 (str): Name of the second column containing embedding vectors.
|
|
1183
|
+
|
|
1184
|
+
Returns:
|
|
1185
|
+
pandas.Series: Series containing cosine similarity scores between
|
|
1186
|
+
corresponding vectors in col1 and col2, with values ranging
|
|
1187
|
+
from -1 to 1, where 1 indicates identical direction.
|
|
1188
|
+
"""
|
|
1189
|
+
return self._obj.apply(
|
|
1190
|
+
lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
|
|
1191
|
+
axis=1,
|
|
1192
|
+
).rename("similarity") # type: ignore[arg-type]
|
|
1193
|
+
|
|
1194
|
+
|
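Taken together, the synchronous helpers above form a pipeline. A minimal sketch under stated assumptions — it combines `infer_schema`, `task`, and `extract` exactly as their docstrings describe, and assumes the Series returned by `df.ai.task` can be attached as a column; the column name and instruction text are illustrative:

```python
import pandas as pd

df = pd.DataFrame({"report": [
    "Order #123: Shipped to NYC, arriving Tuesday",
    "Order #456: Delayed due to weather, new ETA Friday",
]})

# 1. Infer a reusable schema from a sample of rows.
schema = df.ai.infer_schema(instructions="Extract shipping status for logistics tracking")

# 2. Run the generated task over every row (one structured object per row).
parsed = df.ai.task(schema.task)

# 3. Attach the objects as a column and flatten them into top-level columns.
flat = df.assign(parsed=parsed).ai.extract("parsed")
```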
+@pd.api.extensions.register_series_accessor("aio")
+class AsyncOpenAIVecSeriesAccessor:
+    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
 
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            result = await series.aio.responses_with_cache(
+                "classify",
+                cache=shared,
+                max_output_tokens=256,
+                frequency_penalty=0.2,
+            )
+            ```
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchResponses = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        results = await client.parse(self._obj.tolist())
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
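The point of `responses_with_cache` is that one `AsyncBatchingMapProxy` can be reused across calls, so duplicate inputs are answered from the cache instead of triggering new requests. A minimal sketch — the series contents are illustrative, and top-level `await` (as in the docstring examples) assumes a notebook:

```python
import pandas as pd
from openaivec._cache import AsyncBatchingMapProxy

# batch_size=None enables automatic batch size optimization.
shared = AsyncBatchingMapProxy(batch_size=None, max_concurrency=4)

fruits = pd.Series(["apple", "banana", "apple"])
more_fruits = pd.Series(["banana", "cherry"])

# "apple" and "banana" are each sent once; repeats are served from the shared cache.
a = await fruits.aio.responses_with_cache("translate to French", cache=shared)
b = await more_fruits.aio.responses_with_cache("translate to French", cache=shared)
```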
     async def responses(
         self,
         instructions: str,
-        response_format:
-        batch_size: int =
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
         max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element (asynchronously).
 
@@ -582,22 +1270,32 @@ class AsyncOpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             # Must be awaited
             results = await animals.aio.responses("translate to French")
+
+            # With progress bar for large datasets
+            large_series = pd.Series(["data"] * 1000)
+            results = await large_series.aio.responses(
+                "analyze this data",
+                batch_size=32,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4o-mini`.
 
         Args:
             instructions (str): System prompt prepended to every user message.
-            response_format (
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
-
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
 
@@ -605,18 +1303,64 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-
-
-
+        return await self.responses_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
             response_format=response_format,
-
-
-
+            **api_kwargs,
+        )
+
+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._cache import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         # Await the async operation
-        results = await client.
+        results = await client.create(self._obj.tolist())
 
         return pd.Series(
             results,
@@ -624,7 +1368,9 @@ class AsyncOpenAIVecSeriesAccessor:
             name=self._obj.name,
         )
 
-    async def embeddings(
+    async def embeddings(
+        self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = True, **api_kwargs
+    ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element (asynchronously).
 
         Example:
@@ -632,17 +1378,24 @@ class AsyncOpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             # Must be awaited
             embeddings = await animals.aio.embeddings()
+
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = await large_texts.aio.embeddings(
+                batch_size=100,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -651,49 +1404,121 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-
-
-
+        return await self.embeddings_with_cache(
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            **api_kwargs,
         )
 
-
-
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
 
-
-
-
-
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+            from openaivec._cache import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
+        results = await client.parse(self._obj.tolist())
 
-
-        """Execute a prepared task on every Series element (asynchronously).
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
-
-
-
+    async def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element (asynchronously).
 
         Example:
             ```python
-            from openaivec.
-
+            from openaivec._model import PreparedTask
+
             # Assume you have a prepared task for sentiment analysis
             sentiment_task = PreparedTask(...)
-
+
             reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
             # Must be awaited
             results = await reviews.aio.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = await large_reviews.aio.task(
+                sentiment_task,
+                batch_size=50,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            The task's stored API parameters are used. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -702,20 +1527,117 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-
-            client=_get_async_openai_client(),
-            model_name=_RESPONSES_MODEL_NAME,
+        return await self.task_with_cache(
             task=task,
-
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            **api_kwargs,
         )
 
-
-
+    async def parse_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM with a provided cache (asynchronously).
 
-
-
-
-
+        This async method provides external cache control while parsing Series
+        content into structured data. Automatic schema inference is performed
+        when no response format is specified.
+
+        Args:
+            instructions (str): Plain language description of what to extract
+                (e.g., "Extract dates, amounts, and descriptions from receipts").
+                Guides both extraction and schema inference.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for managing concurrent API calls and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum values to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Series containing parsed structured data aligned
+                with the original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        schema: SchemaInferenceOutput | None = None
+        if response_format is None:
+            # Use synchronous schema inference
+            schema = self._obj.ai.infer_schema(instructions=instructions, max_examples=max_examples)
+
+        return await self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            **api_kwargs,
+        )
+
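`parse_with_cache` carries no inline example, so here is a minimal sketch; the `Receipt` model and the receipt strings are illustrative, not part of the library:

```python
import pandas as pd
from pydantic import BaseModel
from openaivec._cache import AsyncBatchingMapProxy

class Receipt(BaseModel):  # hypothetical response format
    date: str
    amount: float

cache = AsyncBatchingMapProxy(batch_size=None, max_concurrency=4)
receipts = pd.Series(["2024-01-05 coffee $4.50", "2024-01-06 lunch $12.00"])

# An explicit response_format skips schema inference;
# with response_format=None a schema would be inferred first.
parsed = await receipts.aio.parse_with_cache(
    "Extract the date and total amount",
    cache=cache,
    response_format=Receipt,
)
```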
+    async def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM (asynchronously).
+
+        Async version of the parse method, extracting structured information
+        from unstructured text with automatic schema inference when needed.
+
+        Args:
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                product names, prices, and categories from descriptions").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum values for schema inference.
+                Defaults to 100.
+            batch_size (int | None, optional): Requests per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent API requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original Series.
+
+        Example:
+            ```python
+            emails = pd.Series([
+                "Meeting tomorrow at 3pm with John about Q4 planning",
+                "Lunch with Sarah on Friday to discuss new project"
+            ])
+
+            # Async extraction with schema inference
+            parsed = await emails.aio.parse(
+                "Extract meeting details including time, person, and topic"
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await self.parse_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
         )
 
 
@@ -726,82 +1648,167 @@ class AsyncOpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._cache import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            df = pd.DataFrame([
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
+            ])
+            # Must be awaited
+            result = await df.aio.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
+            ```
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        # Await the call to the async Series method using .aio
+        return await _df_rows_to_json_series(self._obj).aio.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            **api_kwargs,
+        )
+
     async def responses(
         self,
         instructions: str,
-        response_format:
-        batch_size: int =
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
         max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON (asynchronously).
 
         Example:
             ```python
             df = pd.DataFrame([
-                {
-                {
-                {
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
             ])
             # Must be awaited
-            results = await df.aio.responses(
+            results = await df.aio.responses("what is the animal's name?")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = await large_df.aio.responses(
+                "generate a name for this ID",
+                batch_size=20,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4o-mini`.
 
         Args:
             instructions (str): System prompt for the assistant.
-            response_format (
+            response_format (type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
-
-
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
 
         Returns:
-            pandas.Series: Responses aligned with the DataFrame
+            pandas.Series: Responses aligned with the DataFrame's original index.
 
         Note:
            This is an asynchronous method and must be awaited.
         """
-        series_of_json = self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
-        # Await the call to the async Series method using .aio
-        return await series_of_json.aio.responses(
+        return await self.responses_with_cache(
             instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
             response_format=response_format,
-
-            temperature=temperature,
-            top_p=top_p,
-            max_concurrency=max_concurrency,
+            **api_kwargs,
         )
 
-    async def
-
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
 
-
-
-
-
+        After serializing each row to JSON, this method executes the prepared task.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys are managed internally.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
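Since the row-level `task_with_cache` above carries no example, here is a minimal sketch following the conventions of the Series examples; the `PreparedTask(...)` placeholder stands in for any real task, as it does in the docstrings:

```python
import pandas as pd
from openaivec._cache import AsyncBatchingMapProxy
from openaivec._model import PreparedTask

cache = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)

# Placeholder: any prepared task (instructions + response_format) works here.
sentiment_task = PreparedTask(...)

df = pd.DataFrame([{"review": "Great product!"}, {"review": "Not satisfied"}])

# Each row is serialized to JSON, then the prepared task runs once per row.
results = await df.aio.task_with_cache(sentiment_task, cache=cache)
```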
+    async def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).
 
         Example:
             ```python
-            from openaivec.
-
+            from openaivec._model import PreparedTask
+
             # Assume you have a prepared task for data analysis
             analysis_task = PreparedTask(...)
-
+
             df = pd.DataFrame([
                 {"name": "cat", "legs": 4},
                 {"name": "dog", "legs": 4},
@@ -809,17 +1816,31 @@ class AsyncOpenAIVecDataFrameAccessor:
             ])
             # Must be awaited
             results = await df.aio.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = await large_df.aio.task(
+                analysis_task,
+                batch_size=50,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
+            are managed by the library and cannot be overridden.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -828,27 +1849,131 @@ class AsyncOpenAIVecDataFrameAccessor:
         Note:
            This is an asynchronous method and must be awaited.
         """
-        series_of_json = self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
         # Await the call to the async Series method using .aio
-        return await
+        return await _df_rows_to_json_series(self._obj).aio.task(
             task=task,
             batch_size=batch_size,
             max_concurrency=max_concurrency,
+            show_progress=show_progress,
+            **api_kwargs,
         )
 
-    async def
+    async def parse_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM with cache (asynchronously).
+
+        Async method for parsing DataFrame rows (as JSON) with external cache
+        control, enabling deduplication across operations and concurrent processing.
+
+        Args:
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                invoice details including items, quantities, and totals").
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for concurrent API call management.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
+                Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
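A matching sketch for `parse_with_cache` on a DataFrame, reusing the order strings that appear in the `parse` docstring below; only the cache settings are assumptions:

```python
import pandas as pd
from openaivec._cache import AsyncBatchingMapProxy

cache = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
df = pd.DataFrame({"raw_data": [
    "Customer: John Doe, Order: 2 laptops @ $1200 each",
    "Customer: Jane Smith, Order: 5 phones @ $800 each",
]})

# With response_format=None, a schema is inferred from up to max_examples rows,
# then every row (serialized to JSON) is parsed against that schema.
parsed = await df.aio.parse_with_cache(
    "Extract customer name, product, quantity, and unit price",
    cache=cache,
)
```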
+    async def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM (asynchronously).
+
+        Async version for extracting structured information from DataFrame rows,
+        with automatic schema inference when no format is specified.
+
+        Args:
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                customer details, order items, and payment information").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
+                Defaults to 100.
+            batch_size (int | None, optional): Rows per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'raw_data': [
+                    'Customer: John Doe, Order: 2 laptops @ $1200 each',
+                    'Customer: Jane Smith, Order: 5 phones @ $800 each'
+                ]
+            })
+
+            # Async parsing with automatic schema inference
+            parsed = await df.aio.parse(
+                "Extract customer name, product, quantity, and unit price"
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
         """
-
+        return await self.parse_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
+        """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
 
         This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
         but with support for asynchronous functions.
 
+        Example:
+            ```python
+            async def process_data(df):
+                # Simulate an asynchronous computation
+                await asyncio.sleep(1)
+                return df.dropna()
+
+            df = pd.DataFrame({"col": [1, 2, None, 4]})
+            # Must be awaited
+            result = await df.aio.pipe(process_data)
+            ```
+
         Args:
             func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
                 as input and returns either a result or an awaitable result.
@@ -865,7 +1990,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         else:
            return result
 
-    async def assign(self, **kwargs
+    async def assign(self, **kwargs) -> pd.DataFrame:
         """Asynchronously assign new columns to the DataFrame, evaluating sequentially.
 
         This method extends pandas' `assign` method by supporting asynchronous
@@ -900,7 +2025,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             ```
 
         Args:
-            **kwargs:
+            **kwargs: Column names as keys and either static values or callables
                 (synchronous or asynchronous) as values.
 
         Returns:
@@ -923,3 +2048,88 @@ class AsyncOpenAIVecDataFrameAccessor:
             df_current[key] = column_data
 
         return df_current
+
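The `assign` hunk above shows only the corrected Args wording, so the following is a hedged sketch of how the documented semantics (static values or sync/async callables, evaluated sequentially) could look in use; the column names and prompt are illustrative:

```python
import pandas as pd

df = pd.DataFrame({"name": ["cat", "dog", "elephant"]})

# Values may be constants, synchronous callables, or async callables taking
# the current DataFrame; columns are evaluated one after another.
result = await df.aio.assign(
    kind="animal",                                                    # static value
    french=lambda d: d["name"].aio.responses("translate to French"),  # async callable
)
```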
+    async def fillna(
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+    ) -> pd.DataFrame:
+        """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
+
+        This method uses machine learning to intelligently fill missing (NaN) values
+        in a specified column by analyzing patterns from non-missing rows in the DataFrame.
+        It creates a prepared task that provides examples of similar rows to help the AI
+        model predict appropriate values for the missing entries.
+
+        Args:
+            target_column_name (str): The name of the column containing missing values
+                that need to be filled.
+            max_examples (int, optional): The maximum number of example rows to use
+                for context when predicting missing values. Higher values may improve
+                accuracy but increase API costs and processing time. Defaults to 500.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            max_concurrency (int, optional): Maximum number of concurrent
+                requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+
+        Returns:
+            pandas.DataFrame: A new DataFrame with missing values filled in the target
+                column. The original DataFrame is not modified.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'name': ['Alice', 'Bob', None, 'David'],
+                'age': [25, 30, 35, None],
+                'city': ['Tokyo', 'Osaka', 'Kyoto', 'Tokyo']
+            })
+
+            # Fill missing values in the 'name' column (must be awaited)
+            filled_df = await df.aio.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = await large_df.aio.fillna(
+                'name',
+                batch_size=32,
+                max_concurrency=4,
+                show_progress=True
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+            If the target column has no missing values, the original DataFrame
+            is returned unchanged.
+        """
+
+        task: PreparedTask = fillna(self._obj, target_column_name, max_examples)
+        missing_rows = self._obj[self._obj[target_column_name].isna()]
+        if missing_rows.empty:
+            return self._obj
+
+        filled_values: list[FillNaResponse] = await missing_rows.aio.task(
+            task=task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
+        )
+
+        # get deep copy of the DataFrame to avoid modifying the original
+        df = self._obj.copy()
+
+        # Get the actual indices of missing rows to map the results correctly
+        missing_indices = missing_rows.index.tolist()
+
+        for i, result in enumerate(filled_values):
+            if result.output is not None:
+                # Use the actual index from the original DataFrame, not the relative index from result
+                actual_index = missing_indices[i]
+                df.at[actual_index, target_column_name] = result.output
+
+        return df