openaivec 0.10.0__py3-none-any.whl → 1.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. openaivec/__init__.py +13 -4
  2. openaivec/_cache/__init__.py +12 -0
  3. openaivec/_cache/optimize.py +109 -0
  4. openaivec/_cache/proxy.py +806 -0
  5. openaivec/_di.py +326 -0
  6. openaivec/_embeddings.py +203 -0
  7. openaivec/{log.py → _log.py} +2 -2
  8. openaivec/_model.py +113 -0
  9. openaivec/{prompt.py → _prompt.py} +95 -28
  10. openaivec/_provider.py +207 -0
  11. openaivec/_responses.py +511 -0
  12. openaivec/_schema/__init__.py +9 -0
  13. openaivec/_schema/infer.py +340 -0
  14. openaivec/_schema/spec.py +350 -0
  15. openaivec/_serialize.py +234 -0
  16. openaivec/{util.py → _util.py} +25 -85
  17. openaivec/pandas_ext.py +1635 -425
  18. openaivec/spark.py +604 -335
  19. openaivec/task/__init__.py +27 -29
  20. openaivec/task/customer_support/__init__.py +9 -15
  21. openaivec/task/customer_support/customer_sentiment.py +51 -41
  22. openaivec/task/customer_support/inquiry_classification.py +86 -61
  23. openaivec/task/customer_support/inquiry_summary.py +44 -45
  24. openaivec/task/customer_support/intent_analysis.py +56 -41
  25. openaivec/task/customer_support/response_suggestion.py +49 -43
  26. openaivec/task/customer_support/urgency_analysis.py +76 -71
  27. openaivec/task/nlp/__init__.py +4 -4
  28. openaivec/task/nlp/dependency_parsing.py +19 -20
  29. openaivec/task/nlp/keyword_extraction.py +22 -24
  30. openaivec/task/nlp/morphological_analysis.py +25 -25
  31. openaivec/task/nlp/named_entity_recognition.py +26 -28
  32. openaivec/task/nlp/sentiment_analysis.py +29 -21
  33. openaivec/task/nlp/translation.py +24 -30
  34. openaivec/task/table/__init__.py +3 -0
  35. openaivec/task/table/fillna.py +183 -0
  36. openaivec-1.0.10.dist-info/METADATA +399 -0
  37. openaivec-1.0.10.dist-info/RECORD +39 -0
  38. {openaivec-0.10.0.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
  39. openaivec/embeddings.py +0 -172
  40. openaivec/responses.py +0 -392
  41. openaivec/serialize.py +0 -225
  42. openaivec/task/model.py +0 -84
  43. openaivec-0.10.0.dist-info/METADATA +0 -546
  44. openaivec-0.10.0.dist-info/RECORD +0 -29
  45. {openaivec-0.10.0.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
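The headline change for `openaivec/pandas_ext.py` is a rename of the entire public configuration API: the 0.10.0 helpers `use`, `use_async`, `use_openai`, `use_azure_openai`, `responses_model`, and `embeddings_model` are removed and replaced by `set_client`/`get_client`, `set_async_client`/`get_async_client`, `set_responses_model`/`get_responses_model`, and `set_embeddings_model`/`get_embeddings_model`. A minimal migration sketch, using only the renames visible in the diff below (the data and prompt are illustrative):

```python
import pandas as pd
from openai import OpenAI
from openaivec import pandas_ext

# 0.10.0 (removed):
#   pandas_ext.use_openai("YOUR_API_KEY")
#   pandas_ext.responses_model("gpt-4o-mini")

# 1.0.10: construct the client yourself, then register it.
pandas_ext.set_client(OpenAI(api_key="your-api-key"))
pandas_ext.set_responses_model("gpt-4.1-mini")
pandas_ext.set_embeddings_model("text-embedding-3-small")

animals = pd.Series(["cat", "dog", "elephant"])
french = animals.ai.responses("translate to French")  # accessor API is unchanged
```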
openaivec/pandas_ext.py CHANGED
@@ -2,29 +2,40 @@
 
 ## Setup
 ```python
-from openai import OpenAI
+from openai import OpenAI, AzureOpenAI, AsyncOpenAI, AsyncAzureOpenAI
 from openaivec import pandas_ext
 
-# Set up the OpenAI client to use with pandas_ext
-# Option 1: Use an existing client instance
-# pandas_ext.use(OpenAI())
-
-# Option 2: Use environment variables (OPENAI_API_KEY or Azure variables)
-# (No explicit setup needed if variables are set)
-
-# Option 3: Provide API key directly
-pandas_ext.use_openai("YOUR_API_KEY")
-
-# Option 4: Use Azure OpenAI credentials
-# pandas_ext.use_azure_openai(
-#     api_key="YOUR_AZURE_KEY",
-#     endpoint="YOUR_AZURE_ENDPOINT",
-#     api_version="YOUR_API_VERSION"
-# )
-
-# Set up the model_name for responses and embeddings (optional, defaults shown)
-pandas_ext.responses_model("gpt-4o-mini")
-pandas_ext.embeddings_model("text-embedding-3-small")
+# Option 1: Use environment variables (automatic detection)
+# Set OPENAI_API_KEY or Azure OpenAI environment variables
+# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_BASE_URL, AZURE_OPENAI_API_VERSION)
+# No explicit setup needed - clients are automatically created
+
+# Option 2: Register an existing OpenAI client instance
+client = OpenAI(api_key="your-api-key")
+pandas_ext.set_client(client)
+
+# Option 3: Register an Azure OpenAI client instance
+azure_client = AzureOpenAI(
+    api_key="your-azure-key",
+    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+    api_version="preview"
+)
+pandas_ext.set_client(azure_client)
+
+# Option 4: Register an async Azure OpenAI client instance
+async_azure_client = AsyncAzureOpenAI(
+    api_key="your-azure-key",
+    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+    api_version="preview"
+)
+pandas_ext.set_async_client(async_azure_client)
+
+# Set up model names (optional, defaults shown)
+pandas_ext.set_responses_model("gpt-4.1-mini")
+pandas_ext.set_embeddings_model("text-embedding-3-small")
+
+# Inspect current configuration
+configured_model = pandas_ext.get_responses_model()
 ```
 
 This module provides `.ai` and `.aio` accessors for pandas Series and DataFrames
@@ -33,202 +44,144 @@ to easily interact with OpenAI APIs for tasks like generating responses or embed
 
 import inspect
 import json
-import os
 import logging
-from typing import Any, Awaitable, Callable, Type, TypeVar
+from collections.abc import Awaitable, Callable
+from typing import TypeVar
 
 import numpy as np
 import pandas as pd
-from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
-from pydantic import BaseModel
 import tiktoken
+from openai import AsyncOpenAI, OpenAI
+from pydantic import BaseModel
 
-from .embeddings import AsyncBatchEmbeddings, BatchEmbeddings
-from .responses import AsyncBatchResponses, BatchResponses
-from .task.model import PreparedTask
+from openaivec._cache import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec._embeddings import AsyncBatchEmbeddings, BatchEmbeddings
+from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+from openaivec._provider import CONTAINER, _check_azure_v1_api_url
+from openaivec._responses import AsyncBatchResponses, BatchResponses
+from openaivec._schema import SchemaInferenceInput, SchemaInferenceOutput, SchemaInferer
+from openaivec.task.table import FillNaResponse, fillna
 
 __all__ = [
-    "use",
-    "use_async",
-    "responses_model",
-    "embeddings_model",
-    "use_openai",
-    "use_azure_openai",
+    "get_async_client",
+    "get_client",
+    "get_embeddings_model",
+    "get_responses_model",
+    "set_async_client",
+    "set_client",
+    "set_embeddings_model",
+    "set_responses_model",
 ]
 
 _LOGGER = logging.getLogger(__name__)
 
 
-T = TypeVar("T")
+# ---------------------------------------------------------------------------
+# Internal helpers (not exported)
+# ---------------------------------------------------------------------------
+def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
+    """Return a Series of JSON strings (UTF-8, no ASCII escaping) representing DataFrame rows.
 
-_CLIENT: OpenAI | None = None
-_ASYNC_CLIENT: AsyncOpenAI | None = None
-_RESPONSES_MODEL_NAME = "gpt-4o-mini"
-_EMBEDDINGS_MODEL_NAME = "text-embedding-3-small"
-
-_TIKTOKEN_ENCODING = tiktoken.encoding_for_model(_RESPONSES_MODEL_NAME)
+    Each element is the JSON serialisation of the corresponding row as a dict. Index and
+    name are preserved so downstream operations retain alignment. This consolidates the
+    previously duplicated inline pipeline used by responses*/task* DataFrame helpers.
+    """
+    return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+        lambda x: json.dumps(x, ensure_ascii=False)
+    )
 
 
-# internal method for accesing .ai accessor in spark udfs
-def _wakeup() -> None:
-    pass
+T = TypeVar("T")  # For pipe function return type
 
 
-def use(client: OpenAI) -> None:
-    """Register a custom OpenAI‑compatible client.
+def set_client(client: OpenAI) -> None:
+    """Register a custom OpenAI-compatible client for pandas helpers.
 
     Args:
-        client (OpenAI): A pre‑configured `openai.OpenAI` or
-            `openai.AzureOpenAI` instance.
-            The same instance is reused by every helper in this module.
+        client (OpenAI): A pre-configured `openai.OpenAI` or
+            `openai.AzureOpenAI` instance reused by every helper in this module.
     """
-    global _CLIENT
-    _CLIENT = client
+    if client.__class__.__name__ == "AzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(OpenAI, lambda: client)
 
-def use_async(client: AsyncOpenAI) -> None:
-    """Register a custom asynchronous OpenAI‑compatible client.
 
-    Args:
-        client (AsyncOpenAI): A pre‑configured `openai.AsyncOpenAI` or
-            `openai.AsyncAzureOpenAI` instance.
-            The same instance is reused by every helper in this module.
+def get_client() -> OpenAI:
+    """Get the currently registered OpenAI-compatible client.
+
+    Returns:
+        OpenAI: The registered `openai.OpenAI` or `openai.AzureOpenAI` instance.
     """
-    global _ASYNC_CLIENT
-    _ASYNC_CLIENT = client
+    return CONTAINER.resolve(OpenAI)
 
 
-def use_openai(api_key: str) -> None:
-    """Create and register a default `openai.OpenAI` client.
+def set_async_client(client: AsyncOpenAI) -> None:
+    """Register a custom asynchronous OpenAI-compatible client.
 
     Args:
-        api_key (str): Value forwarded to the ``api_key`` parameter of
-            `openai.OpenAI`.
+        client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
+            `openai.AsyncAzureOpenAI` instance reused by every helper in this module.
     """
-    global _CLIENT, _ASYNC_CLIENT
-    _CLIENT = OpenAI(api_key=api_key)
-    _ASYNC_CLIENT = AsyncOpenAI(api_key=api_key)
+    if client.__class__.__name__ == "AsyncAzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(AsyncOpenAI, lambda: client)
 
-def use_azure_openai(api_key: str, endpoint: str, api_version: str) -> None:
-    """Create and register an `openai.AzureOpenAI` client.
 
-    Args:
-        api_key (str): Azure OpenAI subscription key.
-        endpoint (str): Resource endpoint, e.g.
-            ``https://<resource>.openai.azure.com``.
-        api_version (str): REST API version such as ``2024‑02‑15-preview``.
+def get_async_client() -> AsyncOpenAI:
+    """Get the currently registered asynchronous OpenAI-compatible client.
+
+    Returns:
+        AsyncOpenAI: The registered `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance.
     """
-    global _CLIENT, _ASYNC_CLIENT
-    _CLIENT = AzureOpenAI(
-        api_key=api_key,
-        azure_endpoint=endpoint,
-        api_version=api_version,
-    )
-    _ASYNC_CLIENT = AsyncAzureOpenAI(
-        api_key=api_key,
-        azure_endpoint=endpoint,
-        api_version=api_version,
-    )
+    return CONTAINER.resolve(AsyncOpenAI)
 
 
-def responses_model(name: str) -> None:
+def set_responses_model(name: str) -> None:
     """Override the model used for text responses.
 
     Args:
-        name (str): Model name as listed in the OpenAI API
-            (for example, ``gpt-4o-mini``).
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
+            (for example, ``gpt-4.1-mini``).
     """
-    global _RESPONSES_MODEL_NAME, _TIKTOKEN_ENCODING
-    _RESPONSES_MODEL_NAME = name
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
 
-    try:
-        _TIKTOKEN_ENCODING = tiktoken.encoding_for_model(name)
 
-    except KeyError:
-        _LOGGER.info(
-            "The model name '%s' is not supported by tiktoken. Instead, using the 'o200k_base' encoding.",
-            name,
-        )
-        _TIKTOKEN_ENCODING = tiktoken.get_encoding("o200k_base")
+def get_responses_model() -> str:
+    """Get the currently registered model name for text responses.
+
+    Returns:
+        str: The model name (for example, ``gpt-4.1-mini``).
+    """
+    return CONTAINER.resolve(ResponsesModelName).value
 
 
-def embeddings_model(name: str) -> None:
+def set_embeddings_model(name: str) -> None:
     """Override the model used for text embeddings.
 
     Args:
-        name (str): Embedding model name, e.g. ``text-embedding-3-small``.
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name,
+            e.g. ``text-embedding-3-small``.
     """
-    global _EMBEDDINGS_MODEL_NAME
-    _EMBEDDINGS_MODEL_NAME = name
-
-
-def _get_openai_client() -> OpenAI:
-    global _CLIENT
-    if _CLIENT is not None:
-        return _CLIENT
-
-    if "OPENAI_API_KEY" in os.environ:
-        _CLIENT = OpenAI()
-        return _CLIENT
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
 
-    aoai_param_names = [
-        "AZURE_OPENAI_API_KEY",
-        "AZURE_OPENAI_ENDPOINT",
-        "AZURE_OPENAI_API_VERSION",
-    ]
 
-    if all(param in os.environ for param in aoai_param_names):
-        _CLIENT = AzureOpenAI(
-            api_key=os.environ["AZURE_OPENAI_API_KEY"],
-            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-        )
-
-        return _CLIENT
-
-    raise ValueError(
-        "No OpenAI API key found. Please set the OPENAI_API_KEY environment variable or provide Azure OpenAI parameters."
-        "If using Azure OpenAI, ensure AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION are set."
-        "If using OpenAI, ensure OPENAI_API_KEY is set."
-    )
+def get_embeddings_model() -> str:
+    """Get the currently registered model name for text embeddings.
 
-
-def _get_async_openai_client() -> AsyncOpenAI:
-    global _ASYNC_CLIENT
-    if _ASYNC_CLIENT is not None:
-        return _ASYNC_CLIENT
-
-    if "OPENAI_API_KEY" in os.environ:
-        _ASYNC_CLIENT = AsyncOpenAI()
-        return _ASYNC_CLIENT
-
-    aoai_param_names = [
-        "AZURE_OPENAI_API_KEY",
-        "AZURE_OPENAI_ENDPOINT",
-        "AZURE_OPENAI_API_VERSION",
-    ]
-    if all(param in os.environ for param in aoai_param_names):
-        _ASYNC_CLIENT = AsyncAzureOpenAI(
-            api_key=os.environ["AZURE_OPENAI_API_KEY"],
-            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-        )
-        return _ASYNC_CLIENT
-
-    raise ValueError(
-        "No OpenAI API key found. Please set the OPENAI_API_KEY environment variable or provide Azure OpenAI parameters."
-        "If using Azure OpenAI, ensure AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION are set."
-        "If using OpenAI, ensure OPENAI_API_KEY is set."
-    )
+    Returns:
+        str: The model name (for example, ``text-embedding-3-small``).
+    """
+    return CONTAINER.resolve(EmbeddingsModelName).value
 
 
 def _extract_value(x, series_name):
     """Return a homogeneous ``dict`` representation of any Series value.
 
     Args:
-        x: Single element taken from the Series.
-        series_name (str): Name of the Series (only used for logging).
+        x (Any): Single element taken from the Series.
+        series_name (str): Name of the Series (used for logging).
 
     Returns:
         dict: A dictionary representation or an empty ``dict`` if ``x`` cannot
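The hunk above swaps module-level globals (`_CLIENT`, `_RESPONSES_MODEL_NAME`, ...) for a DI container: setters register a factory via `CONTAINER.register`, getters call `CONTAINER.resolve`. A short sketch of the resulting behavior, assuming the container semantics shown above (register stores `lambda: client`, resolve invokes it):

```python
from openai import OpenAI
from openaivec import pandas_ext

client = OpenAI(api_key="your-api-key")
pandas_ext.set_client(client)              # CONTAINER.register(OpenAI, lambda: client)

assert pandas_ext.get_client() is client   # CONTAINER.resolve(OpenAI) returns the same instance

pandas_ext.set_responses_model("gpt-4.1-mini")
assert pandas_ext.get_responses_model() == "gpt-4.1-mini"
```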
@@ -241,7 +194,9 @@ def _extract_value(x, series_name):
     elif isinstance(x, dict):
         return x
 
-    _LOGGER.warning(f"The value '{x}' in the series is not a dict or BaseModel. Returning an empty dict.")
+    _LOGGER.warning(
+        f"The value '{x}' in the series '{series_name}' is not a dict or BaseModel. Returning an empty dict."
+    )
     return {}
 
 
@@ -252,126 +207,463 @@ class OpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Arbitrary OpenAI Responses API parameters (e.g. ``temperature``,
+                ``top_p``, ``frequency_penalty``, ``presence_penalty``, ``seed``, etc.) are
+                forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
     def responses(
         self,
         instructions: str,
-        response_format: Type[T] = str,
-        batch_size: int = 128,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+
+            # With custom temperature
+            animals.ai.responses(
+                "translate creatively",
+                temperature=0.8
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4o-mini`.
 
         Args:
             instructions (str): System prompt prepended to every user message.
-            response_format (Type[T], optional): Pydantic model or built‑in
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``128``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
-        client: BatchResponses = BatchResponses(
-            client=_get_openai_client(),
-            model_name=_RESPONSES_MODEL_NAME,
-            system_message=instructions,
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
+            **api_kwargs,
         )
 
-        return pd.Series(
-            client.parse(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
-        )
-
-    def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
-        """Execute a prepared task on every Series element.
+    def embeddings_with_cache(
+        self,
+        cache: BatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache.
 
-        This method applies a pre-configured task to each element in the Series,
-        using the task's instructions and response format to generate structured
-        responses from the language model.
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
 
         Example:
             ```python
-            from openaivec.task.model import PreparedTask
-
-            # Assume you have a prepared task for sentiment analysis
-            sentiment_task = PreparedTask(...)
-
-            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
-            results = reviews.ai.task(sentiment_task)
+            from openaivec._cache import BatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy[str, np.ndarray](batch_size=64)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
-            task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to 128.
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional keyword arguments to pass to the OpenAI API.
 
         Returns:
-            pandas.Series: Series whose values are instances of the task's
-                response format, aligned with the original Series index.
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
-        client = BatchResponses.of_task(
-            client=_get_openai_client(),
-            model_name=_RESPONSES_MODEL_NAME,
-            task=task
+        client: BatchEmbeddings = BatchEmbeddings(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         return pd.Series(
-            client.parse(self._obj.tolist(), batch_size=batch_size),
+            client.create(self._obj.tolist()),
             index=self._obj.index,
             name=self._obj.name,
         )
 
-    def embeddings(self, batch_size: int = 128) -> pd.Series:
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = True, **api_kwargs) -> pd.Series:
         """Compute OpenAI embeddings for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.embeddings()
+
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
+                show_progress=True
+            )
             ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``128``.
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
             (dtype ``float32``).
         """
-        client: BatchEmbeddings = BatchEmbeddings(
-            client=_get_openai_client(),
-            model_name=_EMBEDDINGS_MODEL_NAME,
+        return self.embeddings_with_cache(
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
         )
 
-        return pd.Series(
-            client.create(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache.
+
+        This mirrors ``responses_with_cache`` but uses the task's stored instructions
+        and response format. A supplied ``BatchingMapProxy`` enables cross‑operation
+        deduplicated reuse and external batch size / progress control.
+
+        Example:
+            ```python
+            from openaivec._cache import BatchingMapProxy
+            shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys (``model``, system instructions, user input) are managed
+            internally and cannot be overridden.
+
+        Returns:
+            pandas.Series: Task results aligned with the original Series index.
+        """
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Basic usage
+            results = reviews.ai.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = large_reviews.ai.task(
+                sentiment_task,
+                batch_size=50,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message,
+            user ``input``) are managed by the library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's response format.
+        """
+        return self.task_with_cache(
+            task=task,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
+        )
+
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with a provided cache.
+
+        This method allows external control over caching behavior while parsing
+        Series content into structured data. If no response format is provided,
+        the method automatically infers an appropriate schema by analyzing the
+        data patterns.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract customer information including name
+                and contact details"). This guides both the extraction process
+                and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat] | None, optional): Target structure
+                for the parsed data. Can be a Pydantic model class, built-in type
+                (str, int, float, bool, list, dict), or None. If None, the method
+                infers an appropriate schema based on the instructions and data.
+                Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring the schema. Only used when response_format
+                is None. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.) forwarded to
+                the underlying API calls.
+
+        Returns:
+            pandas.Series: Series containing parsed structured data. Each value
+                is an instance of the specified response_format or the inferred
+                schema model, aligned with the original Series index.
+        """
+
+        schema: SchemaInferenceOutput | None = None
+        if response_format is None:
+            schema = self.infer_schema(instructions=instructions, max_examples=max_examples, **api_kwargs)
+
+        return self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM.
+
+        This method extracts structured information from unstructured text in
+        the Series. When no response format is provided, it automatically
+        infers an appropriate schema by analyzing patterns in the data.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract product details including price,
+                category, and availability"). This guides both the extraction
+                process and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for the parsed data. Can be a Pydantic model class,
+                built-in type (str, int, float, bool, list, dict), or None.
+                If None, automatically infers a schema. Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring schema. Only used when response_format
+                is None. Defaults to 100.
+            batch_size (int | None, optional): Number of requests to process
+                per batch. None enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Display progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model.
+
+        Example:
+            ```python
+            # With explicit schema
+            from pydantic import BaseModel
+            class Product(BaseModel):
+                name: str
+                price: float
+                in_stock: bool
+
+            descriptions = pd.Series([
+                "iPhone 15 Pro - $999, available now",
+                "Samsung Galaxy S24 - $899, out of stock"
+            ])
+            products = descriptions.ai.parse(
+                "Extract product information",
+                response_format=Product
+            )
+
+            # With automatic schema inference
+            reviews = pd.Series([
+                "Great product! 5 stars. Fast shipping.",
+                "Poor quality. 2 stars. Slow delivery."
+            ])
+            parsed = reviews.ai.parse(
+                "Extract review rating and shipping feedback"
+            )
+            ```
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from Series content using AI.
+
+        This method analyzes a sample of Series values to automatically generate
+        a Pydantic model that captures the relevant information structure. The
+        inferred schema supports both flat and hierarchical (nested) structures,
+        making it suitable for complex data extraction tasks.
+
+        Args:
+            instructions (str): Plain language description of the extraction goal
+                (e.g., "Extract customer information for CRM system", "Parse
+                event details for calendar integration"). This guides which
+                fields to include and their purpose.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze for pattern detection. The method samples randomly up
+                to this limit. Higher values may improve schema quality but
+                increase inference time. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters for fine-tuning
+                the inference process.
+
+        Returns:
+            InferredSchema: A comprehensive schema object containing:
+                - instructions: Refined extraction objective statement
+                - fields: Hierarchical field specifications with names, types,
+                  descriptions, and nested structures where applicable
+                - inference_prompt: Optimized prompt for consistent extraction
+                - model: Dynamically generated Pydantic model class supporting
+                  both flat and nested structures
+                - task: PreparedTask configured for batch extraction using
+                  the inferred schema
+
+        Example:
+            ```python
+            # Simple flat structure
+            reviews = pd.Series([
+                "5 stars! Great product, fast shipping to NYC.",
+                "2 stars. Product broke, slow delivery to LA."
+            ])
+            schema = reviews.ai.infer_schema(
+                "Extract review ratings and shipping information"
+            )
+
+            # Hierarchical structure
+            orders = pd.Series([
+                "Order #123: John Doe, 123 Main St, NYC. Items: iPhone ($999), Case ($29)",
+                "Order #456: Jane Smith, 456 Oak Ave, LA. Items: iPad ($799)"
+            ])
+            schema = orders.ai.infer_schema(
+                "Extract order details including customer and items"
+            )
+            # Inferred schema may include nested structures like:
+            # - customer: {name: str, address: str, city: str}
+            # - items: [{product: str, price: float}]
+
+            # Apply the schema for extraction
+            extracted = orders.ai.task(schema.task)
+            ```
+
+        Note:
+            The inference process uses multiple AI iterations to ensure schema
+            validity. Nested structures are automatically detected when the
+            data contains hierarchical relationships. The generated Pydantic
+            model ensures type safety and validation for all extracted data.
+        """
+        inferer = CONTAINER.resolve(SchemaInferer)
+
+        input: SchemaInferenceInput = SchemaInferenceInput(
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
+            instructions=instructions,
+            **api_kwargs,
         )
+        return inferer.infer_schema(input)
 
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
@@ -382,12 +674,13 @@ class OpenAIVecSeriesAccessor:
             animals.ai.count_tokens()
             ```
         This method uses the `tiktoken` library to count tokens based on the
-        model name set by `responses_model`.
+        model name configured via `set_responses_model`.
 
         Returns:
             pandas.Series: Token counts for each element.
         """
-        return self._obj.map(_TIKTOKEN_ENCODING.encode).map(len).rename("num_tokens")
+        encoding: tiktoken.Encoding = CONTAINER.resolve(tiktoken.Encoding)
+        return self._obj.map(encoding.encode).map(len).rename("num_tokens")
 
     def extract(self) -> pd.DataFrame:
         """Expand a Series of Pydantic models/dicts into columns.
@@ -426,47 +719,65 @@ class OpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
-    def extract(self, column: str) -> pd.DataFrame:
-        """Flatten one column of Pydantic models/dicts into top‑level columns.
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Generate a response for each row after serializing it to JSON using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
 
         Example:
             ```python
+            from openaivec._cache import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
             df = pd.DataFrame([
-                {"animal": {"name": "cat", "legs": 4}},
-                {"animal": {"name": "dog", "legs": 4}},
-                {"animal": {"name": "elephant", "legs": 4}},
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
             ])
-            df.ai.extract("animal")
+            result = df.ai.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
             ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
 
         Args:
-            column (str): Column to expand.
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
-            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
-        if column not in self._obj.columns:
-            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
-
-        return (
-            self._obj.pipe(lambda df: df.reset_index(drop=True))
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
+        return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            **api_kwargs,
         )
 
     def responses(
         self,
         instructions: str,
-        response_format: Type[T] = str,
-        batch_size: int = 128,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after serialising it to JSON.
+        """Generate a response for each row after serializing it to JSON.
 
         Example:
             ```python
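All DataFrame helpers now funnel through `_df_rows_to_json_series` (defined in the second hunk) rather than repeating the inline `to_dict`/`json.dumps` pipeline each time. A sketch of what each row looks like by the time it reaches the model, mirroring that helper's body:

```python
import json
import pandas as pd

df = pd.DataFrame([{"name": "cat", "legs": 4}])

# Equivalent of _df_rows_to_json_series(df):
records = pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
    lambda x: json.dumps(x, ensure_ascii=False)
)
print(records.iloc[0])  # {"name": "cat", "legs": 4}
```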
@@ -475,105 +786,482 @@ class OpenAIVecDataFrameAccessor:
475
786
  {"name": "dog", "legs": 4},
476
787
  {"name": "elephant", "legs": 4},
477
788
  ])
789
+ # Basic usage
478
790
  df.ai.responses("what is the animal's name?")
791
+
792
+ # With progress bar for large datasets
793
+ large_df = pd.DataFrame({"id": list(range(1000))})
794
+ large_df.ai.responses(
795
+ "generate a name for this ID",
796
+ batch_size=20,
797
+ show_progress=True
798
+ )
479
799
  ```
480
- This method returns a Series of strings, each containing the
481
- assistant's response to the corresponding input.
482
- Each row is serialised to JSON before being sent to the assistant.
483
- The model used is set by the `responses_model` function.
484
- The default model is `gpt-4o-mini`.
485
800
 
486
801
  Args:
487
802
  instructions (str): System prompt for the assistant.
488
- response_format (Type[T], optional): Desired Python type of the
803
+ response_format (type[ResponseFormat], optional): Desired Python type of the
489
804
  responses. Defaults to ``str``.
490
- batch_size (int, optional): Number of requests sent in one batch.
491
- Defaults to ``128``.
492
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
493
- top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
494
-
495
- Returns:
496
- pandas.Series: Responses aligned with the DataFrame’s original index.
497
- """
498
- return self._obj.pipe(
499
- lambda df: (
500
- df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
501
- .map(lambda x: json.dumps(x, ensure_ascii=False))
502
- .ai.responses(
503
- instructions=instructions,
504
- response_format=response_format,
505
- batch_size=batch_size,
506
- temperature=temperature,
507
- top_p=top_p,
508
- )
509
- )
805
+ batch_size (int | None, optional): Number of requests sent in one batch.
806
+ Defaults to ``None`` (automatic batch size optimization
807
+ based on execution time). Set to a positive integer for fixed batch size.
808
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
809
+ **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
810
+
811
+ Returns:
812
+ pandas.Series: Responses aligned with the DataFrame's original index.
813
+ """
814
+ return self.responses_with_cache(
815
+ instructions=instructions,
816
+ cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
817
+ response_format=response_format,
818
+ **api_kwargs,
510
819
  )
511
820
 
512
- def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
513
- """Execute a prepared task on each DataFrame row after serialising it to JSON.
821
+ def task_with_cache(
822
+ self,
823
+ task: PreparedTask[ResponseFormat],
824
+ cache: BatchingMapProxy[str, ResponseFormat],
825
+ **api_kwargs,
826
+ ) -> pd.Series:
827
+ """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
828
+
829
+ Args:
830
+ task (PreparedTask): Prepared task (instructions + response_format).
831
+ cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
832
+ **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
833
+
834
+ Note:
835
+ Core routing keys are managed internally.
836
+
837
+ Returns:
838
+ pandas.Series: Task results aligned with the DataFrame's original index.
839
+ """
840
+ return _df_rows_to_json_series(self._obj).ai.task_with_cache(
841
+ task=task,
842
+ cache=cache,
843
+ **api_kwargs,
844
+ )
514
845
 
515
- This method applies a pre-configured task to each row in the DataFrame,
516
- using the task's instructions and response format to generate structured
517
- responses from the language model. Each row is serialised to JSON before
518
- being processed by the task.
846
+ def task(
847
+ self,
848
+ task: PreparedTask,
849
+ batch_size: int | None = None,
850
+ show_progress: bool = True,
851
+ **api_kwargs,
852
+ ) -> pd.Series:
853
+ """Execute a prepared task on each DataFrame row after serializing it to JSON.
519
854
 
520
855
  Example:
521
856
  ```python
522
- from openaivec.task.model import PreparedTask
523
-
857
+ from openaivec._model import PreparedTask
858
+
524
859
  # Assume you have a prepared task for data analysis
525
860
  analysis_task = PreparedTask(...)
526
-
861
+
527
862
  df = pd.DataFrame([
528
863
  {"name": "cat", "legs": 4},
529
864
  {"name": "dog", "legs": 4},
530
865
  {"name": "elephant", "legs": 4},
531
866
  ])
867
+ # Basic usage
532
868
  results = df.ai.task(analysis_task)
869
+
870
+ # With progress bar for large datasets
871
+ large_df = pd.DataFrame({"id": list(range(1000))})
872
+ results = large_df.ai.task(
873
+ analysis_task,
874
+ batch_size=50,
875
+ show_progress=True
876
+ )
533
877
  ```
534
- This method returns a Series containing the task results for each
535
- corresponding row, following the task's defined structure.
536
878
 
537
879
  Args:
538
880
  task (PreparedTask): A pre-configured task containing instructions,
539
- response format, and other parameters for processing the inputs.
540
- batch_size (int, optional): Number of requests sent in one batch
541
- to optimize API usage. Defaults to 128.
881
+ response format for processing the inputs.
882
+ batch_size (int | None, optional): Number of requests sent in one batch
883
+ to optimize API usage. Defaults to ``None`` (automatic batch size
884
+ optimization based on execution time). Set to a positive integer for fixed batch size.
885
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
886
+ **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
887
+
888
+ Note:
889
+ Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
890
+ are managed by the library and cannot be overridden.
542
891
 
543
892
  Returns:
544
893
  pandas.Series: Series whose values are instances of the task's
545
894
  response format, aligned with the DataFrame's original index.
546
895
  """
547
- return self._obj.pipe(
548
- lambda df: (
549
- df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
550
- .map(lambda x: json.dumps(x, ensure_ascii=False))
551
- .ai.task(task=task, batch_size=batch_size)
552
- )
896
+ return _df_rows_to_json_series(self._obj).ai.task(
897
+ task=task,
898
+ batch_size=batch_size,
899
+ show_progress=show_progress,
900
+ **api_kwargs,
553
901
  )
554
902
 
555
- def similarity(self, col1: str, col2: str) -> pd.Series:
556
- return self._obj.apply(
557
- lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
558
- axis=1,
559
- ).rename("similarity")
903
+ def parse_with_cache(
904
+ self,
905
+ instructions: str,
906
+ cache: BatchingMapProxy[str, ResponseFormat],
907
+ response_format: type[ResponseFormat] | None = None,
908
+ max_examples: int = 100,
909
+ **api_kwargs,
910
+ ) -> pd.Series:
911
+ """Parse DataFrame rows into structured data using an LLM with a provided cache.
560
912
 
913
+ This method processes each DataFrame row (converted to JSON) and extracts
914
+ structured information using an LLM. External cache control enables
915
+ deduplication across operations and custom batch management.
561
916
 
562
- @pd.api.extensions.register_series_accessor("aio")
563
- class AsyncOpenAIVecSeriesAccessor:
564
- """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
917
+ Args:
918
+ instructions (str): Plain language description of what information
919
+ to extract from each row (e.g., "Extract shipping details and
920
+ order status"). Guides both extraction and schema inference.
921
+ cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
922
+ instance for managing API call batching and deduplication.
923
+ Set cache.batch_size=None for automatic optimization.
924
+ response_format (type[ResponseFormat] | None, optional): Target
925
+ structure for parsed data. Can be a Pydantic model, built-in
926
+ type, or None for automatic schema inference. Defaults to None.
927
+ max_examples (int, optional): Maximum rows to analyze when inferring
928
+ schema (only used when response_format is None). Defaults to 100.
929
+ **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
930
+ frequency_penalty, presence_penalty, seed, etc.).
931
+
932
+ Returns:
933
+ pandas.Series: Series containing parsed structured data as instances
934
+ of response_format or the inferred schema model, indexed like
935
+ the original DataFrame.
936
+ """
937
+ return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
938
+ instructions=instructions,
939
+ cache=cache,
940
+ response_format=response_format,
941
+ max_examples=max_examples,
942
+ **api_kwargs,
943
+ )
944
+
945
+ def parse(
946
+ self,
947
+ instructions: str,
948
+ response_format: type[ResponseFormat] | None = None,
949
+ max_examples: int = 100,
950
+ batch_size: int | None = None,
951
+ show_progress: bool = True,
952
+ **api_kwargs,
953
+ ) -> pd.Series:
954
+ """Parse DataFrame rows into structured data using an LLM.
955
+
956
+ Each row is converted to JSON and processed to extract structured
957
+ information. When no response format is provided, the method
958
+ automatically infers an appropriate schema from the data.
959
+
960
+ Args:
961
+ instructions (str): Plain language description of extraction goals
962
+ (e.g., "Extract transaction details including amount, date,
963
+ and merchant"). Guides extraction and schema inference.
964
+ response_format (type[ResponseFormat] | None, optional): Target
965
+ structure for parsed data. Can be a Pydantic model, built-in
966
+ type, or None for automatic inference. Defaults to None.
967
+ max_examples (int, optional): Maximum rows to analyze for schema
968
+ inference (when response_format is None). Defaults to 100.
969
+ batch_size (int | None, optional): Rows per API batch. None
970
+ enables automatic optimization. Defaults to None.
971
+ show_progress (bool, optional): Show progress bar in Jupyter
972
+ notebooks. Defaults to True.
973
+ **api_kwargs: Additional OpenAI API parameters.
974
+
975
+ Returns:
976
+ pandas.Series: Parsed structured data indexed like the original
977
+ DataFrame.
978
+
979
+ Example:
980
+ ```python
981
+ df = pd.DataFrame({
982
+ 'log': [
983
+ '2024-01-01 10:00 ERROR Database connection failed',
984
+ '2024-01-01 10:05 INFO Service started successfully'
985
+ ]
986
+ })
987
+
988
+ # With automatic schema inference
989
+ parsed = df.ai.parse("Extract timestamp, level, and message")
990
+ # Returns Series with inferred structure like:
991
+ # {timestamp: str, level: str, message: str}
992
+ ```
993
+ """
994
+ return self.parse_with_cache(
995
+ instructions=instructions,
996
+ cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
997
+ response_format=response_format,
998
+ max_examples=max_examples,
999
+ **api_kwargs,
1000
+ )
1001
+
1002
+ def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
1003
+ """Infer a structured data schema from DataFrame rows using AI.
1004
+
1005
+ This method analyzes a sample of DataFrame rows to automatically infer
1006
+ a structured schema that can be used for consistent data extraction.
1007
+ Each row is converted to JSON format and analyzed to identify patterns,
1008
+ field types, and potential categorical values.
1009
+
1010
+ Args:
1011
+ instructions (str): Plain language description of how the extracted
1012
+ structured data will be used (e.g., "Extract operational metrics
1013
+ for dashboard", "Parse customer attributes for segmentation").
1014
+ This guides field relevance and helps exclude irrelevant information.
1015
+ max_examples (int): Maximum number of rows to analyze from the
1016
+ DataFrame. The method will sample randomly up to this limit.
1017
+ Defaults to 100.
1018
+
1019
+ Returns:
1020
+ InferredSchema: An object containing:
1021
+ - instructions: Normalized statement of the extraction objective
1022
+ - fields: List of field specifications with names, types, and descriptions
1023
+ - inference_prompt: Reusable prompt for future extractions
1024
+ - model: Dynamically generated Pydantic model for parsing
1025
+ - task: PreparedTask for batch extraction operations
1026
+
1027
+ Example:
1028
+ ```python
1029
+ df = pd.DataFrame({
1030
+ 'text': [
1031
+ "Order #123: Shipped to NYC, arriving Tuesday",
1032
+ "Order #456: Delayed due to weather, new ETA Friday",
1033
+ "Order #789: Delivered to customer in LA"
1034
+ ],
1035
+ 'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
1036
+ })
1037
+
1038
+ # Infer schema for logistics tracking
1039
+ schema = df.ai.infer_schema(
1040
+ instructions="Extract shipping status and location data for logistics tracking"
1041
+ )
1042
+
1043
+ # Apply the schema to extract structured data
1044
+ extracted_df = df.ai.task(schema.task)
1045
+ ```
1046
+
1047
+ Note:
1048
+ Each row is converted to JSON before analysis. The inference
1049
+ process automatically detects hierarchical relationships and
1050
+ creates appropriate nested structures when present. The generated
1051
+ Pydantic model ensures type safety and validation.
1052
+ """
1053
+ return _df_rows_to_json_series(self._obj).ai.infer_schema(
1054
+ instructions=instructions,
1055
+ max_examples=max_examples,
1056
+ **api_kwargs,
1057
+ )
1058
+
1059
+ def extract(self, column: str) -> pd.DataFrame:
1060
+ """Flatten one column of Pydantic models/dicts into top‑level columns.
1061
+
1062
+ Example:
1063
+ ```python
1064
+ df = pd.DataFrame([
1065
+ {"animal": {"name": "cat", "legs": 4}},
1066
+ {"animal": {"name": "dog", "legs": 4}},
1067
+ {"animal": {"name": "elephant", "legs": 4}},
1068
+ ])
1069
+ df.ai.extract("animal")
1070
+ ```
1071
+ This method returns a DataFrame with the same index as the original,
1072
+ where each column corresponds to a key in the dictionaries.
1073
+ The source column is dropped.
1074
+
1075
+ Args:
1076
+ column (str): Column to expand.
1077
+
1078
+ Returns:
1079
+ pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
1080
+ """
1081
+ if column not in self._obj.columns:
1082
+ raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
1083
+
1084
+ return (
1085
+ self._obj.pipe(lambda df: df.reset_index(drop=True))
1086
+ .pipe(lambda df: df.join(df[column].ai.extract()))
1087
+ .pipe(lambda df: df.set_index(self._obj.index))
1088
+ .pipe(lambda df: df.drop(columns=[column], axis=1))
1089
+ )
1090
+
1091
+ def fillna(
1092
+ self,
1093
+ target_column_name: str,
1094
+ max_examples: int = 500,
1095
+ batch_size: int | None = None,
1096
+ show_progress: bool = True,
1097
+ ) -> pd.DataFrame:
1098
+ """Fill missing values in a DataFrame column using AI-powered inference.
1099
+
1100
+ This method uses an LLM to fill missing (NaN) values in a specified
1101
+ column by analyzing patterns from the non-missing rows of the DataFrame.
1102
+ It creates a prepared task that supplies examples of similar rows to help
1103
+ the model predict appropriate values for the missing entries.
1104
+
1105
+ Args:
1106
+ target_column_name (str): The name of the column containing missing values
1107
+ that need to be filled.
1108
+ max_examples (int, optional): The maximum number of example rows to use
1109
+ for context when predicting missing values. Higher values may improve
1110
+ accuracy but increase API costs and processing time. Defaults to 500.
1111
+ batch_size (int | None, optional): Number of requests sent in one batch
1112
+ to optimize API usage. Defaults to ``None`` (automatic batch size
1113
+ optimization based on execution time). Set to a positive integer for fixed batch size.
1114
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
1115
+
1116
+ Returns:
1117
+ pandas.DataFrame: A new DataFrame with missing values filled in the target
1118
+ column. The original DataFrame is not modified.
1119
+
1120
+ Example:
1121
+ ```python
1122
+ df = pd.DataFrame({
1123
+ 'name': ['Alice', 'Bob', None, 'David'],
1124
+ 'age': [25, 30, 35, None],
1125
+ 'city': ['Tokyo', 'Osaka', 'Kyoto', 'Tokyo']
1126
+ })
1127
+
1128
+ # Fill missing values in the 'name' column
1129
+ filled_df = df.ai.fillna('name')
1130
+
1131
+ # With progress bar for large datasets
1132
+ large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
1133
+ filled_df = large_df.ai.fillna('name', batch_size=32, show_progress=True)
1134
+ ```
1135
+
1136
+ Note:
1137
+ If the target column has no missing values, the original DataFrame
1138
+ is returned unchanged.
1139
+ """
1140
+
1141
+ task: PreparedTask = fillna(self._obj, target_column_name, max_examples)
1142
+ missing_rows = self._obj[self._obj[target_column_name].isna()]
1143
+ if missing_rows.empty:
1144
+ return self._obj
1145
+
1146
+ # The task yields a Series of FillNaResponse objects aligned with missing_rows
+ filled_values: pd.Series = missing_rows.ai.task(
1147
+ task=task, batch_size=batch_size, show_progress=show_progress
1148
+ )
1149
+
1150
+ # get deep copy of the DataFrame to avoid modifying the original
1151
+ df = self._obj.copy()
1152
+
1153
+ # Get the actual indices of missing rows to map the results correctly
1154
+ missing_indices = missing_rows.index.tolist()
1155
+
1156
+ for i, result in enumerate(filled_values):
1157
+ if result.output is not None:
1158
+ # Use the actual index from the original DataFrame, not the relative index from result
1159
+ actual_index = missing_indices[i]
1160
+ df.at[actual_index, target_column_name] = result.output
1161
+
1162
+ return df
1163
+
1164
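+ The heavy lifting is done by the `fillna` prepared-task factory invoked above; a sketch of building it directly to inspect the generated prompt (the import path is an assumption based on the package's task/table module):
+ ```python
+ # Hypothetical import path; the accessor above calls a fillna() factory
+ # shipped in the package's task/table module.
+ from openaivec.task.table import fillna
+
+ task = fillna(df, "name", max_examples=100)
+ print(task.instructions)  # prompt embedding examples of similar rows
+ ```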
+ def similarity(self, col1: str, col2: str) -> pd.Series:
1165
+ """Compute cosine similarity between two columns containing embedding vectors.
1166
+
1167
+ This method calculates the cosine similarity between vectors stored in
1168
+ two columns of the DataFrame. The vectors should be numpy arrays or
1169
+ array-like objects that support dot product operations.
1170
+
1171
+ Example:
1172
+ ```python
1173
+ df = pd.DataFrame({
1174
+ 'vec1': [np.array([1, 0, 0]), np.array([0, 1, 0])],
1175
+ 'vec2': [np.array([1, 0, 0]), np.array([1, 1, 0])]
1176
+ })
1177
+ similarities = df.ai.similarity('vec1', 'vec2')
1178
+ ```
1179
+
1180
+ Args:
1181
+ col1 (str): Name of the first column containing embedding vectors.
1182
+ col2 (str): Name of the second column containing embedding vectors.
1183
+
1184
+ Returns:
1185
+ pandas.Series: Series containing cosine similarity scores between
1186
+ corresponding vectors in col1 and col2, with values ranging
1187
+ from -1 to 1, where 1 indicates identical direction.
1188
+ """
1189
+ return self._obj.apply(
1190
+ lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
1191
+ axis=1,
1192
+ ).rename("similarity") # type: ignore[arg-type]
1193
+
1194
+
1195
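+ Combined with the embeddings helpers, `similarity` gives a quick relevance score between two text columns. A minimal sketch, assuming the synchronous `.ai.embeddings()` helper defined earlier in this module and a configured client:
+ ```python
+ import pandas as pd
+
+ df = pd.DataFrame({
+     "question": ["How do I reset my password?"],
+     "answer": ["Use the account settings page to reset it."],
+ })
+ df["q_vec"] = df["question"].ai.embeddings()
+ df["a_vec"] = df["answer"].ai.embeddings()
+ df["score"] = df.ai.similarity("q_vec", "a_vec")  # cosine similarity in [-1, 1]
+ ```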
+ @pd.api.extensions.register_series_accessor("aio")
1196
+ class AsyncOpenAIVecSeriesAccessor:
1197
+ """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
565
1198
 
566
1199
  def __init__(self, series_obj: pd.Series):
567
1200
  self._obj = series_obj
568
1201
 
1202
+ async def responses_with_cache(
1203
+ self,
1204
+ instructions: str,
1205
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1206
+ response_format: type[ResponseFormat] = str,
1207
+ **api_kwargs,
1208
+ ) -> pd.Series:
1209
+ """Call an LLM once for every Series element using a provided cache (asynchronously).
1210
+
1211
+ This method allows external control over caching behavior by accepting
1212
+ a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
1213
+ across multiple operations or custom batch size management. The concurrency
1214
+ is controlled by the cache instance itself.
1215
+
1216
+ Example:
1217
+ ```python
1218
+ from openaivec._cache import AsyncBatchingMapProxy
+
+ shared = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+ result = await series.aio.responses_with_cache(
1219
+ "classify",
1220
+ cache=shared,
1221
+ max_output_tokens=256,
1222
+ frequency_penalty=0.2,
1223
+ )
1224
+ ```
1225
+
1226
+ Args:
1227
+ instructions (str): System prompt prepended to every user message.
1228
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
1229
+ instance for managing API call batching and deduplication.
1230
+ Set cache.batch_size=None to enable automatic batch size optimization.
1231
+ response_format (type[ResponseFormat], optional): Pydantic model or built‑in
1232
+ type the assistant should return. Defaults to ``str``.
1233
+ **api_kwargs: Additional keyword arguments forwarded verbatim to
1234
+ ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
1235
+ ``max_output_tokens``, penalties, future parameters). Core batching keys
1236
+ (model, instructions, input, text_format) are protected and silently
1237
+ ignored if provided.
1238
+
1239
+ Returns:
1240
+ pandas.Series: Series whose values are instances of ``response_format``.
1241
+
1242
+ Note:
1243
+ This is an asynchronous method and must be awaited.
1244
+ """
1245
+ client: AsyncBatchResponses = AsyncBatchResponses(
1246
+ client=CONTAINER.resolve(AsyncOpenAI),
1247
+ model_name=CONTAINER.resolve(ResponsesModelName).value,
1248
+ system_message=instructions,
1249
+ response_format=response_format,
1250
+ cache=cache,
1251
+ api_kwargs=api_kwargs,
1252
+ )
1253
+
1254
+ results = await client.parse(self._obj.tolist())
1255
+ return pd.Series(results, index=self._obj.index, name=self._obj.name)
1256
+
569
1257
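+ Because the proxy deduplicates by input value, passing the same instance to repeated calls means identical elements are resolved once. A sketch (`tickets` is a placeholder Series):
+ ```python
+ from openaivec._cache import AsyncBatchingMapProxy
+
+ shared = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
+ first = await tickets.aio.responses_with_cache("Classify the ticket", cache=shared)
+ # Identical inputs in the second call are served from the shared cache.
+ second = await tickets.aio.responses_with_cache("Classify the ticket", cache=shared)
+ ```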
  async def responses(
570
1258
  self,
571
1259
  instructions: str,
572
- response_format: Type[T] = str,
573
- batch_size: int = 128,
574
- temperature: float = 0.0,
575
- top_p: float = 1.0,
1260
+ response_format: type[ResponseFormat] = str,
1261
+ batch_size: int | None = None,
576
1262
  max_concurrency: int = 8,
1263
+ show_progress: bool = True,
1264
+ **api_kwargs,
577
1265
  ) -> pd.Series:
578
1266
  """Call an LLM once for every Series element (asynchronously).
579
1267
 
@@ -582,22 +1270,32 @@ class AsyncOpenAIVecSeriesAccessor:
582
1270
  animals = pd.Series(["cat", "dog", "elephant"])
583
1271
  # Must be awaited
584
1272
  results = await animals.aio.responses("translate to French")
1273
+
1274
+ # With progress bar for large datasets
1275
+ large_series = pd.Series(["data"] * 1000)
1276
+ results = await large_series.aio.responses(
1277
+ "analyze this data",
1278
+ batch_size=32,
1279
+ max_concurrency=4,
1280
+ show_progress=True
1281
+ )
585
1282
  ```
586
- This method returns a Series of strings, each containing the
587
- assistant's response to the corresponding input.
588
- The model used is set by the `responses_model` function.
589
- The default model is `gpt-4o-mini`.
590
1283
 
591
1284
  Args:
592
1285
  instructions (str): System prompt prepended to every user message.
593
- response_format (Type[T], optional): Pydantic model or built‑in
1286
+ response_format (type[ResponseFormat], optional): Pydantic model or built‑in
594
1287
  type the assistant should return. Defaults to ``str``.
595
- batch_size (int, optional): Number of prompts grouped into a single
596
- request. Defaults to ``128``.
597
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
598
- top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
1288
+ batch_size (int | None, optional): Number of prompts grouped into a single
1289
+ request. Defaults to ``None`` (automatic batch size optimization
1290
+ based on execution time). Set to a positive integer for fixed batch size.
599
1291
  max_concurrency (int, optional): Maximum number of concurrent
600
1292
  requests. Defaults to ``8``.
1293
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
1294
+ **api_kwargs: Additional keyword arguments forwarded verbatim to
1295
+ ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
1296
+ ``max_output_tokens``, penalties, future parameters). Core batching keys
1297
+ (model, instructions, input, text_format) are protected and silently
1298
+ ignored if provided.
601
1299
 
602
1300
  Returns:
603
1301
  pandas.Series: Series whose values are instances of ``response_format``.
@@ -605,18 +1303,64 @@ class AsyncOpenAIVecSeriesAccessor:
605
1303
  Note:
606
1304
  This is an asynchronous method and must be awaited.
607
1305
  """
608
- client: AsyncBatchResponses = AsyncBatchResponses(
609
- client=_get_async_openai_client(),
610
- model_name=_RESPONSES_MODEL_NAME,
611
- system_message=instructions,
1306
+ return await self.responses_with_cache(
1307
+ instructions=instructions,
1308
+ cache=AsyncBatchingMapProxy(
1309
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1310
+ ),
612
1311
  response_format=response_format,
613
- temperature=temperature,
614
- top_p=top_p,
615
- max_concurrency=max_concurrency,
1312
+ **api_kwargs,
1313
+ )
1314
+
1315
+ async def embeddings_with_cache(
1316
+ self,
1317
+ cache: AsyncBatchingMapProxy[str, np.ndarray],
1318
+ **api_kwargs,
1319
+ ) -> pd.Series:
1320
+ """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
1321
+
1322
+ This method allows external control over caching behavior by accepting
1323
+ a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
1324
+ across multiple operations or custom batch size management. The concurrency
1325
+ is controlled by the cache instance itself.
1326
+
1327
+ Example:
1328
+ ```python
1329
+ from openaivec._cache import AsyncBatchingMapProxy
1330
+ import numpy as np
1331
+
1332
+ # Create a shared cache with custom batch size and concurrency
1333
+ shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
1334
+ batch_size=64, max_concurrency=4
1335
+ )
1336
+
1337
+ animals = pd.Series(["cat", "dog", "elephant"])
1338
+ # Must be awaited
1339
+ embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
1340
+ ```
1341
+
1342
+ Args:
1343
+ cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
1344
+ instance for managing API call batching and deduplication.
1345
+ Set cache.batch_size=None to enable automatic batch size optimization.
1346
+ **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
1347
+
1348
+ Returns:
1349
+ pandas.Series: Series whose values are ``np.ndarray`` objects
1350
+ (dtype ``float32``).
1351
+
1352
+ Note:
1353
+ This is an asynchronous method and must be awaited.
1354
+ """
1355
+ client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
1356
+ client=CONTAINER.resolve(AsyncOpenAI),
1357
+ model_name=CONTAINER.resolve(EmbeddingsModelName).value,
1358
+ cache=cache,
1359
+ api_kwargs=api_kwargs,
616
1360
  )
617
1361
 
618
1362
  # Await the async operation
619
- results = await client.parse(self._obj.tolist(), batch_size=batch_size)
1363
+ results = await client.create(self._obj.tolist())
620
1364
 
621
1365
  return pd.Series(
622
1366
  results,
@@ -624,7 +1368,9 @@ class AsyncOpenAIVecSeriesAccessor:
624
1368
  name=self._obj.name,
625
1369
  )
626
1370
 
627
- async def embeddings(self, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
1371
+ async def embeddings(
1372
+ self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = True, **api_kwargs
1373
+ ) -> pd.Series:
628
1374
  """Compute OpenAI embeddings for every Series element (asynchronously).
629
1375
 
630
1376
  Example:
@@ -632,17 +1378,24 @@ class AsyncOpenAIVecSeriesAccessor:
632
1378
  animals = pd.Series(["cat", "dog", "elephant"])
633
1379
  # Must be awaited
634
1380
  embeddings = await animals.aio.embeddings()
1381
+
1382
+ # With progress bar for large datasets
1383
+ large_texts = pd.Series(["text"] * 5000)
1384
+ embeddings = await large_texts.aio.embeddings(
1385
+ batch_size=100,
1386
+ max_concurrency=4,
1387
+ show_progress=True
1388
+ )
635
1389
  ```
636
- This method returns a Series of numpy arrays, each containing the
637
- embedding vector for the corresponding input.
638
- The embedding model is set by the `embeddings_model` function.
639
- The default embedding model is `text-embedding-3-small`.
640
1390
 
641
1391
  Args:
642
- batch_size (int, optional): Number of inputs grouped into a
643
- single request. Defaults to ``128``.
1392
+ batch_size (int | None, optional): Number of inputs grouped into a
1393
+ single request. Defaults to ``None`` (automatic batch size optimization
1394
+ based on execution time). Set to a positive integer for fixed batch size.
644
1395
  max_concurrency (int, optional): Maximum number of concurrent
645
1396
  requests. Defaults to ``8``.
1397
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
1398
+ **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
646
1399
 
647
1400
  Returns:
648
1401
  pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -651,49 +1404,121 @@ class AsyncOpenAIVecSeriesAccessor:
651
1404
  Note:
652
1405
  This is an asynchronous method and must be awaited.
653
1406
  """
654
- client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
655
- client=_get_async_openai_client(),
656
- model_name=_EMBEDDINGS_MODEL_NAME,
657
- max_concurrency=max_concurrency,
1407
+ return await self.embeddings_with_cache(
1408
+ cache=AsyncBatchingMapProxy(
1409
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1410
+ ),
1411
+ **api_kwargs,
658
1412
  )
659
1413
 
660
- # Await the async operation
661
- results = await client.create(self._obj.tolist(), batch_size=batch_size)
1414
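+ Since `**api_kwargs` is forwarded verbatim to the embeddings endpoint, model-specific options ride along unchanged; a sketch using the `dimensions` parameter mentioned above (supported by text-embedding-3 models):
+ ```python
+ import pandas as pd
+
+ texts = pd.Series(["cat", "dog", "elephant"])
+ vectors = await texts.aio.embeddings(dimensions=256)
+ print(vectors.iloc[0].shape)  # (256,)
+ ```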
+ async def task_with_cache(
1415
+ self,
1416
+ task: PreparedTask[ResponseFormat],
1417
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1418
+ **api_kwargs,
1419
+ ) -> pd.Series:
1420
+ """Execute a prepared task on every Series element using a provided cache (asynchronously).
662
1421
 
663
- return pd.Series(
664
- results,
665
- index=self._obj.index,
666
- name=self._obj.name,
1422
+ This method allows external control over caching behavior by accepting
1423
+ a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
1424
+ across multiple operations or custom batch size management. The concurrency
1425
+ is controlled by the cache instance itself.
1426
+
1427
+ Args:
1428
+ task (PreparedTask): A pre-configured task containing instructions,
1429
+ and response format used to process the inputs.
1430
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
1431
+ instance for managing API call batching and deduplication.
1432
+ Set cache.batch_size=None to enable automatic batch size optimization.
1433
+ **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
1434
+
1435
+ Example:
1436
+ ```python
1437
+ from openaivec._model import PreparedTask
1438
+ from openaivec._cache import AsyncBatchingMapProxy
1439
+
1440
+ # Create a shared cache with custom batch size and concurrency
1441
+ shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
1442
+
1443
+ # Assume you have a prepared task for sentiment analysis
1444
+ sentiment_task = PreparedTask(...)
1445
+
1446
+ reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
1447
+ # Must be awaited
1448
+ results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
1449
+ ```
1450
+
1451
+ Additional Keyword Args:
1452
+ Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
1453
+ ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
1454
+ keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
1455
+ library and cannot be overridden.
1456
+
1457
+ Returns:
1458
+ pandas.Series: Series whose values are instances of the task's
1459
+ response format, aligned with the original Series index.
1460
+
1461
+ Note:
1462
+ This is an asynchronous method and must be awaited.
1463
+ """
1464
+ client = AsyncBatchResponses(
1465
+ client=CONTAINER.resolve(AsyncOpenAI),
1466
+ model_name=CONTAINER.resolve(ResponsesModelName).value,
1467
+ system_message=task.instructions,
1468
+ response_format=task.response_format,
1469
+ cache=cache,
1470
+ api_kwargs=api_kwargs,
667
1471
  )
1472
+ results = await client.parse(self._obj.tolist())
668
1473
 
669
- async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
670
- """Execute a prepared task on every Series element (asynchronously).
1474
+ return pd.Series(results, index=self._obj.index, name=self._obj.name)
671
1475
 
672
- This method applies a pre-configured task to each element in the Series,
673
- using the task's instructions and response format to generate structured
674
- responses from the language model.
1476
+ async def task(
1477
+ self,
1478
+ task: PreparedTask,
1479
+ batch_size: int | None = None,
1480
+ max_concurrency: int = 8,
1481
+ show_progress: bool = True,
1482
+ **api_kwargs,
1483
+ ) -> pd.Series:
1484
+ """Execute a prepared task on every Series element (asynchronously).
675
1485
 
676
1486
  Example:
677
1487
  ```python
678
- from openaivec.task.model import PreparedTask
679
-
1488
+ from openaivec._model import PreparedTask
1489
+
680
1490
  # Assume you have a prepared task for sentiment analysis
681
1491
  sentiment_task = PreparedTask(...)
682
-
1492
+
683
1493
  reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
684
1494
  # Must be awaited
685
1495
  results = await reviews.aio.task(sentiment_task)
1496
+
1497
+ # With progress bar for large datasets
1498
+ large_reviews = pd.Series(["review text"] * 2000)
1499
+ results = await large_reviews.aio.task(
1500
+ sentiment_task,
1501
+ batch_size=50,
1502
+ max_concurrency=4,
1503
+ show_progress=True
1504
+ )
686
1505
  ```
687
- This method returns a Series containing the task results for each
688
- corresponding input element, following the task's defined structure.
689
1506
 
690
1507
  Args:
691
1508
  task (PreparedTask): A pre-configured task containing instructions,
692
- response format, and other parameters for processing the inputs.
693
- batch_size (int, optional): Number of prompts grouped into a single
694
- request to optimize API usage. Defaults to 128.
1509
+ and response format used to process the inputs.
1510
+ batch_size (int | None, optional): Number of prompts grouped into a single
1511
+ request to optimize API usage. Defaults to ``None`` (automatic batch size
1512
+ optimization based on execution time). Set to a positive integer for fixed batch size.
695
1513
  max_concurrency (int, optional): Maximum number of concurrent
696
1514
  requests. Defaults to 8.
1515
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
1516
+ **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
1517
+
1518
+ Note:
1519
+ Additional ``**api_kwargs`` are forwarded to the Responses API. Core batching / routing
1520
+ keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
1521
+ library and cannot be overridden.
697
1522
 
698
1523
  Returns:
699
1524
  pandas.Series: Series whose values are instances of the task's
@@ -702,20 +1527,117 @@ class AsyncOpenAIVecSeriesAccessor:
702
1527
  Note:
703
1528
  This is an asynchronous method and must be awaited.
704
1529
  """
705
- client = AsyncBatchResponses.of_task(
706
- client=_get_async_openai_client(),
707
- model_name=_RESPONSES_MODEL_NAME,
1530
+ return await self.task_with_cache(
708
1531
  task=task,
709
- max_concurrency=max_concurrency,
1532
+ cache=AsyncBatchingMapProxy(
1533
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1534
+ ),
1535
+ **api_kwargs,
710
1536
  )
711
1537
 
712
- # Await the async operation
713
- results = await client.parse(self._obj.tolist(), batch_size=batch_size)
1538
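+ A minimal sketch of defining a task once and fanning it out over a Series; the `PreparedTask(...)` keyword arguments are an assumption based on the `instructions` and `response_format` attributes the library reads from the task:
+ ```python
+ import pandas as pd
+ from pydantic import BaseModel
+ from openaivec._model import PreparedTask
+
+ class Sentiment(BaseModel):
+     label: str        # e.g. "positive" / "negative"
+     confidence: float
+
+ # Constructor arguments assumed from the attributes documented above.
+ sentiment_task = PreparedTask(
+     instructions="Classify the sentiment of the review.",
+     response_format=Sentiment,
+ )
+ reviews = pd.Series(["Great product!", "Not satisfied"])
+ results = await reviews.aio.task(sentiment_task)
+ ```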
+ async def parse_with_cache(
1539
+ self,
1540
+ instructions: str,
1541
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1542
+ response_format: type[ResponseFormat] | None = None,
1543
+ max_examples: int = 100,
1544
+ **api_kwargs,
1545
+ ) -> pd.Series:
1546
+ """Parse Series values into structured data using an LLM with a provided cache (asynchronously).
714
1547
 
715
- return pd.Series(
716
- results,
717
- index=self._obj.index,
718
- name=self._obj.name,
1548
+ This async method provides external cache control while parsing Series
1549
+ content into structured data. Automatic schema inference is performed
1550
+ when no response format is specified.
1551
+
1552
+ Args:
1553
+ instructions (str): Plain language description of what to extract
1554
+ (e.g., "Extract dates, amounts, and descriptions from receipts").
1555
+ Guides both extraction and schema inference.
1556
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
1557
+ async cache for managing concurrent API calls and deduplication.
1558
+ Set cache.batch_size=None for automatic optimization.
1559
+ response_format (type[ResponseFormat] | None, optional): Target
1560
+ structure for parsed data. Can be a Pydantic model, built-in
1561
+ type, or None for automatic inference. Defaults to None.
1562
+ max_examples (int, optional): Maximum values to analyze for schema
1563
+ inference (when response_format is None). Defaults to 100.
1564
+ **api_kwargs: Additional OpenAI API parameters.
1565
+
1566
+ Returns:
1567
+ pandas.Series: Series containing parsed structured data aligned
1568
+ with the original index.
1569
+
1570
+ Note:
1571
+ This is an asynchronous method and must be awaited.
1572
+ """
1573
+ schema: SchemaInferenceOutput | None = None
1574
+ if response_format is None:
1575
+ # Use synchronous schema inference
1576
+ schema = self._obj.ai.infer_schema(instructions=instructions, max_examples=max_examples)
1577
+
1578
+ return await self.responses_with_cache(
1579
+ instructions=schema.inference_prompt if schema else instructions,
1580
+ cache=cache,
1581
+ response_format=response_format or schema.model,
1582
+ **api_kwargs,
1583
+ )
1584
+
1585
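+ Passing an explicit Pydantic model skips schema inference entirely, so only the extraction calls hit the API. A sketch (`emails` as in the example below):
+ ```python
+ from pydantic import BaseModel
+ from openaivec._cache import AsyncBatchingMapProxy
+
+ class Meeting(BaseModel):
+     time: str
+     person: str
+     topic: str
+
+ cache = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
+ parsed = await emails.aio.parse_with_cache(
+     "Extract meeting details including time, person, and topic",
+     cache=cache,
+     response_format=Meeting,  # no inference pass when a format is given
+ )
+ ```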
+ async def parse(
1586
+ self,
1587
+ instructions: str,
1588
+ response_format: type[ResponseFormat] | None = None,
1589
+ max_examples: int = 100,
1590
+ batch_size: int | None = None,
1591
+ max_concurrency: int = 8,
1592
+ show_progress: bool = True,
1593
+ **api_kwargs,
1594
+ ) -> pd.Series:
1595
+ """Parse Series values into structured data using an LLM (asynchronously).
1596
+
1597
+ Async version of the parse method, extracting structured information
1598
+ from unstructured text with automatic schema inference when needed.
1599
+
1600
+ Args:
1601
+ instructions (str): Plain language extraction goals (e.g., "Extract
1602
+ product names, prices, and categories from descriptions").
1603
+ response_format (type[ResponseFormat] | None, optional): Target
1604
+ structure. None triggers automatic schema inference. Defaults to None.
1605
+ max_examples (int, optional): Maximum values for schema inference.
1606
+ Defaults to 100.
1607
+ batch_size (int | None, optional): Requests per batch. None for
1608
+ automatic optimization. Defaults to None.
1609
+ max_concurrency (int, optional): Maximum concurrent API requests.
1610
+ Defaults to 8.
1611
+ show_progress (bool, optional): Show progress bar. Defaults to True.
1612
+ **api_kwargs: Additional OpenAI API parameters.
1613
+
1614
+ Returns:
1615
+ pandas.Series: Parsed structured data indexed like the original Series.
1616
+
1617
+ Example:
1618
+ ```python
1619
+ emails = pd.Series([
1620
+ "Meeting tomorrow at 3pm with John about Q4 planning",
1621
+ "Lunch with Sarah on Friday to discuss new project"
1622
+ ])
1623
+
1624
+ # Async extraction with schema inference
1625
+ parsed = await emails.aio.parse(
1626
+ "Extract meeting details including time, person, and topic"
1627
+ )
1628
+ ```
1629
+
1630
+ Note:
1631
+ This is an asynchronous method and must be awaited.
1632
+ """
1633
+ return await self.parse_with_cache(
1634
+ instructions=instructions,
1635
+ cache=AsyncBatchingMapProxy(
1636
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1637
+ ),
1638
+ response_format=response_format,
1639
+ max_examples=max_examples,
1640
+ **api_kwargs,
719
1641
  )
720
1642
 
721
1643
 
@@ -726,82 +1648,167 @@ class AsyncOpenAIVecDataFrameAccessor:
726
1648
  def __init__(self, df_obj: pd.DataFrame):
727
1649
  self._obj = df_obj
728
1650
 
1651
+ async def responses_with_cache(
1652
+ self,
1653
+ instructions: str,
1654
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1655
+ response_format: type[ResponseFormat] = str,
1656
+ **api_kwargs,
1657
+ ) -> pd.Series:
1658
+ """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
1659
+
1660
+ This method allows external control over caching behavior by accepting
1661
+ a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
1662
+ across multiple operations or custom batch size management. The concurrency
1663
+ is controlled by the cache instance itself.
1664
+
1665
+ Example:
1666
+ ```python
1667
+ from openaivec._cache import AsyncBatchingMapProxy
1668
+
1669
+ # Create a shared cache with custom batch size and concurrency
1670
+ shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
1671
+
1672
+ df = pd.DataFrame([
1673
+ {"name": "cat", "legs": 4},
1674
+ {"name": "dog", "legs": 4},
1675
+ {"name": "elephant", "legs": 4},
1676
+ ])
1677
+ # Must be awaited
1678
+ result = await df.aio.responses_with_cache(
1679
+ "what is the animal's name?",
1680
+ cache=shared_cache
1681
+ )
1682
+ ```
1683
+
1684
+ Args:
1685
+ instructions (str): System prompt for the assistant.
1686
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
1687
+ instance for managing API call batching and deduplication.
1688
+ Set cache.batch_size=None to enable automatic batch size optimization.
1689
+ response_format (type[ResponseFormat], optional): Desired Python type of the
1690
+ responses. Defaults to ``str``.
1691
+ **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
1692
+
1693
+ Returns:
1694
+ pandas.Series: Responses aligned with the DataFrame's original index.
1695
+
1696
+ Note:
1697
+ This is an asynchronous method and must be awaited.
1698
+ """
1699
+ # Await the call to the async Series method using .aio
1700
+ return await _df_rows_to_json_series(self._obj).aio.responses_with_cache(
1701
+ instructions=instructions,
1702
+ cache=cache,
1703
+ response_format=response_format,
1704
+ **api_kwargs,
1705
+ )
1706
+
729
1707
  async def responses(
730
1708
  self,
731
1709
  instructions: str,
732
- response_format: Type[T] = str,
733
- batch_size: int = 128,
734
- temperature: float = 0.0,
735
- top_p: float = 1.0,
1710
+ response_format: type[ResponseFormat] = str,
1711
+ batch_size: int | None = None,
736
1712
  max_concurrency: int = 8,
1713
+ show_progress: bool = True,
1714
+ **api_kwargs,
737
1715
  ) -> pd.Series:
738
- """Generate a response for each row after serialising it to JSON (asynchronously).
1716
+ """Generate a response for each row after serializing it to JSON (asynchronously).
739
1717
 
740
1718
  Example:
741
1719
  ```python
742
1720
  df = pd.DataFrame([
743
- {\"name\": \"cat\", \"legs\": 4},
744
- {\"name\": \"dog\", \"legs\": 4},
745
- {\"name\": \"elephant\", \"legs\": 4},
1721
+ {"name": "cat", "legs": 4},
1722
+ {"name": "dog", "legs": 4},
1723
+ {"name": "elephant", "legs": 4},
746
1724
  ])
747
1725
  # Must be awaited
748
- results = await df.aio.responses(\"what is the animal\'s name?\")
1726
+ results = await df.aio.responses("what is the animal's name?")
1727
+
1728
+ # With progress bar for large datasets
1729
+ large_df = pd.DataFrame({"id": list(range(1000))})
1730
+ results = await large_df.aio.responses(
1731
+ "generate a name for this ID",
1732
+ batch_size=20,
1733
+ max_concurrency=4,
1734
+ show_progress=True
1735
+ )
749
1736
  ```
750
- This method returns a Series of strings, each containing the
751
- assistant's response to the corresponding input.
752
- Each row is serialised to JSON before being sent to the assistant.
753
- The model used is set by the `responses_model` function.
754
- The default model is `gpt-4o-mini`.
755
1737
 
756
1738
  Args:
757
1739
  instructions (str): System prompt for the assistant.
758
- response_format (Type[T], optional): Desired Python type of the
1740
+ response_format (type[ResponseFormat], optional): Desired Python type of the
759
1741
  responses. Defaults to ``str``.
760
- batch_size (int, optional): Number of requests sent in one batch.
761
- Defaults to ``128``.
762
- temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
763
- top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
1742
+ batch_size (int | None, optional): Number of requests sent in one batch.
1743
+ Defaults to ``None`` (automatic batch size optimization
1744
+ based on execution time). Set to a positive integer for fixed batch size.
1745
764
1746
  max_concurrency (int, optional): Maximum number of concurrent
765
1747
  requests. Defaults to ``8``.
1748
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+ **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
766
1749
 
767
1750
  Returns:
768
- pandas.Series: Responses aligned with the DataFrames original index.
1751
+ pandas.Series: Responses aligned with the DataFrame's original index.
769
1752
 
770
1753
  Note:
771
1754
  This is an asynchronous method and must be awaited.
772
1755
  """
773
- series_of_json = self._obj.pipe(
774
- lambda df: (
775
- pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
776
- lambda x: json.dumps(x, ensure_ascii=False)
777
- )
778
- )
779
- )
780
- # Await the call to the async Series method using .aio
781
- return await series_of_json.aio.responses(
1756
+ return await self.responses_with_cache(
782
1757
  instructions=instructions,
1758
+ cache=AsyncBatchingMapProxy(
1759
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1760
+ ),
783
1761
  response_format=response_format,
784
- batch_size=batch_size,
785
- temperature=temperature,
786
- top_p=top_p,
787
- max_concurrency=max_concurrency,
1762
+ **api_kwargs,
788
1763
  )
789
1764
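+ For reference, what a row looks like once serialized — this mirrors the `json.dumps(..., ensure_ascii=False)` call visible in the removed code above:
+ ```python
+ import json
+ import pandas as pd
+
+ df = pd.DataFrame([{"name": "cat", "legs": 4}])
+ payload = json.dumps(df.to_dict(orient="records")[0], ensure_ascii=False)
+ print(payload)  # {"name": "cat", "legs": 4}
+ ```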
 
790
- async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
791
- """Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).
1765
+ async def task_with_cache(
1766
+ self,
1767
+ task: PreparedTask[ResponseFormat],
1768
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1769
+ **api_kwargs,
1770
+ ) -> pd.Series:
1771
+ """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
1772
+
1773
+ After serializing each row to JSON, this method executes the prepared task.
1774
+
1775
+ Args:
1776
+ task (PreparedTask): Prepared task (instructions + response_format).
1777
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
1778
+ **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
792
1779
 
793
- This method applies a pre-configured task to each row in the DataFrame,
794
- using the task's instructions and response format to generate structured
795
- responses from the language model. Each row is serialised to JSON before
796
- being processed by the task.
1780
+ Returns:
1781
+ pandas.Series: Task results aligned with the DataFrame's original index.
1782
+
1783
+ Note:
1784
+ Core batching / routing keys (``model``, ``instructions`` / system message,
1785
+ user ``input``) are managed by the library and cannot be overridden.
1786
+ This is an asynchronous method and must be awaited.
1787
1788
+ """
1789
+ return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
1790
+ task=task,
1791
+ cache=cache,
1792
+ **api_kwargs,
1793
+ )
1794
+
1795
+ async def task(
1796
+ self,
1797
+ task: PreparedTask,
1798
+ batch_size: int | None = None,
1799
+ max_concurrency: int = 8,
1800
+ show_progress: bool = True,
1801
+ **api_kwargs,
1802
+ ) -> pd.Series:
1803
+ """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).
797
1804
 
798
1805
  Example:
799
1806
  ```python
800
- from openaivec.task.model import PreparedTask
801
-
1807
+ from openaivec._model import PreparedTask
1808
+
802
1809
  # Assume you have a prepared task for data analysis
803
1810
  analysis_task = PreparedTask(...)
804
-
1811
+
805
1812
  df = pd.DataFrame([
806
1813
  {"name": "cat", "legs": 4},
807
1814
  {"name": "dog", "legs": 4},
@@ -809,17 +1816,31 @@ class AsyncOpenAIVecDataFrameAccessor:
809
1816
  ])
810
1817
  # Must be awaited
811
1818
  results = await df.aio.task(analysis_task)
1819
+
1820
+ # With progress bar for large datasets
1821
+ large_df = pd.DataFrame({"id": list(range(1000))})
1822
+ results = await large_df.aio.task(
1823
+ analysis_task,
1824
+ batch_size=50,
1825
+ max_concurrency=4,
1826
+ show_progress=True
1827
+ )
812
1828
  ```
813
- This method returns a Series containing the task results for each
814
- corresponding row, following the task's defined structure.
815
1829
 
816
1830
  Args:
817
1831
  task (PreparedTask): A pre-configured task containing instructions,
818
- response format, and other parameters for processing the inputs.
819
- batch_size (int, optional): Number of requests sent in one batch
820
- to optimize API usage. Defaults to 128.
1832
+ response format for processing the inputs.
1833
+ batch_size (int | None, optional): Number of requests sent in one batch
1834
+ to optimize API usage. Defaults to ``None`` (automatic batch size
1835
+ optimization based on execution time). Set to a positive integer for fixed batch size.
821
1836
  max_concurrency (int, optional): Maximum number of concurrent
822
1837
  requests. Defaults to 8.
1838
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
1839
+ **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
1840
+
1841
+ Note:
1842
+ Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
1843
+ are managed by the library and cannot be overridden.
823
1844
 
824
1845
  Returns:
825
1846
  pandas.Series: Series whose values are instances of the task's
@@ -828,27 +1849,131 @@ class AsyncOpenAIVecDataFrameAccessor:
828
1849
  Note:
829
1850
  This is an asynchronous method and must be awaited.
830
1851
  """
831
- series_of_json = self._obj.pipe(
832
- lambda df: (
833
- pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
834
- lambda x: json.dumps(x, ensure_ascii=False)
835
- )
836
- )
837
- )
838
1852
  # Await the call to the async Series method using .aio
839
- return await series_of_json.aio.task(
1853
+ return await _df_rows_to_json_series(self._obj).aio.task(
840
1854
  task=task,
841
1855
  batch_size=batch_size,
842
1856
  max_concurrency=max_concurrency,
1857
+ show_progress=show_progress,
1858
+ **api_kwargs,
843
1859
  )
844
1860
 
845
- async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
1861
+ async def parse_with_cache(
1862
+ self,
1863
+ instructions: str,
1864
+ cache: AsyncBatchingMapProxy[str, ResponseFormat],
1865
+ response_format: type[ResponseFormat] | None = None,
1866
+ max_examples: int = 100,
1867
+ **api_kwargs,
1868
+ ) -> pd.Series:
1869
+ """Parse DataFrame rows into structured data using an LLM with cache (asynchronously).
1870
+
1871
+ Async method for parsing DataFrame rows (as JSON) with external cache
1872
+ control, enabling deduplication across operations and concurrent processing.
1873
+
1874
+ Args:
1875
+ instructions (str): Plain language extraction goals (e.g., "Extract
1876
+ invoice details including items, quantities, and totals").
1877
+ cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
1878
+ async cache for concurrent API call management.
1879
+ response_format (type[ResponseFormat] | None, optional): Target
1880
+ structure. None triggers automatic schema inference. Defaults to None.
1881
+ max_examples (int, optional): Maximum rows for schema inference.
1882
+ Defaults to 100.
1883
+ **api_kwargs: Additional OpenAI API parameters.
1884
+
1885
+ Returns:
1886
+ pandas.Series: Parsed structured data indexed like the original DataFrame.
1887
+
1888
+ Note:
1889
+ This is an asynchronous method and must be awaited.
1890
+ """
1891
+ return await _df_rows_to_json_series(self._obj).aio.parse_with_cache(
1892
+ instructions=instructions,
1893
+ cache=cache,
1894
+ response_format=response_format,
1895
+ max_examples=max_examples,
1896
+ **api_kwargs,
1897
+ )
1898
+
1899
+ async def parse(
1900
+ self,
1901
+ instructions: str,
1902
+ response_format: type[ResponseFormat] | None = None,
1903
+ max_examples: int = 100,
1904
+ batch_size: int | None = None,
1905
+ max_concurrency: int = 8,
1906
+ show_progress: bool = True,
1907
+ **api_kwargs,
1908
+ ) -> pd.Series:
1909
+ """Parse DataFrame rows into structured data using an LLM (asynchronously).
1910
+
1911
+ Async version for extracting structured information from DataFrame rows,
1912
+ with automatic schema inference when no format is specified.
1913
+
1914
+ Args:
1915
+ instructions (str): Plain language extraction goals (e.g., "Extract
1916
+ customer details, order items, and payment information").
1917
+ response_format (type[ResponseFormat] | None, optional): Target
1918
+ structure. None triggers automatic inference. Defaults to None.
1919
+ max_examples (int, optional): Maximum rows for schema inference.
1920
+ Defaults to 100.
1921
+ batch_size (int | None, optional): Rows per batch. None for
1922
+ automatic optimization. Defaults to None.
1923
+ max_concurrency (int, optional): Maximum concurrent requests.
1924
+ Defaults to 8.
1925
+ show_progress (bool, optional): Show progress bar. Defaults to True.
1926
+ **api_kwargs: Additional OpenAI API parameters.
1927
+
1928
+ Returns:
1929
+ pandas.Series: Parsed structured data indexed like the original DataFrame.
1930
+
1931
+ Example:
1932
+ ```python
1933
+ df = pd.DataFrame({
1934
+ 'raw_data': [
1935
+ 'Customer: John Doe, Order: 2 laptops @ $1200 each',
1936
+ 'Customer: Jane Smith, Order: 5 phones @ $800 each'
1937
+ ]
1938
+ })
1939
+
1940
+ # Async parsing with automatic schema inference
1941
+ parsed = await df.aio.parse(
1942
+ "Extract customer name, product, quantity, and unit price"
1943
+ )
1944
+ ```
1945
+
1946
+ Note:
1947
+ This is an asynchronous method and must be awaited.
846
1948
  """
847
- Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
1949
+ return await self.parse_with_cache(
1950
+ instructions=instructions,
1951
+ cache=AsyncBatchingMapProxy(
1952
+ batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
1953
+ ),
1954
+ response_format=response_format,
1955
+ max_examples=max_examples,
1956
+ **api_kwargs,
1957
+ )
1958
+
1959
+ async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
1960
+ """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
848
1961
 
849
1962
  This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
850
1963
  but with support for asynchronous functions.
851
1964
 
1965
+ Example:
1966
+ ```python
1967
+ import asyncio
+
+ async def process_data(df):
1968
+ # Simulate an asynchronous computation
1969
+ await asyncio.sleep(1)
1970
+ return df.dropna()
1971
+
1972
+ df = pd.DataFrame({"col": [1, 2, None, 4]})
1973
+ # Must be awaited
1974
+ result = await df.aio.pipe(process_data)
1975
+ ```
1976
+
852
1977
  Args:
853
1978
  func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
854
1979
  as input and returns either a result or an awaitable result.
@@ -865,7 +1990,7 @@ class AsyncOpenAIVecDataFrameAccessor:
865
1990
  else:
866
1991
  return result
867
1992
 
868
- async def assign(self, **kwargs: Any) -> pd.DataFrame:
1993
+ async def assign(self, **kwargs) -> pd.DataFrame:
869
1994
  """Asynchronously assign new columns to the DataFrame, evaluating sequentially.
870
1995
 
871
1996
  This method extends pandas' `assign` method by supporting asynchronous
@@ -900,7 +2025,7 @@ class AsyncOpenAIVecDataFrameAccessor:
900
2025
  ```
901
2026
 
902
2027
  Args:
903
- **kwargs: Any. Column names as keys and either static values or callables
2028
+ **kwargs: Column names as keys and either static values or callables
904
2029
  (synchronous or asynchronous) as values.
905
2030
 
906
2031
  Returns:
@@ -923,3 +2048,88 @@ class AsyncOpenAIVecDataFrameAccessor:
923
2048
  df_current[key] = column_data
924
2049
 
925
2050
  return df_current
2051
+
2052
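+ In practice the async `assign` pairs naturally with the `.aio` Series helpers, since a column callable may return an awaitable. A minimal sketch with placeholder data:
+ ```python
+ import pandas as pd
+
+ df = pd.DataFrame({"text": ["Great product!", "Not satisfied"]})
+ df = await df.aio.assign(
+     # The callable returns a coroutine, which assign awaits for us.
+     sentiment=lambda d: d["text"].aio.responses("Classify the sentiment"),
+ )
+ ```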
+ async def fillna(
2053
+ self,
2054
+ target_column_name: str,
2055
+ max_examples: int = 500,
2056
+ batch_size: int | None = None,
2057
+ max_concurrency: int = 8,
2058
+ show_progress: bool = True,
2059
+ ) -> pd.DataFrame:
2060
+ """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
2061
+
2062
+ This method uses an LLM to fill missing (NaN) values in a specified
2063
+ column by analyzing patterns from the non-missing rows of the DataFrame.
2064
+ It creates a prepared task that supplies examples of similar rows to help
2065
+ the model predict appropriate values for the missing entries.
2066
+
2067
+ Args:
2068
+ target_column_name (str): The name of the column containing missing values
2069
+ that need to be filled.
2070
+ max_examples (int, optional): The maximum number of example rows to use
2071
+ for context when predicting missing values. Higher values may improve
2072
+ accuracy but increase API costs and processing time. Defaults to 500.
2073
+ batch_size (int | None, optional): Number of requests sent in one batch
2074
+ to optimize API usage. Defaults to ``None`` (automatic batch size
2075
+ optimization based on execution time). Set to a positive integer for fixed batch size.
2076
+ max_concurrency (int, optional): Maximum number of concurrent
2077
+ requests. Defaults to 8.
2078
+ show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
2079
+
2080
+ Returns:
2081
+ pandas.DataFrame: A new DataFrame with missing values filled in the target
2082
+ column. The original DataFrame is not modified.
2083
+
2084
+ Example:
2085
+ ```python
2086
+ df = pd.DataFrame({
2087
+ 'name': ['Alice', 'Bob', None, 'David'],
2088
+ 'age': [25, 30, 35, None],
2089
+ 'city': ['Tokyo', 'Osaka', 'Kyoto', 'Tokyo']
2090
+ })
2091
+
2092
+ # Fill missing values in the 'name' column (must be awaited)
2093
+ filled_df = await df.aio.fillna('name')
2094
+
2095
+ # With progress bar for large datasets
2096
+ large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
2097
+ filled_df = await large_df.aio.fillna(
2098
+ 'name',
2099
+ batch_size=32,
2100
+ max_concurrency=4,
2101
+ show_progress=True
2102
+ )
2103
+ ```
2104
+
2105
+ Note:
2106
+ This is an asynchronous method and must be awaited.
2107
+ If the target column has no missing values, the original DataFrame
2108
+ is returned unchanged.
2109
+ """
2110
+
2111
+ task: PreparedTask = fillna(self._obj, target_column_name, max_examples)
2112
+ missing_rows = self._obj[self._obj[target_column_name].isna()]
2113
+ if missing_rows.empty:
2114
+ return self._obj
2115
+
2116
+ # The task yields a Series of FillNaResponse objects aligned with missing_rows
+ filled_values: pd.Series = await missing_rows.aio.task(
2117
+ task=task,
2118
+ batch_size=batch_size,
2119
+ max_concurrency=max_concurrency,
2120
+ show_progress=show_progress,
2121
+ )
2122
+
2123
+ # get deep copy of the DataFrame to avoid modifying the original
2124
+ df = self._obj.copy()
2125
+
2126
+ # Get the actual indices of missing rows to map the results correctly
2127
+ missing_indices = missing_rows.index.tolist()
2128
+
2129
+ for i, result in enumerate(filled_values):
2130
+ if result.output is not None:
2131
+ # Use the actual index from the original DataFrame, not the relative index from result
2132
+ actual_index = missing_indices[i]
2133
+ df.at[actual_index, target_column_name] = result.output
2134
+
2135
+ return df
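+ Outside a notebook there is no running event loop, so the `.aio` methods need an explicit driver; a minimal sketch using `asyncio.run`:
+ ```python
+ import asyncio
+ import pandas as pd
+
+ async def main() -> pd.DataFrame:
+     df = pd.DataFrame({"name": ["Alice", None], "city": ["Tokyo", "Osaka"]})
+     return await df.aio.fillna("name")
+
+ if __name__ == "__main__":
+     print(asyncio.run(main()))
+ ```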