openaivec 0.12.5__py3-none-any.whl → 1.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. openaivec/__init__.py +13 -4
  2. openaivec/_cache/__init__.py +12 -0
  3. openaivec/_cache/optimize.py +109 -0
  4. openaivec/_cache/proxy.py +806 -0
  5. openaivec/{di.py → _di.py} +36 -12
  6. openaivec/_embeddings.py +203 -0
  7. openaivec/{log.py → _log.py} +2 -2
  8. openaivec/_model.py +113 -0
  9. openaivec/{prompt.py → _prompt.py} +95 -28
  10. openaivec/_provider.py +207 -0
  11. openaivec/_responses.py +511 -0
  12. openaivec/_schema/__init__.py +9 -0
  13. openaivec/_schema/infer.py +340 -0
  14. openaivec/_schema/spec.py +350 -0
  15. openaivec/_serialize.py +234 -0
  16. openaivec/{util.py → _util.py} +25 -85
  17. openaivec/pandas_ext.py +1496 -318
  18. openaivec/spark.py +485 -183
  19. openaivec/task/__init__.py +9 -7
  20. openaivec/task/customer_support/__init__.py +9 -15
  21. openaivec/task/customer_support/customer_sentiment.py +17 -15
  22. openaivec/task/customer_support/inquiry_classification.py +23 -22
  23. openaivec/task/customer_support/inquiry_summary.py +14 -13
  24. openaivec/task/customer_support/intent_analysis.py +21 -19
  25. openaivec/task/customer_support/response_suggestion.py +16 -16
  26. openaivec/task/customer_support/urgency_analysis.py +24 -25
  27. openaivec/task/nlp/__init__.py +4 -4
  28. openaivec/task/nlp/dependency_parsing.py +10 -12
  29. openaivec/task/nlp/keyword_extraction.py +11 -14
  30. openaivec/task/nlp/morphological_analysis.py +12 -14
  31. openaivec/task/nlp/named_entity_recognition.py +16 -18
  32. openaivec/task/nlp/sentiment_analysis.py +14 -11
  33. openaivec/task/nlp/translation.py +6 -9
  34. openaivec/task/table/__init__.py +2 -2
  35. openaivec/task/table/fillna.py +11 -11
  36. openaivec-1.0.10.dist-info/METADATA +399 -0
  37. openaivec-1.0.10.dist-info/RECORD +39 -0
  38. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
  39. openaivec/embeddings.py +0 -172
  40. openaivec/model.py +0 -67
  41. openaivec/provider.py +0 -45
  42. openaivec/responses.py +0 -393
  43. openaivec/serialize.py +0 -225
  44. openaivec-0.12.5.dist-info/METADATA +0 -696
  45. openaivec-0.12.5.dist-info/RECORD +0 -33
  46. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
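
The headline change in this release is an API rename in `pandas_ext` (visible in the hunks below): `use`/`use_async` become `set_client`/`set_async_client`, and `responses_model`/`embeddings_model` become `set_responses_model`/`set_embeddings_model`, each now paired with a getter. A minimal migration sketch based solely on the renames in this diff:

```python
from openai import OpenAI
from openaivec import pandas_ext

client = OpenAI(api_key="your-api-key")

# 0.12.5
# pandas_ext.use(client)
# pandas_ext.responses_model("gpt-4.1-mini")
# pandas_ext.embeddings_model("text-embedding-3-small")

# 1.0.10
pandas_ext.set_client(client)
pandas_ext.set_responses_model("gpt-4.1-mini")
pandas_ext.set_embeddings_model("text-embedding-3-small")
```

Note also that `di.py`, `log.py`, `prompt.py`, and `util.py` move behind an underscore prefix (`_di.py`, `_log.py`, `_prompt.py`, `_util.py`), signalling they are no longer public API.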
openaivec/pandas_ext.py CHANGED
@@ -7,28 +7,35 @@ from openaivec import pandas_ext
 
     # Option 1: Use environment variables (automatic detection)
     # Set OPENAI_API_KEY or Azure OpenAI environment variables
-    # (AZURE_OPENAI_API_KEY, AZURE_OPENAI_API_ENDPOINT, AZURE_OPENAI_API_VERSION)
+    # (AZURE_OPENAI_API_KEY, AZURE_OPENAI_BASE_URL, AZURE_OPENAI_API_VERSION)
     # No explicit setup needed - clients are automatically created
 
-    # Option 2: Use an existing OpenAI client instance
+    # Option 2: Register an existing OpenAI client instance
     client = OpenAI(api_key="your-api-key")
-    pandas_ext.use(client)
+    pandas_ext.set_client(client)
 
-    # Option 3: Use an existing Azure OpenAI client instance
+    # Option 3: Register an Azure OpenAI client instance
     azure_client = AzureOpenAI(
         api_key="your-azure-key",
-        azure_endpoint="https://your-resource.openai.azure.com",
-        api_version="2024-02-01"
+        base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+        api_version="preview"
     )
-    pandas_ext.use(azure_client)
+    pandas_ext.set_client(azure_client)
 
-    # Option 4: Use async clients
-    async_client = AsyncOpenAI(api_key="your-api-key")
-    pandas_ext.use_async(async_client)
+    # Option 4: Register an async Azure OpenAI client instance
+    async_azure_client = AsyncAzureOpenAI(
+        api_key="your-azure-key",
+        base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+        api_version="preview"
+    )
+    pandas_ext.set_async_client(async_azure_client)
 
     # Set up model names (optional, defaults shown)
-    pandas_ext.responses_model("gpt-4.1-mini")
-    pandas_ext.embeddings_model("text-embedding-3-small")
+    pandas_ext.set_responses_model("gpt-4.1-mini")
+    pandas_ext.set_embeddings_model("text-embedding-3-small")
+
+    # Inspect current configuration
+    configured_model = pandas_ext.get_responses_model()
     ```
 
 This module provides `.ai` and `.aio` accessors for pandas Series and DataFrames
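
For Option 1 above, no registration call is needed at all; clients are built from environment variables on first use. A sketch, assuming the variable names listed in the new docstring (note that `AZURE_OPENAI_BASE_URL` replaces the old `AZURE_OPENAI_API_ENDPOINT`):

```python
import os

# Plain OpenAI...
os.environ["OPENAI_API_KEY"] = "your-api-key"

# ...or Azure OpenAI (v1-style base URL).
os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-key"
os.environ["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
os.environ["AZURE_OPENAI_API_VERSION"] = "preview"

import pandas as pd
from openaivec import pandas_ext  # noqa: E402  (clients are created automatically)

pd.Series(["bonjour"]).ai.responses("translate to English")
```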
@@ -38,7 +45,8 @@ to easily interact with OpenAI APIs for tasks like generating responses or embed
 import inspect
 import json
 import logging
-from typing import Any, Awaitable, Callable, List, Type, TypeVar
+from collections.abc import Awaitable, Callable
+from typing import TypeVar
 
 import numpy as np
 import pandas as pd
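
The import churn here is the standard modern-Python typing migration that the new signatures below rely on: `Awaitable`/`Callable` now come from `collections.abc`, and `Type[...]`/`Optional`-style hints give way to `type[...]` and `X | None`. A sketch (the function name is illustrative only):

```python
from collections.abc import Callable  # was: from typing import Callable

# 0.12.5-style hints            ->  1.0.10-style hints
# response_format: Type[str]    ->  response_format: type[str]
# batch_size: int = 128         ->  batch_size: int | None = None
def responses_stub(response_format: type[str] = str, batch_size: int | None = None) -> None:
    ...
```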
@@ -46,87 +54,126 @@ import tiktoken
 from openai import AsyncOpenAI, OpenAI
 from pydantic import BaseModel
 
-from .di import Container
-from .embeddings import AsyncBatchEmbeddings, BatchEmbeddings
-from .model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
-from .provider import provide_async_openai_client, provide_openai_client
-from .responses import AsyncBatchResponses, BatchResponses
-from .task.table import FillNaResponse, fillna
+from openaivec._cache import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec._embeddings import AsyncBatchEmbeddings, BatchEmbeddings
+from openaivec._model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+from openaivec._provider import CONTAINER, _check_azure_v1_api_url
+from openaivec._responses import AsyncBatchResponses, BatchResponses
+from openaivec._schema import SchemaInferenceInput, SchemaInferenceOutput, SchemaInferer
+from openaivec.task.table import FillNaResponse, fillna
 
 __all__ = [
-    "use",
-    "use_async",
-    "responses_model",
-    "embeddings_model",
+    "get_async_client",
+    "get_client",
+    "get_embeddings_model",
+    "get_responses_model",
+    "set_async_client",
+    "set_client",
+    "set_embeddings_model",
+    "set_responses_model",
 ]
 
 _LOGGER = logging.getLogger(__name__)
 
 
+# ---------------------------------------------------------------------------
+# Internal helpers (not exported)
+# ---------------------------------------------------------------------------
+def _df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
+    """Return a Series of JSON strings (UTF-8, no ASCII escaping) representing DataFrame rows.
+
+    Each element is the JSON serialisation of the corresponding row as a dict. Index and
+    name are preserved so downstream operations retain alignment. This consolidates the
+    previously duplicated inline pipeline used by responses*/task* DataFrame helpers.
+    """
+    return pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
+        lambda x: json.dumps(x, ensure_ascii=False)
+    )
+
+
 T = TypeVar("T")  # For pipe function return type
 
-_DI = Container()
-_DI.register(OpenAI, provide_openai_client)
-_DI.register(AsyncOpenAI, provide_async_openai_client)
-_DI.register(ResponsesModelName, lambda: ResponsesModelName("gpt-4.1-mini"))
-_DI.register(EmbeddingsModelName, lambda: EmbeddingsModelName("text-embedding-3-small"))
 
+def set_client(client: OpenAI) -> None:
+    """Register a custom OpenAI-compatible client for pandas helpers.
 
-def _provide_tiktoken_encoding() -> tiktoken.Encoding:
-    model_name = _DI.resolve(ResponsesModelName).value
-    try:
-        return tiktoken.encoding_for_model(model_name)
-    except KeyError:
-        _LOGGER.info(
-            "The model name '%s' is not supported by tiktoken. Using 'o200k_base' encoding instead.",
-            model_name,
-        )
-        return tiktoken.get_encoding("o200k_base")
+    Args:
+        client (OpenAI): A pre-configured `openai.OpenAI` or
+            `openai.AzureOpenAI` instance reused by every helper in this module.
+    """
+    if client.__class__.__name__ == "AzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(OpenAI, lambda: client)
 
-_DI.register(tiktoken.Encoding, _provide_tiktoken_encoding)
 
+def get_client() -> OpenAI:
+    """Get the currently registered OpenAI-compatible client.
 
-def use(client: OpenAI) -> None:
-    """Register a custom OpenAI‑compatible client.
+    Returns:
+        OpenAI: The registered `openai.OpenAI` or `openai.AzureOpenAI` instance.
+    """
+    return CONTAINER.resolve(OpenAI)
+
+
+def set_async_client(client: AsyncOpenAI) -> None:
+    """Register a custom asynchronous OpenAI-compatible client.
 
     Args:
-        client (OpenAI): A preconfigured `openai.OpenAI` or
-            `openai.AzureOpenAI` instance.
-            The same instance is reused by every helper in this module.
+        client (AsyncOpenAI): A pre-configured `openai.AsyncOpenAI` or
+            `openai.AsyncAzureOpenAI` instance reused by every helper in this module.
     """
-    _DI.register(OpenAI, lambda: client)
+    if client.__class__.__name__ == "AsyncAzureOpenAI" and hasattr(client, "base_url"):
+        _check_azure_v1_api_url(str(client.base_url))
 
+    CONTAINER.register(AsyncOpenAI, lambda: client)
 
-def use_async(client: AsyncOpenAI) -> None:
-    """Register a custom asynchronous OpenAI‑compatible client.
 
-    Args:
-        client (AsyncOpenAI): A pre‑configured `openai.AsyncOpenAI` or
-            `openai.AsyncAzureOpenAI` instance.
-            The same instance is reused by every helper in this module.
+def get_async_client() -> AsyncOpenAI:
+    """Get the currently registered asynchronous OpenAI-compatible client.
+
+    Returns:
+        AsyncOpenAI: The registered `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance.
     """
-    _DI.register(AsyncOpenAI, lambda: client)
+    return CONTAINER.resolve(AsyncOpenAI)
 
 
-def responses_model(name: str) -> None:
+def set_responses_model(name: str) -> None:
     """Override the model used for text responses.
 
     Args:
-        name (str): Model name as listed in the OpenAI API
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
             (for example, ``gpt-4.1-mini``).
     """
-    _DI.register(ResponsesModelName, lambda: ResponsesModelName(name))
-    _DI.register(tiktoken.Encoding, _provide_tiktoken_encoding)
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(name))
+
+
+def get_responses_model() -> str:
+    """Get the currently registered model name for text responses.
+
+    Returns:
+        str: The model name (for example, ``gpt-4.1-mini``).
+    """
+    return CONTAINER.resolve(ResponsesModelName).value
 
 
-def embeddings_model(name: str) -> None:
+def set_embeddings_model(name: str) -> None:
     """Override the model used for text embeddings.
 
     Args:
-        name (str): Embedding model name, e.g. ``text-embedding-3-small``.
+        name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name,
+            e.g. ``text-embedding-3-small``.
     """
-    _DI.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(name))
+
+
+def get_embeddings_model() -> str:
+    """Get the currently registered model name for text embeddings.
+
+    Returns:
+        str: The model name (for example, ``text-embedding-3-small``).
+    """
+    return CONTAINER.resolve(EmbeddingsModelName).value
 
 
 def _extract_value(x, series_name):
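
The hunk above replaces the module-private `_DI` container with the shared `CONTAINER` from `openaivec._provider` and pairs every setter with a getter. A round-trip sketch of the new configuration surface:

```python
from openaivec import pandas_ext

pandas_ext.set_responses_model("gpt-4.1-mini")
pandas_ext.set_embeddings_model("text-embedding-3-small")

assert pandas_ext.get_responses_model() == "gpt-4.1-mini"
assert pandas_ext.get_embeddings_model() == "text-embedding-3-small"

sync_client = pandas_ext.get_client()         # openai.OpenAI or AzureOpenAI
async_client = pandas_ext.get_async_client()  # openai.AsyncOpenAI or AsyncAzureOpenAI
```

The removed tiktoken fallback (`_provide_tiktoken_encoding`) has not disappeared outright; `count_tokens` still resolves a `tiktoken.Encoding` from `CONTAINER` in a later hunk, so the provisioning presumably now lives in `openaivec._provider`.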
@@ -160,124 +207,463 @@ class OpenAIVecSeriesAccessor:
     def __init__(self, series_obj: pd.Series):
         self._obj = series_obj
 
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Arbitrary OpenAI Responses API parameters (e.g. ``temperature``,
+                ``top_p``, ``frequency_penalty``, ``presence_penalty``, ``seed``, etc.) are
+                forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
     def responses(
         self,
         instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int = 128,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element.
 
         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+
+            # With custom temperature
+            animals.ai.responses(
+                "translate creatively",
+                temperature=0.8
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
             instructions (str): System prompt prepended to every user message.
-            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``128``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
-        client: BatchResponses = BatchResponses(
-            client=_DI.resolve(OpenAI),
-            model_name=_DI.resolve(ResponsesModelName).value,
-            system_message=instructions,
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
+            **api_kwargs,
+        )
+
+    def embeddings_with_cache(
+        self,
+        cache: BatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
+
+        Example:
+            ```python
+            from openaivec._cache import BatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy[str, np.ndarray](batch_size=64)
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional keyword arguments to pass to the OpenAI API.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+        """
+        client: BatchEmbeddings = BatchEmbeddings(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         return pd.Series(
-            client.parse(self._obj.tolist(), batch_size=batch_size),
+            client.create(self._obj.tolist()),
             index=self._obj.index,
             name=self._obj.name,
         )
 
-    def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
-        """Execute a prepared task on every Series element.
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = True, **api_kwargs) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element.
+
+        Example:
+            ```python
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
+            animals.ai.embeddings()
 
-        This method applies a pre-configured task to each element in the Series,
-        using the task's instructions and response format to generate structured
-        responses from the language model.
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+        """
+        return self.embeddings_with_cache(
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
+        )
+
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache.
+
+        This mirrors ``responses_with_cache`` but uses the task's stored instructions
+        and response format. A supplied ``BatchingMapProxy`` enables cross‑operation
+        deduplicated reuse and external batch size / progress control.
+
+        Example:
+            ```python
+            from openaivec._cache import BatchingMapProxy
+            shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys (``model``, system instructions, user input) are managed
+            internally and cannot be overridden.
+
+        Returns:
+            pandas.Series: Task results aligned with the original Series index.
+        """
+        client: BatchResponses = BatchResponses(
+            client=CONTAINER.resolve(OpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+        return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element.
 
         Example:
             ```python
-            from openaivec.model import PreparedTask
+            from openaivec._model import PreparedTask
 
             # Assume you have a prepared task for sentiment analysis
             sentiment_task = PreparedTask(...)
 
             reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Basic usage
             results = reviews.ai.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = large_reviews.ai.task(
+                sentiment_task,
+                batch_size=50,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to 128.
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message,
+            user ``input``) are managed by the library and cannot be overridden.
 
         Returns:
-            pandas.Series: Series whose values are instances of the task's
-                response format, aligned with the original Series index.
+            pandas.Series: Series whose values are instances of the task's response format.
         """
-        client = BatchResponses.of_task(
-            client=_DI.resolve(OpenAI), model_name=_DI.resolve(ResponsesModelName).value, task=task
+        return self.task_with_cache(
+            task=task,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            **api_kwargs,
         )
 
-        return pd.Series(
-            client.parse(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values using an LLM with a provided cache.
+
+        This method allows external control over caching behavior while parsing
+        Series content into structured data. If no response format is provided,
+        the method automatically infers an appropriate schema by analyzing the
+        data patterns.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract customer information including name
+                and contact details"). This guides both the extraction process
+                and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat] | None, optional): Target structure
+                for the parsed data. Can be a Pydantic model class, built-in type
+                (str, int, float, bool, list, dict), or None. If None, the method
+                infers an appropriate schema based on the instructions and data.
+                Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring the schema. Only used when response_format
+                is None. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.) forwarded to
+                the underlying API calls.
+
+        Returns:
+            pandas.Series: Series containing parsed structured data. Each value
+                is an instance of the specified response_format or the inferred
+                schema model, aligned with the original Series index.
+        """
+
+        schema: SchemaInferenceOutput | None = None
+        if response_format is None:
+            schema = self.infer_schema(instructions=instructions, max_examples=max_examples, **api_kwargs)
+
+        return self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            **api_kwargs,
         )
 
-    def embeddings(self, batch_size: int = 128) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element.
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM.
+
+        This method extracts structured information from unstructured text in
+        the Series. When no response format is provided, it automatically
+        infers an appropriate schema by analyzing patterns in the data.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract (e.g., "Extract product details including price,
+                category, and availability"). This guides both the extraction
+                process and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for the parsed data. Can be a Pydantic model class,
+                built-in type (str, int, float, bool, list, dict), or None.
+                If None, automatically infers a schema. Defaults to None.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze when inferring schema. Only used when response_format
+                is None. Defaults to 100.
+            batch_size (int | None, optional): Number of requests to process
+                per batch. None enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Display progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model.
 
         Example:
             ```python
-            animals = pd.Series(["cat", "dog", "elephant"])
-            animals.ai.embeddings()
+            # With explicit schema
+            from pydantic import BaseModel
+            class Product(BaseModel):
+                name: str
+                price: float
+                in_stock: bool
+
+            descriptions = pd.Series([
+                "iPhone 15 Pro - $999, available now",
+                "Samsung Galaxy S24 - $899, out of stock"
+            ])
+            products = descriptions.ai.parse(
+                "Extract product information",
+                response_format=Product
+            )
+
+            # With automatic schema inference
+            reviews = pd.Series([
+                "Great product! 5 stars. Fast shipping.",
+                "Poor quality. 2 stars. Slow delivery."
+            ])
+            parsed = reviews.ai.parse(
+                "Extract review rating and shipping feedback"
+            )
             ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from Series content using AI.
+
+        This method analyzes a sample of Series values to automatically generate
+        a Pydantic model that captures the relevant information structure. The
+        inferred schema supports both flat and hierarchical (nested) structures,
+        making it suitable for complex data extraction tasks.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``128``.
+            instructions (str): Plain language description of the extraction goal
+                (e.g., "Extract customer information for CRM system", "Parse
+                event details for calendar integration"). This guides which
+                fields to include and their purpose.
+            max_examples (int, optional): Maximum number of Series values to
+                analyze for pattern detection. The method samples randomly up
+                to this limit. Higher values may improve schema quality but
+                increase inference time. Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters for fine-tuning
+                the inference process.
 
         Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
+            InferredSchema: A comprehensive schema object containing:
+                - instructions: Refined extraction objective statement
+                - fields: Hierarchical field specifications with names, types,
+                    descriptions, and nested structures where applicable
+                - inference_prompt: Optimized prompt for consistent extraction
+                - model: Dynamically generated Pydantic model class supporting
+                    both flat and nested structures
+                - task: PreparedTask configured for batch extraction using
+                    the inferred schema
+
+        Example:
+            ```python
+            # Simple flat structure
+            reviews = pd.Series([
+                "5 stars! Great product, fast shipping to NYC.",
+                "2 stars. Product broke, slow delivery to LA."
+            ])
+            schema = reviews.ai.infer_schema(
+                "Extract review ratings and shipping information"
+            )
+
+            # Hierarchical structure
+            orders = pd.Series([
+                "Order #123: John Doe, 123 Main St, NYC. Items: iPhone ($999), Case ($29)",
+                "Order #456: Jane Smith, 456 Oak Ave, LA. Items: iPad ($799)"
+            ])
+            schema = orders.ai.infer_schema(
+                "Extract order details including customer and items"
+            )
+            # Inferred schema may include nested structures like:
+            # - customer: {name: str, address: str, city: str}
+            # - items: [{product: str, price: float}]
+
+            # Apply the schema for extraction
+            extracted = orders.ai.task(schema.task)
+            ```
+
+        Note:
+            The inference process uses multiple AI iterations to ensure schema
+            validity. Nested structures are automatically detected when the
+            data contains hierarchical relationships. The generated Pydantic
+            model ensures type safety and validation for all extracted data.
         """
-        client: BatchEmbeddings = BatchEmbeddings(
-            client=_DI.resolve(OpenAI),
-            model_name=_DI.resolve(EmbeddingsModelName).value,
-        )
+        inferer = CONTAINER.resolve(SchemaInferer)
 
-        return pd.Series(
-            client.create(self._obj.tolist(), batch_size=batch_size),
-            index=self._obj.index,
-            name=self._obj.name,
+        input: SchemaInferenceInput = SchemaInferenceInput(
+            examples=self._obj.sample(n=min(max_examples, len(self._obj))).tolist(),
+            instructions=instructions,
+            **api_kwargs,
         )
+        return inferer.infer_schema(input)
 
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
@@ -288,12 +674,12 @@ class OpenAIVecSeriesAccessor:
             animals.ai.count_tokens()
             ```
         This method uses the `tiktoken` library to count tokens based on the
-        model name set by `responses_model`.
+        model name configured via `set_responses_model`.
 
         Returns:
             pandas.Series: Token counts for each element.
         """
-        encoding: tiktoken.Encoding = _DI.resolve(tiktoken.Encoding)
+        encoding: tiktoken.Encoding = CONTAINER.resolve(tiktoken.Encoding)
         return self._obj.map(encoding.encode).map(len).rename("num_tokens")
 
     def extract(self) -> pd.DataFrame:
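
Since `count_tokens` returns a plain integer Series, request-size budgeting stays in pandas. A sketch (the per-token price is hypothetical):

```python
import pandas as pd

docs = pd.Series(["short text", "a considerably longer piece of text to send"])
num_tokens = docs.ai.count_tokens()   # Series named "num_tokens"

total = int(num_tokens.sum())         # total input tokens
est_cost = total / 1_000_000 * 0.40   # assumed $0.40 per 1M input tokens
```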
@@ -333,47 +719,65 @@ class OpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
-    def extract(self, column: str) -> pd.DataFrame:
-        """Flatten one column of Pydantic models/dicts into top‑level columns.
+    def responses_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Generate a response for each row after serializing it to JSON using a provided cache.
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured BatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management.
 
         Example:
             ```python
+            from openaivec._cache import BatchingMapProxy
+
+            # Create a shared cache with custom batch size
+            shared_cache = BatchingMapProxy(batch_size=64)
+
             df = pd.DataFrame([
-                {"animal": {"name": "cat", "legs": 4}},
-                {"animal": {"name": "dog", "legs": 4}},
-                {"animal": {"name": "elephant", "legs": 4}},
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
             ])
-            df.ai.extract("animal")
+            result = df.ai.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
             ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
 
         Args:
-            column (str): Column to expand.
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
 
         Returns:
-            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
-        if column not in self._obj.columns:
-            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
-
-        return (
-            self._obj.pipe(lambda df: df.reset_index(drop=True))
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
+        return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            **api_kwargs,
         )
 
     def responses(
         self,
         instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int = 128,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after serialising it to JSON.
+        """Generate a response for each row after serializing it to JSON.
 
         Example:
             ```python
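
Every DataFrame helper in this class now funnels through `_df_rows_to_json_series` (added in an earlier hunk), so the model receives one JSON object per row. A sketch of the serialization it performs:

```python
import json
import pandas as pd

df = pd.DataFrame([{"name": "cat", "legs": 4}, {"name": "dog", "legs": 4}])

# Equivalent to the new helper: one JSON string per row, index preserved.
records = pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
    lambda x: json.dumps(x, ensure_ascii=False)
)
print(records.tolist())
# ['{"name": "cat", "legs": 4}', '{"name": "dog", "legs": 4}']
```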
@@ -382,51 +786,75 @@ class OpenAIVecDataFrameAccessor:
                 {"name": "dog", "legs": 4},
                 {"name": "elephant", "legs": 4},
             ])
+            # Basic usage
             df.ai.responses("what is the animal's name?")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            large_df.ai.responses(
+                "generate a name for this ID",
+                batch_size=20,
+                show_progress=True
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
             instructions (str): System prompt for the assistant.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
+            response_format (type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``128``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame’s original index.
-        """
-        return self._obj.pipe(
-            lambda df: (
-                df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
-                .map(lambda x: json.dumps(x, ensure_ascii=False))
-                .ai.responses(
-                    instructions=instructions,
-                    response_format=response_format,
-                    batch_size=batch_size,
-                    temperature=temperature,
-                    top_p=top_p,
-                )
-            )
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+        """
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            **api_kwargs,
         )
 
-    def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after serialising it to JSON.
+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys are managed internally.
 
-        This method applies a pre-configured task to each row in the DataFrame,
-        using the task's instructions and response format to generate structured
-        responses from the language model. Each row is serialised to JSON before
-        being processed by the task.
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+        """
+        return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
+    def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON.
 
         Example:
             ```python
-            from openaivec.model import PreparedTask
+            from openaivec._model import PreparedTask
 
             # Assume you have a prepared task for data analysis
             analysis_task = PreparedTask(...)
@@ -436,30 +864,237 @@ class OpenAIVecDataFrameAccessor:
                 {"name": "dog", "legs": 4},
                 {"name": "elephant", "legs": 4},
             ])
+            # Basic usage
             results = df.ai.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = large_df.ai.task(
+                analysis_task,
+                batch_size=50,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to 128.
+                response format for processing the inputs.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
+            are managed by the library and cannot be overridden.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
                 response format, aligned with the DataFrame's original index.
         """
-        return self._obj.pipe(
-            lambda df: (
-                df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
-                .map(lambda x: json.dumps(x, ensure_ascii=False))
-                .ai.task(task=task, batch_size=batch_size)
+        return _df_rows_to_json_series(self._obj).ai.task(
+            task=task,
+            batch_size=batch_size,
+            show_progress=show_progress,
+            **api_kwargs,
+        )
+
+    def parse_with_cache(
+        self,
+        instructions: str,
+        cache: BatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM with a provided cache.
+
+        This method processes each DataFrame row (converted to JSON) and extracts
+        structured information using an LLM. External cache control enables
+        deduplication across operations and custom batch management.
+
+        Args:
+            instructions (str): Plain language description of what information
+                to extract from each row (e.g., "Extract shipping details and
+                order status"). Guides both extraction and schema inference.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze when inferring
+                schema (only used when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p,
+                frequency_penalty, presence_penalty, seed, etc.).
+
+        Returns:
+            pandas.Series: Series containing parsed structured data as instances
+                of response_format or the inferred schema model, indexed like
+                the original DataFrame.
+        """
+        return _df_rows_to_json_series(self._obj).ai.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM.
+
+        Each row is converted to JSON and processed to extract structured
+        information. When no response format is provided, the method
+        automatically infers an appropriate schema from the data.
+
+        Args:
+            instructions (str): Plain language description of extraction goals
+                (e.g., "Extract transaction details including amount, date,
+                and merchant"). Guides extraction and schema inference.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            batch_size (int | None, optional): Rows per API batch. None
+                enables automatic optimization. Defaults to None.
+            show_progress (bool, optional): Show progress bar in Jupyter
+                notebooks. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original
+                DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'log': [
+                    '2024-01-01 10:00 ERROR Database connection failed',
+                    '2024-01-01 10:05 INFO Service started successfully'
+                ]
+            })
+
+            # With automatic schema inference
+            parsed = df.ai.parse("Extract timestamp, level, and message")
+            # Returns Series with inferred structure like:
+            # {timestamp: str, level: str, message: str}
+            ```
+        """
+        return self.parse_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def infer_schema(self, instructions: str, max_examples: int = 100, **api_kwargs) -> SchemaInferenceOutput:
+        """Infer a structured data schema from DataFrame rows using AI.
+
+        This method analyzes a sample of DataFrame rows to automatically infer
+        a structured schema that can be used for consistent data extraction.
+        Each row is converted to JSON format and analyzed to identify patterns,
+        field types, and potential categorical values.
+
+        Args:
+            instructions (str): Plain language description of how the extracted
+                structured data will be used (e.g., "Extract operational metrics
+                for dashboard", "Parse customer attributes for segmentation").
+                This guides field relevance and helps exclude irrelevant information.
+            max_examples (int): Maximum number of rows to analyze from the
+                DataFrame. The method will sample randomly up to this limit.
+                Defaults to 100.
+
+        Returns:
+            InferredSchema: An object containing:
+                - instructions: Normalized statement of the extraction objective
+                - fields: List of field specifications with names, types, and descriptions
+                - inference_prompt: Reusable prompt for future extractions
+                - model: Dynamically generated Pydantic model for parsing
+                - task: PreparedTask for batch extraction operations
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'text': [
+                    "Order #123: Shipped to NYC, arriving Tuesday",
+                    "Order #456: Delayed due to weather, new ETA Friday",
+                    "Order #789: Delivered to customer in LA"
+                ],
+                'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03']
+            })
+
+            # Infer schema for logistics tracking
+            schema = df.ai.infer_schema(
+                instructions="Extract shipping status and location data for logistics tracking"
             )
+
+            # Apply the schema to extract structured data
+            extracted_df = df.ai.task(schema.task)
+            ```
+
+        Note:
+            Each row is converted to JSON before analysis. The inference
+            process automatically detects hierarchical relationships and
+            creates appropriate nested structures when present. The generated
+            Pydantic model ensures type safety and validation.
+        """
+        return _df_rows_to_json_series(self._obj).ai.infer_schema(
+            instructions=instructions,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    def extract(self, column: str) -> pd.DataFrame:
+        """Flatten one column of Pydantic models/dicts into top‑level columns.
+
+        Example:
+            ```python
+            df = pd.DataFrame([
+                {"animal": {"name": "cat", "legs": 4}},
+                {"animal": {"name": "dog", "legs": 4}},
+                {"animal": {"name": "elephant", "legs": 4}},
+            ])
+            df.ai.extract("animal")
+            ```
+        This method returns a DataFrame with the same index as the original,
+        where each column corresponds to a key in the dictionaries.
+        The source column is dropped.
+
+        Args:
+            column (str): Column to expand.
+
+        Returns:
+            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
+        """
+        if column not in self._obj.columns:
+            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+        return (
+            self._obj.pipe(lambda df: df.reset_index(drop=True))
+            .pipe(lambda df: df.join(df[column].ai.extract()))
+            .pipe(lambda df: df.set_index(self._obj.index))
+            .pipe(lambda df: df.drop(columns=[column], axis=1))
         )
 
-    def fillna(self, target_column_name: str, max_examples: int = 500, batch_size: int = 128) -> pd.DataFrame:
+    def fillna(
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        show_progress: bool = True,
+    ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference.
 
         This method uses machine learning to intelligently fill missing (NaN) values
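
`parse` returns a Series of Pydantic instances and `extract` flattens such a column, so the two compose into a text-to-columns pipeline. A sketch, assuming an explicit model (the `LogEntry` class and column names are illustrative):

```python
import pandas as pd
from pydantic import BaseModel

class LogEntry(BaseModel):
    timestamp: str
    level: str
    message: str

df = pd.DataFrame({"log": ["2024-01-01 10:00 ERROR Database connection failed"]})

df["entry"] = df.ai.parse("Extract timestamp, level, and message", response_format=LogEntry)
flat = df.ai.extract("entry")  # flattens the model column into top-level columns and drops it
```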
@@ -473,8 +1108,10 @@ class OpenAIVecDataFrameAccessor:
             max_examples (int, optional): The maximum number of example rows to use
                 for context when predicting missing values. Higher values may improve
                 accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to 128.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
 
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
@@ -490,6 +1127,10 @@ class OpenAIVecDataFrameAccessor:
 
             # Fill missing values in the 'name' column
             filled_df = df.ai.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = large_df.ai.fillna('name', batch_size=32, show_progress=True)
             ```
 
         Note:
@@ -502,7 +1143,9 @@ class OpenAIVecDataFrameAccessor:
         if missing_rows.empty:
             return self._obj
 
-        filled_values: List[FillNaResponse] = missing_rows.ai.task(task=task, batch_size=batch_size)
+        filled_values: list[FillNaResponse] = missing_rows.ai.task(
+            task=task, batch_size=batch_size, show_progress=show_progress
+        )
 
         # get deep copy of the DataFrame to avoid modifying the original
         df = self._obj.copy()
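
As the refactored body above shows, `fillna` only submits `missing_rows`, so cost scales with the number of NaNs rather than the table size. A usage sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "name": ["cat", None, "elephant", None],
    "legs": [4, 4, 4, 2],
})

# Only the two rows with a missing 'name' are sent to the model.
filled = df.ai.fillna("name", batch_size=32, show_progress=False)
```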
@@ -519,27 +1162,106 @@ class OpenAIVecDataFrameAccessor:
         return df
 
     def similarity(self, col1: str, col2: str) -> pd.Series:
+        """Compute cosine similarity between two columns containing embedding vectors.
+
+        This method calculates the cosine similarity between vectors stored in
+        two columns of the DataFrame. The vectors should be numpy arrays or
+        array-like objects that support dot product operations.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'vec1': [np.array([1, 0, 0]), np.array([0, 1, 0])],
+                'vec2': [np.array([1, 0, 0]), np.array([1, 1, 0])]
+            })
+            similarities = df.ai.similarity('vec1', 'vec2')
+            ```
+
+        Args:
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
+
+        Returns:
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
+        """
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
             axis=1,
-        ).rename("similarity")
+        ).rename("similarity")  # type: ignore[arg-type]
+
+
+@pd.api.extensions.register_series_accessor("aio")
+class AsyncOpenAIVecSeriesAccessor:
+    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
+
+    def __init__(self, series_obj: pd.Series):
+        self._obj = series_obj
+
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            result = await series.aio.responses_with_cache(
+                "classify",
+                cache=shared,
+                max_output_tokens=256,
+                frequency_penalty=0.2,
+            )
+            ```
 
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
 
-@pd.api.extensions.register_series_accessor("aio")
-class AsyncOpenAIVecSeriesAccessor:
-    """pandas Series accessor (``.aio``) that adds OpenAI helpers."""
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
 
-    def __init__(self, series_obj: pd.Series):
-        self._obj = series_obj
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchResponses = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=instructions,
+            response_format=response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
+        )
+
+        results = await client.parse(self._obj.tolist())
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
     async def responses(
         self,
         instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int = 128,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
         max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
         """Call an LLM once for every Series element (asynchronously).
 
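The cache-sharing behavior described in `responses_with_cache` can be exercised across several Series. A minimal sketch (assumes the environment-variable client setup from the module docstring; the series contents are illustrative):

```python
import asyncio

import pandas as pd
from openaivec import pandas_ext  # noqa: F401  # registers the .aio accessor
from openaivec._cache import AsyncBatchingMapProxy


async def main() -> None:
    # One shared cache: duplicate inputs across both calls are requested only once.
    shared = AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
    first = pd.Series(["cat", "dog", "cat"])
    second = pd.Series(["dog", "elephant"])
    fr_first = await first.aio.responses_with_cache("translate to French", cache=shared)
    fr_second = await second.aio.responses_with_cache("translate to French", cache=shared)
    print(fr_first.tolist(), fr_second.tolist())


asyncio.run(main())
```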
@@ -548,22 +1270,32 @@ class AsyncOpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             # Must be awaited
             results = await animals.aio.responses("translate to French")
+
+            # With progress bar for large datasets
+            large_series = pd.Series(["data"] * 1000)
+            results = await large_series.aio.responses(
+                "analyze this data",
+                batch_size=32,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
             instructions (str): System prompt prepended to every user message.
-            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+            response_format (type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``128``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``temperature``, ``top_p``,
+                ``max_output_tokens``, penalties, future parameters). Core batching keys
+                (model, instructions, input, text_format) are protected and silently
+                ignored if provided.
 
         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
@@ -571,18 +1303,64 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-        client: AsyncBatchResponses = AsyncBatchResponses(
-            client=_DI.resolve(AsyncOpenAI),
-            model_name=_DI.resolve(ResponsesModelName).value,
-            system_message=instructions,
+        return await self.responses_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
             response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
-            max_concurrency=max_concurrency,
+            **api_kwargs,
+        )
+
+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._cache import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
 
         # Await the async operation
-        results = await client.parse(self._obj.tolist(), batch_size=batch_size)
+        results = await client.create(self._obj.tolist())
 
         return pd.Series(
             results,
@@ -590,7 +1368,9 @@ class AsyncOpenAIVecSeriesAccessor:
             name=self._obj.name,
         )
 
-    async def embeddings(self, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
+    async def embeddings(
+        self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = True, **api_kwargs
+    ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element (asynchronously).
 
         Example:
@@ -598,17 +1378,24 @@ class AsyncOpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             # Must be awaited
             embeddings = await animals.aio.embeddings()
+
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = await large_texts.aio.embeddings(
+                batch_size=100,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series of numpy arrays, each containing the
-        embedding vector for the corresponding input.
-        The embedding model is set by the `embeddings_model` function.
-        The default embedding model is `text-embedding-3-small`.
 
         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``128``.
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
 
         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
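The async embeddings helper pairs naturally with the synchronous `df.ai.similarity` shown earlier in this file. A small end-to-end sketch (assumes the environment-variable client setup; the texts are toy data):

```python
import asyncio

import pandas as pd
from openaivec import pandas_ext  # noqa: F401  # registers the .ai / .aio accessors


async def main() -> None:
    texts = pd.Series(["cat", "dog"], name="text")
    vectors = await texts.aio.embeddings(show_progress=False)
    # Compare each text against the other by reversing the column.
    df = pd.DataFrame({"vec1": vectors, "vec2": vectors.iloc[::-1].to_numpy()})
    print(df.ai.similarity("vec1", "vec2"))


asyncio.run(main())
```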
@@ -617,31 +1404,88 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
-            client=_DI.resolve(AsyncOpenAI),
-            model_name=_DI.resolve(EmbeddingsModelName).value,
-            max_concurrency=max_concurrency,
+        return await self.embeddings_with_cache(
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            **api_kwargs,
         )
 
-        # Await the async operation
-        results = await client.create(self._obj.tolist(), batch_size=batch_size)
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
 
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                and response format for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+            from openaivec._cache import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            api_kwargs=api_kwargs,
         )
+        results = await client.parse(self._obj.tolist())
 
-    async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
-        """Execute a prepared task on every Series element (asynchronously).
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
 
-        This method applies a pre-configured task to each element in the Series,
-        using the task's instructions and response format to generate structured
-        responses from the language model.
+    async def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element (asynchronously).
 
         Example:
             ```python
-            from openaivec.model import PreparedTask
+            from openaivec._model import PreparedTask
 
             # Assume you have a prepared task for sentiment analysis
             sentiment_task = PreparedTask(...)
@@ -649,17 +1493,32 @@ class AsyncOpenAIVecSeriesAccessor:
             reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
             # Must be awaited
             results = await reviews.aio.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = await large_reviews.aio.task(
+                sentiment_task,
+                batch_size=50,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding input element, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to 128.
+                and response format for processing the inputs.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            The task's stored API parameters are used. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -668,20 +1527,117 @@ class AsyncOpenAIVecSeriesAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-        client = AsyncBatchResponses.of_task(
-            client=_DI.resolve(AsyncOpenAI),
-            model_name=_DI.resolve(ResponsesModelName).value,
+        return await self.task_with_cache(
             task=task,
-            max_concurrency=max_concurrency,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            **api_kwargs,
         )
 
-        # Await the async operation
-        results = await client.parse(self._obj.tolist(), batch_size=batch_size)
+    async def parse_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM with a provided cache (asynchronously).
 
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
+        This async method provides external cache control while parsing Series
+        content into structured data. Automatic schema inference is performed
+        when no response format is specified.
+
+        Args:
+            instructions (str): Plain language description of what to extract
+                (e.g., "Extract dates, amounts, and descriptions from receipts").
+                Guides both extraction and schema inference.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for managing concurrent API calls and deduplication.
+                Set cache.batch_size=None for automatic optimization.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure for parsed data. Can be a Pydantic model, built-in
+                type, or None for automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum values to analyze for schema
+                inference (when response_format is None). Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Series containing parsed structured data aligned
+                with the original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        schema: SchemaInferenceOutput | None = None
+        if response_format is None:
+            # Use synchronous schema inference
+            schema = self._obj.ai.infer_schema(instructions=instructions, max_examples=max_examples)
+
+        return await self.responses_with_cache(
+            instructions=schema.inference_prompt if schema else instructions,
+            cache=cache,
+            response_format=response_format or schema.model,
+            **api_kwargs,
+        )
+
+    async def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse Series values into structured data using an LLM (asynchronously).
+
+        Async version of the parse method, extracting structured information
+        from unstructured text with automatic schema inference when needed.
+
+        Args:
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                product names, prices, and categories from descriptions").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum values for schema inference.
+                Defaults to 100.
+            batch_size (int | None, optional): Requests per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent API requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original Series.
+
+        Example:
+            ```python
+            emails = pd.Series([
+                "Meeting tomorrow at 3pm with John about Q4 planning",
+                "Lunch with Sarah on Friday to discuss new project"
+            ])
+
+            # Async extraction with schema inference
+            parsed = await emails.aio.parse(
+                "Extract meeting details including time, person, and topic"
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await self.parse_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
        )
 
 
@@ -692,78 +1648,163 @@ class AsyncOpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj
 
+    async def responses_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] = str,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._cache import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            df = pd.DataFrame([
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
+            ])
+            # Must be awaited
+            result = await df.aio.responses_with_cache(
+                "what is the animal's name?",
+                cache=shared_cache
+            )
+            ```
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        # Await the call to the async Series method using .aio
+        return await _df_rows_to_json_series(self._obj).aio.responses_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            **api_kwargs,
+        )
+
     async def responses(
         self,
         instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int = 128,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
+        response_format: type[ResponseFormat] = str,
+        batch_size: int | None = None,
         max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after serialising it to JSON (asynchronously).
+        """Generate a response for each row after serializing it to JSON (asynchronously).
 
         Example:
             ```python
             df = pd.DataFrame([
-                {\"name\": \"cat\", \"legs\": 4},
-                {\"name\": \"dog\", \"legs\": 4},
-                {\"name\": \"elephant\", \"legs\": 4},
+                {"name": "cat", "legs": 4},
+                {"name": "dog", "legs": 4},
+                {"name": "elephant", "legs": 4},
             ])
             # Must be awaited
-            results = await df.aio.responses(\"what is the animal\'s name?\")
+            results = await df.aio.responses("what is the animal's name?")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = await large_df.aio.responses(
+                "generate a name for this ID",
+                batch_size=20,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series of strings, each containing the
-        assistant's response to the corresponding input.
-        Each row is serialised to JSON before being sent to the assistant.
-        The model used is set by the `responses_model` function.
-        The default model is `gpt-4.1-mini`.
 
         Args:
             instructions (str): System prompt for the assistant.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
+            response_format (type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``128``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            **api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
 
         Returns:
-            pandas.Series: Responses aligned with the DataFrames original index.
+            pandas.Series: Responses aligned with the DataFrame's original index.
 
         Note:
             This is an asynchronous method and must be awaited.
         """
-        series_of_json = self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
-        # Await the call to the async Series method using .aio
-        return await series_of_json.aio.responses(
+        return await self.responses_with_cache(
            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
             response_format=response_format,
-            batch_size=batch_size,
-            temperature=temperature,
-            top_p=top_p,
-            max_concurrency=max_concurrency,
+            **api_kwargs,
        )
 
-    async def task(self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).
+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
+
+        After serializing each row to JSON, this method executes the prepared task.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format).
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core routing keys are managed internally.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
 
-        This method applies a pre-configured task to each row in the DataFrame,
-        using the task's instructions and response format to generate structured
-        responses from the language model. Each row is serialised to JSON before
-        being processed by the task.
+    async def task(
+        self,
+        task: PreparedTask,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).
 
         Example:
             ```python
-            from openaivec.model import PreparedTask
+            from openaivec._model import PreparedTask
 
             # Assume you have a prepared task for data analysis
             analysis_task = PreparedTask(...)
@@ -775,17 +1816,31 @@ class AsyncOpenAIVecDataFrameAccessor:
             ])
             # Must be awaited
             results = await df.aio.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = await large_df.aio.task(
+                analysis_task,
+                batch_size=50,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
-        This method returns a Series containing the task results for each
-        corresponding row, following the task's defined structure.
 
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
-                response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to 128.
+                and response format for processing the inputs.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
+            **api_kwargs: Additional OpenAI API parameters forwarded to the Responses API.
+
+        Note:
+            Core batching / routing keys (``model``, ``instructions`` / system message, user ``input``)
+            are managed by the library and cannot be overridden.
 
         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -794,27 +1849,131 @@ class AsyncOpenAIVecDataFrameAccessor:
         Note:
             This is an asynchronous method and must be awaited.
         """
-        series_of_json = self._obj.pipe(
-            lambda df: (
-                pd.Series(df.to_dict(orient="records"), index=df.index, name="record").map(
-                    lambda x: json.dumps(x, ensure_ascii=False)
-                )
-            )
-        )
         # Await the call to the async Series method using .aio
-        return await series_of_json.aio.task(
+        return await _df_rows_to_json_series(self._obj).aio.task(
             task=task,
             batch_size=batch_size,
             max_concurrency=max_concurrency,
+            show_progress=show_progress,
+            **api_kwargs,
         )
 
-    async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
+    async def parse_with_cache(
+        self,
+        instructions: str,
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM with cache (asynchronously).
+
+        Async method for parsing DataFrame rows (as JSON) with external cache
+        control, enabling deduplication across operations and concurrent processing.
+
+        Args:
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                invoice details including items, quantities, and totals").
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured
+                async cache for concurrent API call management.
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic schema inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
+                Defaults to 100.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.parse_with_cache(
+            instructions=instructions,
+            cache=cache,
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    async def parse(
+        self,
+        instructions: str,
+        response_format: type[ResponseFormat] | None = None,
+        max_examples: int = 100,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Parse DataFrame rows into structured data using an LLM (asynchronously).
+
+        Async version for extracting structured information from DataFrame rows,
+        with automatic schema inference when no format is specified.
+
+        Args:
+            instructions (str): Plain language extraction goals (e.g., "Extract
+                customer details, order items, and payment information").
+            response_format (type[ResponseFormat] | None, optional): Target
+                structure. None triggers automatic inference. Defaults to None.
+            max_examples (int, optional): Maximum rows for schema inference.
+                Defaults to 100.
+            batch_size (int | None, optional): Rows per batch. None for
+                automatic optimization. Defaults to None.
+            max_concurrency (int, optional): Maximum concurrent requests.
+                Defaults to 8.
+            show_progress (bool, optional): Show progress bar. Defaults to True.
+            **api_kwargs: Additional OpenAI API parameters.
+
+        Returns:
+            pandas.Series: Parsed structured data indexed like the original DataFrame.
+
+        Example:
+            ```python
+            df = pd.DataFrame({
+                'raw_data': [
+                    'Customer: John Doe, Order: 2 laptops @ $1200 each',
+                    'Customer: Jane Smith, Order: 5 phones @ $800 each'
+                ]
+            })
+
+            # Async parsing with automatic schema inference
+            parsed = await df.aio.parse(
+                "Extract customer name, product, quantity, and unit price"
+            )
+            ```
+
+        Note:
+            This is an asynchronous method and must be awaited.
         """
-        Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
+        return await self.parse_with_cache(
+            instructions=instructions,
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
+            response_format=response_format,
+            max_examples=max_examples,
+            **api_kwargs,
+        )
+
+    async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
+        """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
 
         This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
         but with support for asynchronous functions.
 
+        Example:
+            ```python
+            async def process_data(df):
+                # Simulate an asynchronous computation
+                await asyncio.sleep(1)
+                return df.dropna()
+
+            df = pd.DataFrame({"col": [1, 2, None, 4]})
+            # Must be awaited
+            result = await df.aio.pipe(process_data)
+            ```
+
         Args:
             func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
                 as input and returns either a result or an awaitable result.
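A row-level sketch of the DataFrame `parse` variant, mirroring the Series example above (assumes the environment-variable client setup; the order text is illustrative):

```python
import asyncio

import pandas as pd
from openaivec import pandas_ext  # noqa: F401  # registers the .aio accessor


async def main() -> None:
    df = pd.DataFrame(
        {"raw_data": ["Customer: John Doe, Order: 2 laptops @ $1200 each"]}
    )
    # Each row is serialized to JSON, then parsed with an inferred schema.
    parsed = await df.aio.parse(
        "Extract customer name, product, quantity, and unit price",
        show_progress=False,
    )
    print(parsed.iloc[0])


asyncio.run(main())
```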
@@ -831,7 +1990,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         else:
             return result
 
-    async def assign(self, **kwargs: Any) -> pd.DataFrame:
+    async def assign(self, **kwargs) -> pd.DataFrame:
         """Asynchronously assign new columns to the DataFrame, evaluating sequentially.
 
         This method extends pandas' `assign` method by supporting asynchronous
@@ -866,7 +2025,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             ```
 
         Args:
-            **kwargs: Any. Column names as keys and either static values or callables
+            **kwargs: Column names as keys and either static values or callables
                 (synchronous or asynchronous) as values.
 
         Returns:
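The async `assign` accepts a mix of plain pandas callables and coroutine-returning callables. A sketch, assuming `assign` follows pandas' convention of passing the current DataFrame to each callable as the surrounding code suggests (column names and prompt are illustrative):

```python
import asyncio

import pandas as pd
from openaivec import pandas_ext  # noqa: F401  # registers the .ai / .aio accessors


async def main() -> None:
    df = pd.DataFrame({"text": ["good product", "bad service"]})
    out = await df.aio.assign(
        upper=lambda d: d["text"].str.upper(),  # synchronous callable
        sentiment=lambda d: d["text"].aio.responses(  # returns an awaitable
            "Answer with exactly one word: positive or negative.",
            show_progress=False,
        ),
    )
    print(out)


asyncio.run(main())
```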
@@ -891,7 +2050,12 @@ class AsyncOpenAIVecDataFrameAccessor:
         return df_current
 
     async def fillna(
-        self, target_column_name: str, max_examples: int = 500, batch_size: int = 128, max_concurrency: int = 8
+        self,
+        target_column_name: str,
+        max_examples: int = 500,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        show_progress: bool = True,
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).
 
@@ -906,10 +2070,12 @@ class AsyncOpenAIVecDataFrameAccessor:
             max_examples (int, optional): The maximum number of example rows to use
                 for context when predicting missing values. Higher values may improve
                 accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to 128.
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``True``.
 
         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
@@ -925,6 +2091,15 @@ class AsyncOpenAIVecDataFrameAccessor:
 
             # Fill missing values in the 'name' column (must be awaited)
             filled_df = await df.aio.fillna('name')
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({'name': [None] * 1000, 'age': list(range(1000))})
+            filled_df = await large_df.aio.fillna(
+                'name',
+                batch_size=32,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
 
         Note:
@@ -938,8 +2113,11 @@ class AsyncOpenAIVecDataFrameAccessor:
         if missing_rows.empty:
             return self._obj
 
-        filled_values: List[FillNaResponse] = await missing_rows.aio.task(
-            task=task, batch_size=batch_size, max_concurrency=max_concurrency
+        filled_values: list[FillNaResponse] = await missing_rows.aio.task(
+            task=task,
+            batch_size=batch_size,
+            max_concurrency=max_concurrency,
+            show_progress=show_progress,
         )
 
         # get deep copy of the DataFrame to avoid modifying the original