openaivec 0.14.3__py3-none-any.whl → 0.14.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_proxy.py +24 -2
- openaivec/_schema.py +47 -6
- openaivec/pandas_ext.py +372 -338
- {openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/METADATA +1 -1
- {openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/RECORD +7 -7
- {openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/WHEEL +0 -0
- {openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/licenses/LICENSE +0 -0
openaivec/_proxy.py
CHANGED
@@ -460,7 +460,20 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         self.__process_owned(owned, map_func)
         self.__wait_for(wait_for, map_func)

-        return self.__values(items)
+        # Fetch results before purging None entries
+        results = self.__values(items)
+
+        # Remove None values from cache so they are recomputed on future calls
+        with self._lock:
+            if self._cache:  # micro-optimization
+                for k in set(items):
+                    try:
+                        if self._cache.get(k, object()) is None:
+                            del self._cache[k]
+                    except KeyError:
+                        pass
+
+        return results


 @dataclass
@@ -745,4 +758,13 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         await self.__process_owned(owned, map_func)
         await self.__wait_for(wait_for, map_func)

-        return await self.__values(items)
+        results = await self.__values(items)
+
+        # Remove None values from cache after retrieval to avoid persisting incomplete results
+        async with self._lock:
+            if self._cache:
+                for k in set(items):
+                    if self._cache.get(k, object()) is None:
+                        self._cache.pop(k, None)
+
+        return results
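Net effect of both hunks: a batch that produced `None` for a key still returns that `None` to the current caller, but the entry is dropped from the cache so a later call recomputes it instead of replaying a failure forever. A minimal self-contained sketch of the same policy; the class below is illustrative, not the library's `BatchingMapProxy`:

```python
import threading
from typing import Any, Callable, Hashable

class NonePurgingCache:
    """Illustrative only: return computed values, but never keep None across calls."""

    def __init__(self) -> None:
        self._cache: dict[Hashable, Any] = {}
        self._lock = threading.Lock()

    def get_many(self, keys: list, compute: Callable[[list], list]) -> list:
        missing = [k for k in keys if k not in self._cache]
        if missing:
            for k, v in zip(missing, compute(missing)):
                self._cache[k] = v
        results = [self._cache[k] for k in keys]
        # Same policy as the 0.14.4 change: drop None entries so they are retried later.
        with self._lock:
            for k in set(keys):
                if self._cache.get(k, object()) is None:
                    self._cache.pop(k, None)
        return results

calls = []
def flaky(keys):
    calls.append(list(keys))
    return [None for _ in keys]  # e.g. a failed batch produced no usable value

cache = NonePurgingCache()
cache.get_many(["a"], flaky)
cache.get_many(["a"], flaky)   # recomputed: the None was not persisted
assert len(calls) == 2
```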
openaivec/_schema.py
CHANGED
@@ -128,6 +128,12 @@ class InferredSchema(BaseModel):
             redundancy removed).
         examples_summary: Neutral description of structural / semantic patterns
             observed in the examples (domain, recurring signals, constraints).
+        examples_purpose_alignment: Analytical explanation of how the concrete
+            recurring patterns in the provided examples *justify*, *constrain*,
+            or *refine* the stated purpose. Should map purpose facets to
+            observed evidence (or explicitly note gaps) to discourage
+            hallucinated fields and anchor extraction scope. This is an
+            internal quality aid – downstream consumers typically ignore it.
         fields: Ordered list of ``FieldSpec`` objects comprising the schema's
             sole authoritative contract.
         inference_prompt: Self-contained extraction instructions enforcing an
@@ -147,6 +153,13 @@ class InferredSchema(BaseModel):
             "patterns, and notable constraints."
         )
     )
+    examples_purpose_alignment: str = Field(
+        description=(
+            "Explanation of how observable recurring patterns in the examples substantiate and bound the stated "
+            "purpose. Should reference purpose facets and cite supporting example evidence (or note any gaps) to "
+            "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
+        )
+    )
     fields: List[FieldSpec] = Field(
         description=(
             "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
@@ -234,7 +247,7 @@ class InferredSchema(BaseModel):
                 py_type = enum_cls
             else:
                 py_type = type_map[spec.type]
-            fields[spec.name] = (py_type,
+            fields[spec.name] = (py_type, Field(description=spec.description))

         model = create_model("InferredSchema", **fields)  # type: ignore[call-arg]
         return model
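For context, `create_model` builds a pydantic model at runtime from `(type, Field(...))` tuples, and attaching `Field(description=...)` is what propagates each `FieldSpec` description into the generated JSON schema. A minimal standalone sketch of the pattern (the specs are invented for illustration, not taken from the library):

```python
from pydantic import Field, create_model

# Hypothetical specs mirroring FieldSpec(name, type, description)
specs = [
    ("title", str, "Document title"),
    ("year", int, "Publication year"),
]

fields = {name: (py_type, Field(description=desc)) for name, py_type, desc in specs}
Model = create_model("InferredSchema", **fields)

# Descriptions now appear in the generated JSON schema
print(Model.model_json_schema()["properties"]["title"]["description"])  # "Document title"
```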
@@ -281,11 +294,15 @@ You are a schema inference engine.
 Task:
 1. Normalize the user's purpose (eliminate ambiguity, redundancy, contradictions).
 2. Objectively summarize observable patterns in the example texts.
-3.
-
-
+3. Produce an "examples_purpose_alignment" explanation that explicitly maps purpose facets
+   to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
+   sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
+   This MUST NOT introduce new domain facts beyond the examples & purpose.
+4. Propose a minimal flat set of scalar fields (no nesting / arrays) that are reliably extractable.
+5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
+6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
    is clearly evidenced; never invent.
-
+7. If the purpose indicates prediction (predict / probability / likelihood), output only
    explanatory features (no target restatement).

 Rules:
@@ -305,6 +322,7 @@ Output contract:
 Return exactly an InferredSchema object with JSON keys:
 - purpose (string)
 - examples_summary (string)
+- examples_purpose_alignment (string)
 - fields (array of FieldSpec objects: name, type, description, enum_values?)
 - inference_prompt (string)
 """.strip()
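To make the contract concrete, a response satisfying these keys would look roughly like the following; all field content here is invented for illustration:

```python
# Illustrative only: the shape an InferredSchema response must take.
inferred = {
    "purpose": "Extract bibliographic facts from citation strings.",
    "examples_summary": "Each example is a single citation with author, title, and year.",
    "examples_purpose_alignment": "author facet -> recurring 'Last, First' prefix; year facet -> trailing 4-digit token.",
    "fields": [
        {"name": "author", "type": "string", "description": "Primary author name."},
        {"name": "year", "type": "integer", "description": "Publication year."},
    ],
    "inference_prompt": "From the citation, extract the author and the publication year...",
}
```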
@@ -359,10 +377,31 @@ class SchemaInferer:
             raise ValueError("max_retries must be >= 1")

         last_err: Exception | None = None
+        previous_errors: list[str] = []
         for attempt in range(max_retries):
+            if attempt == 0:
+                instructions = _INFER_INSTRUCTIONS
+            else:
+                # Provide structured feedback for correction. Keep concise and prohibit speculative expansion.
+                feedback_lines = [
+                    "--- PRIOR VALIDATION FEEDBACK ---",
+                ]
+                for i, err in enumerate(previous_errors[-5:], 1):  # include last up to 5 errors
+                    feedback_lines.append(f"{i}. {err}")
+                feedback_lines.extend(
+                    [
+                        "Adjust ONLY listed issues; avoid adding brand-new fields unless essential.",
+                        "Don't hallucinate or broaden enum_values unless enum rule caused failure.",
+                        "Duplicate names: minimally rename; keep semantics.",
+                        "Unsupported type: change to string|integer|float|boolean (no new facts).",
+                        "Bad enum length: drop enum or constrain to 2–24 evidenced tokens.",
+                    ]
+                )
+                instructions = _INFER_INSTRUCTIONS + "\n\n" + "\n".join(feedback_lines)
+
             response: ParsedResponse[InferredSchema] = self.client.responses.parse(
                 model=self.model_name,
-                instructions=
+                instructions=instructions,
                 input=data.model_dump_json(),
                 text_format=InferredSchema,
                 *args,
@@ -371,8 +410,10 @@ class SchemaInferer:
             parsed = response.output_parsed
             try:
                 _basic_field_list_validation(parsed)
+                parsed.build_model()  # ensure dynamic model creation succeeds
             except ValueError as e:
                 last_err = e
+                previous_errors.append(str(e))
                 if attempt == max_retries - 1:
                     raise
                 continue
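Taken together, the two hunks turn a blind retry into a self-correcting loop: each failed validation is appended to `previous_errors`, and later attempts see the accumulated feedback in the instructions. A stripped-down sketch of the pattern, with a caller-supplied model call standing in for the API (everything below is illustrative):

```python
BASE_INSTRUCTIONS = "Propose a schema."  # stands in for _INFER_INSTRUCTIONS

def infer_with_feedback(call_model, validate, max_retries: int = 3):
    previous_errors: list[str] = []
    for attempt in range(max_retries):
        instructions = BASE_INSTRUCTIONS
        if previous_errors:
            # Feed the most recent failures back, mirroring the 0.14.4 loop.
            feedback = "\n".join(f"{i}. {e}" for i, e in enumerate(previous_errors[-5:], 1))
            instructions += "\n\n--- PRIOR VALIDATION FEEDBACK ---\n" + feedback
        candidate = call_model(instructions)
        try:
            validate(candidate)          # raises ValueError on a bad proposal
            return candidate
        except ValueError as e:
            previous_errors.append(str(e))
            if attempt == max_retries - 1:
                raise
```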
openaivec/pandas_ext.py
CHANGED
@@ -182,6 +182,27 @@ class OpenAIVecSeriesAccessor:
         top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
+        """Call an LLM once for every Series element using a provided cache.
+
+        This is a lower-level method that allows explicit cache management for advanced
+        use cases. Most users should use the standard ``responses`` method instead.
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            cache (BatchingMapProxy[str, ResponseFormat]): Explicit cache instance for
+                batching and deduplication control.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built-in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
         client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
@@ -195,6 +216,56 @@ class OpenAIVecSeriesAccessor:
         # Forward any extra kwargs to the underlying Responses API.
         return pd.Series(client.parse(self._obj.tolist(), **api_kwargs), index=self._obj.index, name=self._obj.name)

+    def responses(
+        self,
+        instructions: str,
+        response_format: Type[ResponseFormat] = str,
+        batch_size: int | None = None,
+        temperature: float | None = 0.0,
+        top_p: float = 1.0,
+        show_progress: bool = False,
+        **api_kwargs,
+    ) -> pd.Series:
+        """Call an LLM once for every Series element.
+
+        Example:
+            ```python
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
+            animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
+            ```
+
+        Args:
+            instructions (str): System prompt prepended to every user message.
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
+
+        Returns:
+            pandas.Series: Series whose values are instances of ``response_format``.
+        """
+        return self.responses_with_cache(
+            instructions=instructions,
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
+            response_format=response_format,
+            temperature=temperature,
+            top_p=top_p,
+            **api_kwargs,
+        )
+
     def embeddings_with_cache(
         self,
         cache: BatchingMapProxy[str, np.ndarray],
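After this change, the public `responses` method is a thin wrapper that builds a fresh `BatchingMapProxy` per call, and `responses_with_cache` is the seam for reuse. A sketch of sharing one cache across two calls so overlapping inputs are requested only once (assumes the `.ai` accessor is registered by importing `openaivec.pandas_ext` and that credentials are configured):

```python
import pandas as pd
from openaivec._proxy import BatchingMapProxy

shared = BatchingMapProxy(batch_size=64)

s1 = pd.Series(["cat", "dog"])
s2 = pd.Series(["dog", "elephant"])  # "dog" overlaps with s1

# Both calls draw on the same cache, so "dog" is sent to the API only once.
a = s1.ai.responses_with_cache(instructions="translate to French", cache=shared)
b = s2.ai.responses_with_cache(instructions="translate to French", cache=shared)
```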
@@ -205,15 +276,6 @@ class OpenAIVecSeriesAccessor:
         a pre-configured BatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management.

-        Args:
-            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-
         Example:
             ```python
             from openaivec._proxy import BatchingMapProxy
@@ -225,6 +287,15 @@ class OpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             embeddings = animals.ai.embeddings_with_cache(cache=shared_cache)
             ```
+
+        Args:
+            cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
         client: BatchEmbeddings = BatchEmbeddings(
             client=CONTAINER.resolve(OpenAI),
@@ -238,54 +309,35 @@ class OpenAIVecSeriesAccessor:
             name=self._obj.name,
         )

-    def
-
-        instructions: str,
-        response_format: Type[ResponseFormat] = str,
-        batch_size: int | None = None,
-        temperature: float | None = 0.0,
-        top_p: float = 1.0,
-        show_progress: bool = False,
-        **api_kwargs,
-    ) -> pd.Series:
-        """Call an LLM once for every Series element.
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element.

         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
             # Basic usage
-            animals.ai.
+            animals.ai.embeddings()

-            # With progress bar
-
-
-
-                batch_size=32,
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
                 show_progress=True
             )
             ```

         Args:
-
-
-                type the assistant should return. Defaults to ``str``.
-            batch_size (int | None, optional): Number of prompts grouped into a single
-                request. Defaults to ``None`` (automatic batch size optimization
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
-            pandas.Series: Series whose values are
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
         """
-        return self.
-            instructions=instructions,
+        return self.embeddings_with_cache(
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
-            response_format=response_format,
-            temperature=temperature,
-            top_p=top_p,
-            **api_kwargs,
         )

     def task_with_cache(
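With this move, `embeddings` is the simple entry point and returns one float32 `np.ndarray` per element. A common follow-up is stacking those vectors into a matrix for vectorized math; a short sketch, assuming the accessor and credentials are configured:

```python
import numpy as np
import pandas as pd

animals = pd.Series(["cat", "dog", "elephant"])
emb = animals.ai.embeddings()            # Series of float32 np.ndarray values

matrix = np.vstack(emb.to_numpy())       # shape: (3, embedding_dim)
norms = np.linalg.norm(matrix, axis=1)   # per-row norms, useful for cosine math
```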
@@ -300,6 +352,13 @@ class OpenAIVecSeriesAccessor:
         response format, temperature and top_p. A supplied ``BatchingMapProxy`` enables
         cross‑operation deduplicated reuse and external batch size / progress control.

+        Example:
+            ```python
+            from openaivec._proxy import BatchingMapProxy
+            shared_cache = BatchingMapProxy(batch_size=64)
+            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
         Args:
             task (PreparedTask): Prepared task (instructions + response_format + sampling params).
             cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
@@ -311,13 +370,6 @@ class OpenAIVecSeriesAccessor:

         Returns:
             pandas.Series: Task results aligned with the original Series index.
-
-        Example:
-            ```python
-            from openaivec._proxy import BatchingMapProxy
-            shared_cache = BatchingMapProxy(batch_size=64)
-            reviews.ai.task_with_cache(sentiment_task, cache=shared_cache)
-            ```
         """
         client: BatchResponses = BatchResponses(
             client=CONTAINER.resolve(OpenAI),
@@ -382,37 +434,6 @@ class OpenAIVecSeriesAccessor:
             **api_kwargs,
         )

-    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element.
-
-        Example:
-            ```python
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Basic usage
-            animals.ai.embeddings()
-
-            # With progress bar for large datasets
-            large_texts = pd.Series(["text"] * 5000)
-            embeddings = large_texts.ai.embeddings(
-                batch_size=100,
-                show_progress=True
-            )
-            ```
-
-        Args:
-            batch_size (int | None, optional): Number of inputs grouped into a
-                single request. Defaults to ``None`` (automatic batch size optimization
-                based on execution time). Set to a positive integer for fixed batch size.
-            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-        """
-        return self.embeddings_with_cache(
-            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
-        )
-
     def count_tokens(self) -> pd.Series:
         """Count `tiktoken` tokens per row.
@@ -467,38 +488,6 @@ class OpenAIVecDataFrameAccessor:
     def __init__(self, df_obj: pd.DataFrame):
         self._obj = df_obj

-    def extract(self, column: str) -> pd.DataFrame:
-        """Flatten one column of Pydantic models/dicts into top‑level columns.
-
-        Example:
-            ```python
-            df = pd.DataFrame([
-                {"animal": {"name": "cat", "legs": 4}},
-                {"animal": {"name": "dog", "legs": 4}},
-                {"animal": {"name": "elephant", "legs": 4}},
-            ])
-            df.ai.extract("animal")
-            ```
-        This method returns a DataFrame with the same index as the original,
-        where each column corresponds to a key in the dictionaries.
-        The source column is dropped.
-
-        Args:
-            column (str): Column to expand.
-
-        Returns:
-            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
-        """
-        if column not in self._obj.columns:
-            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
-
-        return (
-            self._obj.pipe(lambda df: df.reset_index(drop=True))
-            .pipe(lambda df: df.join(df[column].ai.extract()))
-            .pipe(lambda df: df.set_index(self._obj.index))
-            .pipe(lambda df: df.drop(columns=[column], axis=1))
-        )
-
     def responses_with_cache(
         self,
         instructions: str,
@@ -508,25 +497,12 @@ class OpenAIVecDataFrameAccessor:
         top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON using a provided cache.

         This method allows external control over caching behavior by accepting
         a pre-configured BatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management.

-        Args:
-            instructions (str): System prompt for the assistant.
-            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
-                responses. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame's original index.
-
         Example:
             ```python
             from openaivec._proxy import BatchingMapProxy
@@ -544,6 +520,19 @@ class OpenAIVecDataFrameAccessor:
                 cache=shared_cache
             )
             ```
+
+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
         """
         return _df_rows_to_json_series(self._obj).ai.responses_with_cache(
             instructions=instructions,
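The DataFrame variants all funnel through `_df_rows_to_json_series`: each row becomes one JSON string, which is then handled like any other Series element. A rough sketch of what that serialization step does; the exact formatting choices of the internal helper are an assumption:

```python
import json
import pandas as pd

def df_rows_to_json_series(df: pd.DataFrame) -> pd.Series:
    # Illustrative stand-in for the internal _df_rows_to_json_series helper:
    # one compact JSON object per row, original index preserved.
    return pd.Series(
        (json.dumps(rec, ensure_ascii=False) for rec in df.to_dict(orient="records")),
        index=df.index,
    )

df = pd.DataFrame({"name": ["cat", "dog"], "legs": [4, 4]})
print(df_rows_to_json_series(df).iloc[0])  # {"name": "cat", "legs": 4}
```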
@@ -564,7 +553,7 @@ class OpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON.

         Example:
             ```python
@@ -592,7 +581,7 @@ class OpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -608,6 +597,31 @@ class OpenAIVecDataFrameAccessor:
             **api_kwargs,
         )

+    def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: BatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (BatchingMapProxy[str, ResponseFormat]): Pre‑configured cache instance.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``) forwarded verbatim. Core routing keys are managed internally.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+        """
+        return _df_rows_to_json_series(self._obj).ai.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
     def task(
         self,
         task: PreparedTask,
@@ -615,7 +629,7 @@ class OpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after
+        """Execute a prepared task on each DataFrame row after serializing it to JSON.

         Example:
             ```python
@@ -666,29 +680,36 @@ class OpenAIVecDataFrameAccessor:
             **api_kwargs,
         )

-    def
-
-        task: PreparedTask[ResponseFormat],
-        cache: BatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
-    ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache.
+    def extract(self, column: str) -> pd.DataFrame:
+        """Flatten one column of Pydantic models/dicts into top‑level columns.

-
-
-
+        Example:
+            ```python
+            df = pd.DataFrame([
+                {"animal": {"name": "cat", "legs": 4}},
+                {"animal": {"name": "dog", "legs": 4}},
+                {"animal": {"name": "elephant", "legs": 4}},
+            ])
+            df.ai.extract("animal")
+            ```
+        This method returns a DataFrame with the same index as the original,
+        where each column corresponds to a key in the dictionaries.
+        The source column is dropped.

-
-
-            ``seed``) forwarded verbatim. Core routing keys are managed internally.
+        Args:
+            column (str): Column to expand.

         Returns:
-            pandas.
+            pandas.DataFrame: Original DataFrame with the extracted columns; the source column is dropped.
         """
-
-
-
-
+        if column not in self._obj.columns:
+            raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+        return (
+            self._obj.pipe(lambda df: df.reset_index(drop=True))
+            .pipe(lambda df: df.join(df[column].ai.extract()))
+            .pipe(lambda df: df.set_index(self._obj.index))
+            .pipe(lambda df: df.drop(columns=[column], axis=1))
         )

     def fillna(
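`extract` pairs naturally with structured `responses`: generate Pydantic objects into a column, then flatten them. A hedged sketch of that pipeline; the `Animal` model and prompt are invented for illustration, and the accessor/credentials are assumed to be configured:

```python
import pandas as pd
from pydantic import BaseModel

class Animal(BaseModel):
    name: str
    legs: int

df = pd.DataFrame({"text": ["a cat", "a dog"]})
df["animal"] = df["text"].ai.responses(
    "Describe the animal as structured data.",
    response_format=Animal,
)
# Flattens the Pydantic fields into top-level columns and drops "animal".
flat = df.ai.extract("animal")
```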
@@ -776,15 +797,6 @@ class OpenAIVecDataFrameAccessor:
         two columns of the DataFrame. The vectors should be numpy arrays or
         array-like objects that support dot product operations.

-        Args:
-            col1 (str): Name of the first column containing embedding vectors.
-            col2 (str): Name of the second column containing embedding vectors.
-
-        Returns:
-            pandas.Series: Series containing cosine similarity scores between
-                corresponding vectors in col1 and col2, with values ranging
-                from -1 to 1, where 1 indicates identical direction.
-
         Example:
             ```python
             df = pd.DataFrame({
@@ -793,6 +805,15 @@ class OpenAIVecDataFrameAccessor:
             })
             similarities = df.ai.similarity('vec1', 'vec2')
             ```
+
+        Args:
+            col1 (str): Name of the first column containing embedding vectors.
+            col2 (str): Name of the second column containing embedding vectors.
+
+        Returns:
+            pandas.Series: Series containing cosine similarity scores between
+                corresponding vectors in col1 and col2, with values ranging
+                from -1 to 1, where 1 indicates identical direction.
         """
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
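End to end, the embeddings and similarity pieces compose per row. A sketch, assuming embeddings have been computed into two columns (accessor and credentials configured):

```python
import pandas as pd

df = pd.DataFrame({"a": ["cat", "dog"], "b": ["kitten", "puppy"]})
df["vec_a"] = df["a"].ai.embeddings()
df["vec_b"] = df["b"].ai.embeddings()

# Cosine similarity per row, identical to the accessor's formula:
#   dot(u, v) / (||u|| * ||v||)
scores = df.ai.similarity("vec_a", "vec_b")
```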
@@ -823,163 +844,47 @@ class AsyncOpenAIVecSeriesAccessor:
         across multiple operations or custom batch size management. The concurrency
         is controlled by the cache instance itself.

-        Args:
-            instructions (str): System prompt prepended to every user message.
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
-                type the assistant should return. Defaults to ``str``.
-            temperature (float | None, optional): Sampling temperature. ``None`` omits the
-                parameter (recommended for reasoning models). Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-            **api_kwargs: Additional keyword arguments forwarded verbatim to
-                ``AsyncOpenAI.responses.parse`` (e.g. ``max_output_tokens``, penalties,
-                future parameters). Core batching keys (model, instructions, input,
-                text_format) are protected and silently ignored if provided.
-
-        Returns:
-            pandas.Series: Series whose values are instances of ``response_format``.
-
         Example:
             ```python
             result = await series.aio.responses_with_cache(
                 "classify",
-                cache=shared,
-                max_output_tokens=256,
-                frequency_penalty=0.2,
-            )
-            ```
-
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        client: AsyncBatchResponses = AsyncBatchResponses(
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(ResponsesModelName).value,
-            system_message=instructions,
-            response_format=response_format,
-            cache=cache,
-            temperature=temperature,
-            top_p=top_p,
-        )
-        results = await client.parse(self._obj.tolist(), **api_kwargs)
-        return pd.Series(results, index=self._obj.index, name=self._obj.name)
-
-    async def embeddings_with_cache(
-        self,
-        cache: AsyncBatchingMapProxy[str, np.ndarray],
-    ) -> pd.Series:
-        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
-
-        This method allows external control over caching behavior by accepting
-        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management. The concurrency
-        is controlled by the cache instance itself.
-
-        Args:
-            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-
-        Returns:
-            pandas.Series: Series whose values are ``np.ndarray`` objects
-                (dtype ``float32``).
-
-        Example:
-            ```python
-            from openaivec._proxy import AsyncBatchingMapProxy
-            import numpy as np
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
-                batch_size=64, max_concurrency=4
-            )
-
-            animals = pd.Series(["cat", "dog", "elephant"])
-            # Must be awaited
-            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
-            ```
-
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
-            client=CONTAINER.resolve(AsyncOpenAI),
-            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
-            cache=cache,
-        )
-
-        # Await the async operation
-        results = await client.create(self._obj.tolist())
-
-        return pd.Series(
-            results,
-            index=self._obj.index,
-            name=self._obj.name,
-        )
-
-    async def task_with_cache(
-        self,
-        task: PreparedTask[ResponseFormat],
-        cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
-    ) -> pd.Series:
-        """Execute a prepared task on every Series element using a provided cache (asynchronously).
-
-        This method allows external control over caching behavior by accepting
-        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
-        across multiple operations or custom batch size management. The concurrency
-        is controlled by the cache instance itself.
+                cache=shared,
+                max_output_tokens=256,
+                frequency_penalty=0.2,
+            )
+            ```

         Args:
-
-                response format, and other parameters for processing the inputs.
+            instructions (str): System prompt prepended to every user message.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
                 Set cache.batch_size=None to enable automatic batch size optimization.
-
-
-
-
-
-
+            response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
+                type the assistant should return. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. ``None`` omits the
+                parameter (recommended for reasoning models). Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            **api_kwargs: Additional keyword arguments forwarded verbatim to
+                ``AsyncOpenAI.responses.parse`` (e.g. ``max_output_tokens``, penalties,
+                future parameters). Core batching keys (model, instructions, input,
+                text_format) are protected and silently ignored if provided.

         Returns:
-            pandas.Series: Series whose values are instances of
-                response format, aligned with the original Series index.
-
-        Example:
-            ```python
-            from openaivec._model import PreparedTask
-            from openaivec._proxy import AsyncBatchingMapProxy
-
-            # Create a shared cache with custom batch size and concurrency
-            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
-
-            # Assume you have a prepared task for sentiment analysis
-            sentiment_task = PreparedTask(...)
-
-            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
-            # Must be awaited
-            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
-            ```
+            pandas.Series: Series whose values are instances of ``response_format``.

         Note:
             This is an asynchronous method and must be awaited.
         """
-        client = AsyncBatchResponses(
+        client: AsyncBatchResponses = AsyncBatchResponses(
             client=CONTAINER.resolve(AsyncOpenAI),
             model_name=CONTAINER.resolve(ResponsesModelName).value,
-            system_message=
-            response_format=
+            system_message=instructions,
+            response_format=response_format,
             cache=cache,
-            temperature=
-            top_p=
+            temperature=temperature,
+            top_p=top_p,
         )
-        # Await the async operation
         results = await client.parse(self._obj.tolist(), **api_kwargs)
-
         return pd.Series(results, index=self._obj.index, name=self._obj.name)

     async def responses(
@@ -1018,7 +923,7 @@ class AsyncOpenAIVecSeriesAccessor:
             batch_size (int | None, optional): Number of prompts grouped into a single
                 request. Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
@@ -1041,6 +946,59 @@ class AsyncOpenAIVecSeriesAccessor:
             **api_kwargs,
         )

+    async def embeddings_with_cache(
+        self,
+        cache: AsyncBatchingMapProxy[str, np.ndarray],
+    ) -> pd.Series:
+        """Compute OpenAI embeddings for every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Example:
+            ```python
+            from openaivec._proxy import AsyncBatchingMapProxy
+            import numpy as np
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy[str, np.ndarray](
+                batch_size=64, max_concurrency=4
+            )
+
+            animals = pd.Series(["cat", "dog", "elephant"])
+            # Must be awaited
+            embeddings = await animals.aio.embeddings_with_cache(cache=shared_cache)
+            ```
+
+        Args:
+            cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Returns:
+            pandas.Series: Series whose values are ``np.ndarray`` objects
+                (dtype ``float32``).
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client: AsyncBatchEmbeddings = AsyncBatchEmbeddings(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(EmbeddingsModelName).value,
+            cache=cache,
+        )
+
+        # Await the async operation
+        results = await client.create(self._obj.tolist())
+
+        return pd.Series(
+            results,
+            index=self._obj.index,
+            name=self._obj.name,
+        )
+
     async def embeddings(
         self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
@@ -1082,6 +1040,69 @@ class AsyncOpenAIVecSeriesAccessor:
             ),
         )

+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on every Series element using a provided cache (asynchronously).
+
+        This method allows external control over caching behavior by accepting
+        a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
+        across multiple operations or custom batch size management. The concurrency
+        is controlled by the cache instance itself.
+
+        Args:
+            task (PreparedTask): A pre-configured task containing instructions,
+                response format, and other parameters for processing the inputs.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+
+        Example:
+            ```python
+            from openaivec._model import PreparedTask
+            from openaivec._proxy import AsyncBatchingMapProxy
+
+            # Create a shared cache with custom batch size and concurrency
+            shared_cache = AsyncBatchingMapProxy(batch_size=64, max_concurrency=4)
+
+            # Assume you have a prepared task for sentiment analysis
+            sentiment_task = PreparedTask(...)
+
+            reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Must be awaited
+            results = await reviews.aio.task_with_cache(sentiment_task, cache=shared_cache)
+            ```
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters (e.g. ``frequency_penalty``, ``presence_penalty``,
+            ``seed``, etc.) are forwarded verbatim to the underlying client. Core batching / routing
+            keys (``model``, ``instructions`` / system message, user ``input``) are managed by the
+            library and cannot be overridden.
+
+        Returns:
+            pandas.Series: Series whose values are instances of the task's
+                response format, aligned with the original Series index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        client = AsyncBatchResponses(
+            client=CONTAINER.resolve(AsyncOpenAI),
+            model_name=CONTAINER.resolve(ResponsesModelName).value,
+            system_message=task.instructions,
+            response_format=task.response_format,
+            cache=cache,
+            temperature=task.temperature,
+            top_p=task.top_p,
+        )
+        # Await the async operation
+        results = await client.parse(self._obj.tolist(), **api_kwargs)
+
+        return pd.Series(results, index=self._obj.index, name=self._obj.name)
+
     async def task(
         self,
         task: PreparedTask,
@@ -1161,26 +1182,13 @@ class AsyncOpenAIVecDataFrameAccessor:
         top_p: float = 1.0,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON using a provided cache (asynchronously).

         This method allows external control over caching behavior by accepting
         a pre-configured AsyncBatchingMapProxy instance, enabling cache sharing
         across multiple operations or custom batch size management. The concurrency
         is controlled by the cache instance itself.

-        Args:
-            instructions (str): System prompt for the assistant.
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
-                instance for managing API call batching and deduplication.
-                Set cache.batch_size=None to enable automatic batch size optimization.
-            response_format (Type[ResponseFormat], optional): Desired Python type of the
-                responses. Defaults to ``str``.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
-            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
-
-        Returns:
-            pandas.Series: Responses aligned with the DataFrame's original index.
-
         Example:
             ```python
             from openaivec._proxy import AsyncBatchingMapProxy
@@ -1200,6 +1208,19 @@ class AsyncOpenAIVecDataFrameAccessor:
             )
             ```

+        Args:
+            instructions (str): System prompt for the assistant.
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
+                instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
+            response_format (Type[ResponseFormat], optional): Desired Python type of the
+                responses. Defaults to ``str``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
+            top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+
+        Returns:
+            pandas.Series: Responses aligned with the DataFrame's original index.
+
         Note:
             This is an asynchronous method and must be awaited.
         """
@@ -1224,7 +1245,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Generate a response for each row after
+        """Generate a response for each row after serializing it to JSON (asynchronously).

         Example:
             ```python
@@ -1253,7 +1274,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             batch_size (int | None, optional): Number of requests sent in one batch.
                 Defaults to ``None`` (automatic batch size optimization
                 based on execution time). Set to a positive integer for fixed batch size.
-            temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
+            temperature (float | None, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
@@ -1276,6 +1297,35 @@ class AsyncOpenAIVecDataFrameAccessor:
             **api_kwargs,
         )

+    async def task_with_cache(
+        self,
+        task: PreparedTask[ResponseFormat],
+        cache: AsyncBatchingMapProxy[str, ResponseFormat],
+        **api_kwargs,
+    ) -> pd.Series:
+        """Execute a prepared task on each DataFrame row using a provided cache (asynchronously).
+
+        After serializing each row to JSON, this method executes the prepared task.
+
+        Args:
+            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
+            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
+
+        Additional Keyword Args:
+            Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
+
+        Returns:
+            pandas.Series: Task results aligned with the DataFrame's original index.
+
+        Note:
+            This is an asynchronous method and must be awaited.
+        """
+        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
+            task=task,
+            cache=cache,
+            **api_kwargs,
+        )
+
     async def task(
         self,
         task: PreparedTask,
@@ -1284,7 +1334,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         show_progress: bool = False,
         **api_kwargs,
     ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after
+        """Execute a prepared task on each DataFrame row after serializing it to JSON (asynchronously).

         Example:
             ```python
@@ -1343,40 +1393,24 @@ class AsyncOpenAIVecDataFrameAccessor:
             **api_kwargs,
         )

-    async def task_with_cache(
-        self,
-        task: PreparedTask[ResponseFormat],
-        cache: AsyncBatchingMapProxy[str, ResponseFormat],
-        **api_kwargs,
-    ) -> pd.Series:
-        """Execute a prepared task on each DataFrame row after serializing it to JSON using a provided cache (async).
-
-        Args:
-            task (PreparedTask): Prepared task (instructions + response_format + sampling params).
-            cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre‑configured async cache instance.
-
-        Additional Keyword Args:
-            Arbitrary OpenAI Responses API parameters forwarded verbatim. Core routing keys are protected.
-
-        Returns:
-            pandas.Series: Task results aligned with the DataFrame's original index.
-
-        Note:
-            This is an asynchronous method and must be awaited.
-        """
-        return await _df_rows_to_json_series(self._obj).aio.task_with_cache(
-            task=task,
-            cache=cache,
-            **api_kwargs,
-        )
-
     async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:
-        """
-        Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.
+        """Apply a function to the DataFrame, supporting both synchronous and asynchronous functions.

         This method allows chaining operations on the DataFrame, similar to pandas' `pipe` method,
         but with support for asynchronous functions.

+        Example:
+            ```python
+            async def process_data(df):
+                # Simulate an asynchronous computation
+                await asyncio.sleep(1)
+                return df.dropna()
+
+            df = pd.DataFrame({"col": [1, 2, None, 4]})
+            # Must be awaited
+            result = await df.aio.pipe(process_data)
+            ```
+
         Args:
             func (Callable[[pd.DataFrame], Awaitable[T] | T]): A function that takes a DataFrame
                 as input and returns either a result or an awaitable result.
{openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/RECORD
CHANGED

@@ -6,12 +6,12 @@ openaivec/_model.py,sha256=xg3s9Ljqb2xK1t_a5bwWxGJfFSIuaNrFGMgQq4nQKrM,3351
 openaivec/_optimize.py,sha256=-mKjD5YV_d1Z2nqfGfAcmx6mTKn6AODjFTrIKJPbAXQ,3851
 openaivec/_prompt.py,sha256=KoJbFK4gTEDRtu9OMweJq_jQLkSPFy2Kcvao30qKhAQ,20844
 openaivec/_provider.py,sha256=dNr9Y2C97GK-pkY81odurKoDup59dLK31V3EGT2HOwE,6711
-openaivec/_proxy.py,sha256=
+openaivec/_proxy.py,sha256=J0qGDcZqSab26ScA8OXxzornfwuXtrVycqup-JPq464,29719
 openaivec/_responses.py,sha256=xtkiOn01RkauHq2FAKRAcjPglH8rmbaSz0-VE0ClTe8,24026
-openaivec/_schema.py,sha256=
+openaivec/_schema.py,sha256=9enwqE2idLLUKbQxjiNn09uhdKz14kihEwUXglRqxx0,20543
 openaivec/_serialize.py,sha256=NLCKl4opc1WS24_duwpI2UGBepQ8SBh4YRxBlLwzDLw,8403
 openaivec/_util.py,sha256=dFWwjouJyvF-tqNPs2933OAt5Fw9I2Q2BvmGIfGH5k4,6423
-openaivec/pandas_ext.py,sha256=
+openaivec/pandas_ext.py,sha256=m4H6mrE__Jmr5R6hl6d8yc2JhVT0-wdf5GOKWIITeLU,63366
 openaivec/spark.py,sha256=lI-noacLvuxu6gBztKdcYd9vfK3eNI3aCGwJylkzv7E,25367
 openaivec/task/__init__.py,sha256=lrgoc9UIox7XnxZ96dQRl88a-8QfuZRFBHshxctpMB8,6178
 openaivec/task/customer_support/__init__.py,sha256=KWfGyXPdZyfGdRH17x7hPpJJ1N2EP9PPhZx0fvBAwSI,884
@@ -30,7 +30,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=BNwWtNT-MNA76eIJbb31641upukmRwM9
 openaivec/task/nlp/translation.py,sha256=XTZM11JFjbgTK9wHnxFgVDabXZ5bqbabXK_bq2nEkyQ,6627
 openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
 openaivec/task/table/fillna.py,sha256=ZVcOpuh7ULVhrt1VsWy5fPhk53XNaiD7kXGCPhh83M8,6636
-openaivec-0.14.
-openaivec-0.14.
-openaivec-0.14.
-openaivec-0.14.
+openaivec-0.14.4.dist-info/METADATA,sha256=RF6rZDL5B4qYCqXIbC0jexv-IzHv48WBDV-MZtNHcvY,27566
+openaivec-0.14.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+openaivec-0.14.4.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
+openaivec-0.14.4.dist-info/RECORD,,
{openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/WHEEL
File without changes

{openaivec-0.14.3.dist-info → openaivec-0.14.4.dist-info}/licenses/LICENSE
File without changes
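Each RECORD row is `path,sha256=<digest>,<size>`, where the digest is a urlsafe-base64 SHA-256 without padding (the standard wheel convention, per PEP 376 / PEP 427). A small sketch of checking such an entry against a file on disk; this is generic wheel tooling, not openaivec-specific code:

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: str) -> str:
    # sha256, urlsafe base64, trailing '=' stripped — the wheel RECORD convention.
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# For example, record_hash("openaivec/_proxy.py") computed over the file shipped in
# the 0.14.4 wheel should match the RECORD entry above:
# "sha256=J0qGDcZqSab26ScA8OXxzornfwuXtrVycqup-JPq464"
```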