openaivec 0.13.4__py3-none-any.whl → 0.13.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/embeddings.py +10 -8
- openaivec/model.py +9 -11
- openaivec/optimize.py +1 -1
- openaivec/pandas_ext.py +61 -42
- openaivec/prompt.py +58 -8
- openaivec/provider.py +10 -0
- openaivec/proxy.py +82 -65
- openaivec/responses.py +35 -18
- openaivec/spark.py +40 -34
- openaivec/task/customer_support/inquiry_classification.py +9 -9
- openaivec/task/customer_support/urgency_analysis.py +13 -13
- openaivec/task/nlp/keyword_extraction.py +2 -2
- openaivec/task/nlp/named_entity_recognition.py +2 -2
- openaivec/util.py +2 -2
- {openaivec-0.13.4.dist-info → openaivec-0.13.6.dist-info}/METADATA +9 -9
- {openaivec-0.13.4.dist-info → openaivec-0.13.6.dist-info}/RECORD +18 -18
- {openaivec-0.13.4.dist-info → openaivec-0.13.6.dist-info}/WHEEL +0 -0
- {openaivec-0.13.4.dist-info → openaivec-0.13.6.dist-info}/licenses/LICENSE +0 -0
openaivec/embeddings.py
CHANGED
@@ -31,16 +31,17 @@ class BatchEmbeddings:

     client: OpenAI
     model_name: str
-    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=
+    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))

     @classmethod
-    def of(cls, client: OpenAI, model_name: str, batch_size: int =
+    def of(cls, client: OpenAI, model_name: str, batch_size: int | None = None) -> "BatchEmbeddings":
         """Factory constructor.

         Args:
             client (OpenAI): OpenAI client.
             model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
-            batch_size (int, optional): Max unique inputs per API call. Defaults to
+            batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
+                (automatic batch size optimization). Set to a positive integer for fixed batch size.

         Returns:
             BatchEmbeddings: Configured instance backed by a batching proxy.
@@ -127,7 +128,7 @@ class AsyncBatchEmbeddings:
     client: AsyncOpenAI
     model_name: str
     cache: AsyncBatchingMapProxy[str, NDArray[np.float32]] = field(
-        default_factory=lambda: AsyncBatchingMapProxy(batch_size=
+        default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
     )

     @classmethod
@@ -135,7 +136,7 @@ class AsyncBatchEmbeddings:
         cls,
         client: AsyncOpenAI,
         model_name: str,
-        batch_size: int =
+        batch_size: int | None = None,
         max_concurrency: int = 8,
     ) -> "AsyncBatchEmbeddings":
         """Factory constructor.
@@ -143,7 +144,8 @@ class AsyncBatchEmbeddings:
         Args:
             client (AsyncOpenAI): OpenAI async client.
             model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
-            batch_size (int, optional): Max unique inputs per API call. Defaults to
+            batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
+                (automatic batch size optimization). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.

         Returns:
@@ -155,8 +157,8 @@ class AsyncBatchEmbeddings:
             cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
         )

-    @observe(_LOGGER)
     @backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
+    @observe(_LOGGER)
     async def _embed_chunk(self, inputs: List[str]) -> List[NDArray[np.float32]]:
         """Embed one minibatch of strings asynchronously.

@@ -186,4 +188,4 @@ class AsyncBatchEmbeddings:
         Returns:
             List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
         """
-        return await self.cache.map(inputs, self._embed_chunk)
+        return await self.cache.map(inputs, self._embed_chunk)  # type: ignore[arg-type]
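
The upshot of these hunks: `batch_size` now defaults to `None`, which hands chunk sizing to the batching proxy's automatic optimization, and the decorator order on `_embed_chunk` was swapped so that `backoff_async` is now the outermost wrapper (each retry attempt passes through the `observe` logging layer). A minimal sketch of the new call pattern, using only the `of(...)` signatures shown above; the client construction and model name are illustrative:

```python
from openai import AsyncOpenAI, OpenAI

from openaivec.embeddings import AsyncBatchEmbeddings, BatchEmbeddings

# Omitting batch_size now means "let the proxy pick": the BatchingMapProxy
# receives batch_size=None and sizes chunks automatically.
sync_embedder = BatchEmbeddings.of(client=OpenAI(), model_name="text-embedding-3-small")

# A positive integer opts back into fixed-size batches; max_concurrency
# still bounds parallel API calls on the async variant.
async_embedder = AsyncBatchEmbeddings.of(
    client=AsyncOpenAI(),
    model_name="text-embedding-3-small",
    batch_size=64,
    max_concurrency=8,
)
```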
openaivec/model.py
CHANGED
@@ -1,13 +1,11 @@
 from dataclasses import dataclass
-from typing import Type, TypeVar
+from typing import Generic, Type, TypeVar

-
-
-ResponseFormat = TypeVar("ResponseFormat", bound=BaseModel | str)
+ResponseFormat = TypeVar("ResponseFormat")


 @dataclass(frozen=True)
-class PreparedTask:
+class PreparedTask(Generic[ResponseFormat]):
     """A data class representing a complete task configuration for OpenAI API calls.

     This class encapsulates all the necessary parameters for executing a task,
@@ -84,10 +82,10 @@ class OpenAIAPIKey:
     """Container for OpenAI API key configuration.

     Attributes:
-        value (str): The API key for OpenAI services.
+        value (str | None): The API key for OpenAI services.
     """

-    value: str
+    value: str | None


 @dataclass(frozen=True)
@@ -95,10 +93,10 @@ class AzureOpenAIAPIKey:
     """Container for Azure OpenAI API key configuration.

     Attributes:
-        value (str): The API key for Azure OpenAI services.
+        value (str | None): The API key for Azure OpenAI services.
     """

-    value: str
+    value: str | None


 @dataclass(frozen=True)
@@ -106,10 +104,10 @@ class AzureOpenAIBaseURL:
     """Container for Azure OpenAI base URL configuration.

     Attributes:
-        value (str): The base URL for Azure OpenAI services.
+        value (str | None): The base URL for Azure OpenAI services.
     """

-    value: str
+    value: str | None


 @dataclass(frozen=True)
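
With `PreparedTask` now `Generic[ResponseFormat]` (and the TypeVar unbounded), annotations can carry a task's response type through to consumers. A hypothetical sketch; the constructor fields of `PreparedTask` are not shown in these hunks, so only type annotations are used:

```python
from pydantic import BaseModel

from openaivec.model import PreparedTask


class Sentiment(BaseModel):
    label: str
    score: float


# A type checker now knows this task yields Sentiment instances rather than
# an unparameterized result.
def consume(task: PreparedTask[Sentiment]) -> None: ...


# Dropping the old bound (BaseModel | str) admits any response type, e.g.
# PreparedTask[str] for plain-text tasks.
def consume_plain(task: PreparedTask[str]) -> None: ...
```

The `value: str | None` loosening on the credential containers pairs with the provider.py changes below, where presence is checked at client-construction time.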
openaivec/optimize.py
CHANGED
@@ -21,7 +21,7 @@ class BatchSizeSuggester:
     min_duration: float = 30.0
     max_duration: float = 60.0
     step_ratio: float = 0.1
-    sample_size: int =
+    sample_size: int = 4
     _history: List[PerformanceMetric] = field(default_factory=list)
     _lock: threading.RLock = field(default_factory=threading.RLock, repr=False)
     _batch_size_changed_at: datetime | None = field(default=None, init=False)
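
Only the `sample_size` default (now 4) changes here, but the surrounding fields hint at the mechanism: the suggester watches recent call durations and nudges the batch size by `step_ratio` to keep batches inside the 30–60 s window. A conceptual sketch, not the library's implementation, built only from the fields visible in this hunk:

```python
# Illustrative only: grow the batch when the last `sample_size` calls finish
# too quickly, shrink it when they run too long.
def suggest(current: int, durations: list[float],
            min_duration: float = 30.0, max_duration: float = 60.0,
            step_ratio: float = 0.1, sample_size: int = 4) -> int:
    if len(durations) < sample_size:
        return current                                   # not enough samples yet
    avg = sum(durations[-sample_size:]) / sample_size
    if avg < min_duration:
        return max(1, int(current * (1 + step_ratio)))   # room to grow
    if avg > max_duration:
        return max(1, int(current * (1 - step_ratio)))   # back off
    return current
```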
openaivec/pandas_ext.py
CHANGED
@@ -42,7 +42,7 @@ to easily interact with OpenAI APIs for tasks like generating responses or embed
 import inspect
 import json
 import logging
-from typing import
+from typing import Awaitable, Callable, List, Type, TypeVar

 import numpy as np
 import pandas as pd
@@ -184,6 +184,7 @@ class OpenAIVecSeriesAccessor:
         Args:
             cache (BatchingMapProxy[str, np.ndarray]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.

         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -217,7 +218,7 @@ class OpenAIVecSeriesAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         show_progress: bool = False,
@@ -247,8 +248,9 @@ class OpenAIVecSeriesAccessor:
             instructions (str): System prompt prepended to every user message.
             response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -266,7 +268,7 @@ class OpenAIVecSeriesAccessor:

     def task_with_cache(
         self,
-        task: PreparedTask,
+        task: PreparedTask[ResponseFormat],
         cache: BatchingMapProxy[str, ResponseFormat],
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache.
@@ -280,6 +282,7 @@ class OpenAIVecSeriesAccessor:
                 response format, and other parameters for processing the inputs.
             cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.

         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -311,7 +314,7 @@ class OpenAIVecSeriesAccessor:
         )
         return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)

-    def task(self, task: PreparedTask, batch_size: int =
+    def task(self, task: PreparedTask, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
         """Execute a prepared task on every Series element.

         This method applies a pre-configured task to each element in the Series,
@@ -343,8 +346,9 @@ class OpenAIVecSeriesAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
@@ -356,7 +360,7 @@ class OpenAIVecSeriesAccessor:
             cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
         )

-    def embeddings(self, batch_size: int =
+    def embeddings(self, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
         """Compute OpenAI embeddings for every Series element.

         Example:
@@ -378,8 +382,9 @@ class OpenAIVecSeriesAccessor:
         The default embedding model is `text-embedding-3-small`.

         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
@@ -494,6 +499,7 @@ class OpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             cache (BatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
@@ -538,7 +544,7 @@ class OpenAIVecDataFrameAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         show_progress: bool = False,
@@ -573,8 +579,9 @@ class OpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -590,7 +597,7 @@ class OpenAIVecDataFrameAccessor:
             top_p=top_p,
         )

-    def task(self, task: PreparedTask, batch_size: int =
+    def task(self, task: PreparedTask, batch_size: int | None = None, show_progress: bool = False) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serialising it to JSON.

         This method applies a pre-configured task to each row in the DataFrame,
@@ -618,8 +625,9 @@ class OpenAIVecDataFrameAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
@@ -634,7 +642,7 @@ class OpenAIVecDataFrameAccessor:
             )
         )

-    def fillna(self, target_column_name: str, max_examples: int = 500, batch_size: int =
+    def fillna(self, target_column_name: str, max_examples: int = 500, batch_size: int | None = None) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference.

         This method uses machine learning to intelligently fill missing (NaN) values
@@ -648,8 +656,9 @@ class OpenAIVecDataFrameAccessor:
             max_examples (int, optional): The maximum number of example rows to use
                 for context when predicting missing values. Higher values may improve
                 accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.

         Returns:
             pandas.DataFrame: A new DataFrame with missing values filled in the target
@@ -721,7 +730,7 @@ class OpenAIVecDataFrameAccessor:
         return self._obj.apply(
             lambda row: np.dot(row[col1], row[col2]) / (np.linalg.norm(row[col1]) * np.linalg.norm(row[col2])),
             axis=1,
-        ).rename("similarity")
+        ).rename("similarity")  # type: ignore[arg-type]


 @pd.api.extensions.register_series_accessor("aio")
@@ -750,6 +759,7 @@ class AsyncOpenAIVecSeriesAccessor:
             instructions (str): System prompt prepended to every user message.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
@@ -804,6 +814,7 @@ class AsyncOpenAIVecSeriesAccessor:
         Args:
             cache (AsyncBatchingMapProxy[str, np.ndarray]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.

         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
@@ -844,7 +855,7 @@ class AsyncOpenAIVecSeriesAccessor:

     async def task_with_cache(
         self,
-        task: PreparedTask,
+        task: PreparedTask[ResponseFormat],
         cache: AsyncBatchingMapProxy[str, ResponseFormat],
     ) -> pd.Series:
         """Execute a prepared task on every Series element using a provided cache (asynchronously).
@@ -859,6 +870,7 @@ class AsyncOpenAIVecSeriesAccessor:
                 response format, and other parameters for processing the inputs.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.

         Returns:
             pandas.Series: Series whose values are instances of the task's
@@ -902,7 +914,7 @@ class AsyncOpenAIVecSeriesAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         max_concurrency: int = 8,
@@ -934,8 +946,9 @@ class AsyncOpenAIVecSeriesAccessor:
             instructions (str): System prompt prepended to every user message.
             response_format (Type[ResponseFormat], optional): Pydantic model or built‑in
                 type the assistant should return. Defaults to ``str``.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request. Defaults to ``
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
@@ -959,7 +972,7 @@ class AsyncOpenAIVecSeriesAccessor:
         )

     async def embeddings(
-        self, batch_size: int =
+        self, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element (asynchronously).

@@ -983,8 +996,9 @@ class AsyncOpenAIVecSeriesAccessor:
         The default embedding model is `text-embedding-3-small`.

         Args:
-            batch_size (int, optional): Number of inputs grouped into a
-                single request. Defaults to ``
+            batch_size (int | None, optional): Number of inputs grouped into a
+                single request. Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -1003,7 +1017,7 @@ class AsyncOpenAIVecSeriesAccessor:
         )

     async def task(
-        self, task: PreparedTask, batch_size: int =
+        self, task: PreparedTask, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
         """Execute a prepared task on every Series element (asynchronously).

@@ -1037,8 +1051,9 @@ class AsyncOpenAIVecSeriesAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of prompts grouped into a single
-                request to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of prompts grouped into a single
+                request to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -1084,6 +1099,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             cache (AsyncBatchingMapProxy[str, ResponseFormat]): Pre-configured cache
                 instance for managing API call batching and deduplication.
+                Set cache.batch_size=None to enable automatic batch size optimization.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
@@ -1134,7 +1150,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         self,
         instructions: str,
         response_format: Type[ResponseFormat] = str,
-        batch_size: int =
+        batch_size: int | None = None,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         max_concurrency: int = 8,
@@ -1171,8 +1187,9 @@ class AsyncOpenAIVecDataFrameAccessor:
             instructions (str): System prompt for the assistant.
             response_format (Type[ResponseFormat], optional): Desired Python type of the
                 responses. Defaults to ``str``.
-            batch_size (int, optional): Number of requests sent in one batch.
-                Defaults to ``
+            batch_size (int | None, optional): Number of requests sent in one batch.
+                Defaults to ``None`` (automatic batch size optimization
+                based on execution time). Set to a positive integer for fixed batch size.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
@@ -1196,7 +1213,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         )

     async def task(
-        self, task: PreparedTask, batch_size: int =
+        self, task: PreparedTask, batch_size: int | None = None, max_concurrency: int = 8, show_progress: bool = False
     ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).

@@ -1235,8 +1252,9 @@ class AsyncOpenAIVecDataFrameAccessor:
         Args:
             task (PreparedTask): A pre-configured task containing instructions,
                 response format, and other parameters for processing the inputs.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
             show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.
@@ -1286,7 +1304,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         else:
             return result

-    async def assign(self, **kwargs
+    async def assign(self, **kwargs) -> pd.DataFrame:
         """Asynchronously assign new columns to the DataFrame, evaluating sequentially.

         This method extends pandas' `assign` method by supporting asynchronous
@@ -1321,7 +1339,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         ```

         Args:
-            **kwargs:
+            **kwargs: Column names as keys and either static values or callables
                 (synchronous or asynchronous) as values.

         Returns:
@@ -1346,7 +1364,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         return df_current

     async def fillna(
-        self, target_column_name: str, max_examples: int = 500, batch_size: int =
+        self, target_column_name: str, max_examples: int = 500, batch_size: int | None = None, max_concurrency: int = 8
     ) -> pd.DataFrame:
         """Fill missing values in a DataFrame column using AI-powered inference (asynchronously).

@@ -1361,8 +1379,9 @@ class AsyncOpenAIVecDataFrameAccessor:
             max_examples (int, optional): The maximum number of example rows to use
                 for context when predicting missing values. Higher values may improve
                 accuracy but increase API costs and processing time. Defaults to 500.
-            batch_size (int, optional): Number of requests sent in one batch
-                to optimize API usage. Defaults to
+            batch_size (int | None, optional): Number of requests sent in one batch
+                to optimize API usage. Defaults to ``None`` (automatic batch size
+                optimization based on execution time). Set to a positive integer for fixed batch size.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.

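
Across both accessors the pattern is uniform: omit `batch_size` and the proxy sizes batches from observed execution time; pass an integer for fixed batches. A sketch of the resulting call sites, assuming the synchronous accessor is registered as `.ai` and the response method is named `responses` (only the async `.aio` registration and truncated signatures appear in these hunks):

```python
import pandas as pd

s = pd.Series(["good", "bad", "mixed feelings"])

# batch_size defaults to None: automatic batch size optimization.
labels = s.ai.responses("Classify the sentiment of each text.")
vectors = s.ai.embeddings(show_progress=True)


# The async variant keeps explicit knobs; this signature is shown verbatim
# in the hunks above.
async def embed_fixed() -> pd.Series:
    return await s.aio.embeddings(batch_size=32, max_concurrency=8, show_progress=True)
```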
openaivec/prompt.py
CHANGED
@@ -44,7 +44,7 @@ this will produce an XML string that looks like this:

 import difflib
 import logging
-from typing import
+from typing import List
 from xml.etree import ElementTree

 from openai import OpenAI
@@ -126,6 +126,7 @@ _PROMPT: str = """
     Receive the prompt in JSON format with fields "purpose",
     "cautions", and "examples". Ensure the entire prompt is free
     from logical contradictions, redundancies, and ambiguities.
+    IMPORTANT: The "examples" array must always contain at least one example throughout all iterations.
   </Instruction>
   <Instruction id="2">
     - Modify only one element per iteration among “purpose”, “examples”, or
@@ -155,8 +156,10 @@ _PROMPT: str = """
   </Instruction>
   <Instruction id="6">
     In the "examples" field, enhance the examples to cover a wide range of scenarios.
+    CRITICAL: The examples array must NEVER be empty - always maintain at least one example.
     Add as many non-redundant examples as possible,
     since having more examples leads to better coverage and understanding.
+    You may modify existing examples or add new ones, but never remove all examples.
   </Instruction>
   <Instruction id="7">
     Verify that the improved prompt adheres to the Request and
@@ -166,6 +169,7 @@ _PROMPT: str = """
     Generate the final refined FewShotPrompt as an iteration in
     the Response, ensuring the final output is consistent,
     unambiguous, and free from any redundancies or contradictions.
+    MANDATORY: Verify that the examples array contains at least one example before completing.
   </Instruction>
 </Instructions>
 <Example>
@@ -339,11 +343,29 @@ def _render_prompt(prompt: FewShotPrompt) -> str:


 class FewShotPromptBuilder:
+    """Builder for creating few-shot prompts with validation.
+
+    Usage:
+        builder = (FewShotPromptBuilder()
+            .purpose("Your task description")
+            .example("input1", "output1")  # At least one required
+            .example("input2", "output2")
+            .build())
+
+    Note:
+        Both .purpose() and at least one .example() call are required before
+        calling .build(), .improve(), or .get_object().
+    """
+
     _prompt: FewShotPrompt
     _steps: List[Step]

     def __init__(self):
-        """Initialize an empty FewShotPromptBuilder.
+        """Initialize an empty FewShotPromptBuilder.
+
+        Note:
+            You must call .purpose() and at least one .example() before building.
+        """
         self._prompt = FewShotPrompt(purpose="", cautions=[], examples=[])

     @classmethod
@@ -402,6 +424,8 @@ class FewShotPromptBuilder:
     ) -> "FewShotPromptBuilder":
         """Add a single input/output example.

+        At least one example is required before calling .build(), .improve(), or .get_object().
+
         Args:
             input_value (str | BaseModel): Example input; if a Pydantic model is
                 provided it is serialised to JSON.
@@ -442,7 +466,13 @@ class FewShotPromptBuilder:

         Returns:
             FewShotPromptBuilder: The current builder instance containing the refined prompt and iteration history.
+
+        Raises:
+            ValueError: If the prompt is not valid (missing purpose or examples).
         """
+        # Validate before making API call to provide early feedback
+        self._validate()
+
         _client = client or CONTAINER.resolve(OpenAI)
         _model_name = model_name or CONTAINER.resolve(ResponsesModelName).value

@@ -459,12 +489,25 @@ class FewShotPromptBuilder:
         self._steps = [Step(id=0, analysis="Original Prompt", prompt=self._prompt)]

         # add the histories
-
-
+        if response.output_parsed:
+            for step in response.output_parsed.iterations:
+                self._steps.append(step)

         # set the final prompt
         self._prompt = self._steps[-1].prompt

+        # Validate the improved prompt to ensure examples weren't removed by LLM
+        try:
+            self._validate()
+        except ValueError as e:
+            _logger.warning(f"LLM produced invalid prompt during improve(): {e}")
+            # Restore original prompt if LLM produced invalid result
+            self._prompt = self._steps[0].prompt
+            raise ValueError(
+                f"LLM improvement failed to maintain required fields: {e}. "
+                "This may indicate an issue with the improvement instructions or model behavior."
+            )
+
         return self

     def explain(self) -> "FewShotPromptBuilder":
@@ -500,9 +543,14 @@ class FewShotPromptBuilder:
         """
         # Validate that 'purpose' and 'examples' are not empty.
         if not self._prompt.purpose:
-            raise ValueError(
+            raise ValueError(
+                "Purpose is required. Please call .purpose('your purpose description') before building the prompt."
+            )
         if not self._prompt.examples or len(self._prompt.examples) == 0:
-            raise ValueError(
+            raise ValueError(
+                "At least one example is required. Please add examples using "
+                ".example('input', 'output') before building the prompt."
+            )

     def get_object(self) -> FewShotPrompt:
         """Return the underlying FewShotPrompt object.
@@ -522,11 +570,13 @@ class FewShotPromptBuilder:
         self._validate()
         return self.build_xml()

-    def build_json(self, **kwargs
+    def build_json(self, **kwargs) -> str:
         """Build and return the prompt as a JSON string.

         Args:
-            **kwargs: Keyword arguments forwarded to ``model_dump_json``.
+            **kwargs: Keyword arguments forwarded to Pydantic's ``model_dump_json``.
+                Common options include ``indent``, ``include``, ``exclude``,
+                ``by_alias``, ``exclude_unset``, ``exclude_defaults``, ``exclude_none``.

         Returns:
             str: JSON representation of the prompt.
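
The new fail-fast validation is easiest to see end to end: the messages below are the ones added in `_validate()`, and `improve()` now also validates before calling the API and re-validates afterwards (restoring the original prompt) if the model drops all examples. A short usage sketch:

```python
from openaivec.prompt import FewShotPromptBuilder

builder = FewShotPromptBuilder().purpose("Normalize country names to ISO 3166-1 alpha-2")

try:
    builder.build()  # no example added yet
except ValueError as e:
    print(e)  # "At least one example is required. ..."

prompt_xml = (
    builder
    .example("United States of America", "US")
    .example("Nippon", "JP")
    .build()  # validates, then renders XML via build_xml()
)
```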
openaivec/provider.py
CHANGED
@@ -65,6 +65,11 @@ def provide_openai_client() -> OpenAI:
     azure_api_version = CONTAINER.resolve(AzureOpenAIAPIVersion)

     if all(param.value for param in [azure_api_key, azure_base_url, azure_api_version]):
+        # Type checker support: values are guaranteed non-None by the all() check above
+        assert azure_api_key.value is not None
+        assert azure_base_url.value is not None
+        assert azure_api_version.value is not None
+
         _check_azure_v1_api_url(azure_base_url.value)
         return AzureOpenAI(
             api_key=azure_api_key.value,
@@ -103,6 +108,11 @@ def provide_async_openai_client() -> AsyncOpenAI:
     azure_api_version = CONTAINER.resolve(AzureOpenAIAPIVersion)

     if all(param.value for param in [azure_api_key, azure_base_url, azure_api_version]):
+        # Type checker support: values are guaranteed non-None by the all() check above
+        assert azure_api_key.value is not None
+        assert azure_base_url.value is not None
+        assert azure_api_version.value is not None
+
         _check_azure_v1_api_url(azure_base_url.value)
         return AsyncAzureOpenAI(
             api_key=azure_api_key.value,
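
The added asserts are the standard narrowing idiom: `all(...)` over a list proves the values are truthy at runtime, but type checkers such as mypy and pyright do not narrow `str | None` through it, so each value is asserted individually before use. A minimal standalone illustration of the pattern (not the library's code):

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Setting:
    value: str | None


def endpoint(api_key: Setting, base_url: Setting) -> str:
    if all(s.value for s in [api_key, base_url]):
        # all() does not narrow Optional for the type checker; assert does.
        assert api_key.value is not None
        assert base_url.value is not None
        return f"{base_url.value}?key={api_key.value}"
    raise RuntimeError("missing configuration")
```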