openaivec 0.13.2__tar.gz → 0.13.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openaivec-0.13.2 → openaivec-0.13.3}/PKG-INFO +4 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/README.md +1 -1
- {openaivec-0.13.2 → openaivec-0.13.3}/pyproject.toml +22 -0
- openaivec-0.13.3/src/openaivec/__init__.py +9 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/di.py +3 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/embeddings.py +5 -4
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/pandas_ext.py +129 -21
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/prompt.py +34 -13
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/provider.py +3 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/proxy.py +166 -28
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/responses.py +6 -5
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/serialize.py +1 -1
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/spark.py +8 -7
- openaivec-0.13.3/src/openaivec/task/customer_support/__init__.py +26 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/customer_support/customer_sentiment.py +12 -4
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/customer_support/inquiry_classification.py +11 -4
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/customer_support/inquiry_summary.py +8 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/customer_support/intent_analysis.py +10 -4
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/customer_support/response_suggestion.py +10 -4
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/customer_support/urgency_analysis.py +8 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/dependency_parsing.py +4 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/keyword_extraction.py +3 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/morphological_analysis.py +4 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/named_entity_recognition.py +4 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/sentiment_analysis.py +7 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/translation.py +1 -1
- openaivec-0.13.3/src/openaivec/task/table/__init__.py +3 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/table/fillna.py +4 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/util.py +0 -1
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_pandas_ext.py +4 -2
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_prompt.py +44 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_provider.py +1 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_proxy.py +250 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_util.py +2 -1
- {openaivec-0.13.2 → openaivec-0.13.3}/uv.lock +1258 -1207
- openaivec-0.13.2/src/openaivec/__init__.py +0 -9
- openaivec-0.13.2/src/openaivec/task/customer_support/__init__.py +0 -32
- openaivec-0.13.2/src/openaivec/task/table/__init__.py +0 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/.env.example +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/.github/workflows/python-mkdocs.yml +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/.github/workflows/python-package.yml +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/.github/workflows/python-test.yml +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/.github/workflows/python-update.yml +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/.gitignore +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/CODE_OF_CONDUCT.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/LICENSE +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/SECURITY.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/SUPPORT.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/di.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/embeddings.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/pandas_ext.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/prompt.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/proxy.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/responses.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/spark.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/task.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/tasks/nlp/translation.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/api/util.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/index.md +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/docs/robots.txt +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/mkdocs.yml +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/log.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/model.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/__init__.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/task/nlp/__init__.py +3 -3
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/__init__.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_di.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_embeddings.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_responses.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_serialize.py +0 -0
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_spark.py +4 -4
- {openaivec-0.13.2 → openaivec-0.13.3}/tests/test_task.py +0 -0
{openaivec-0.13.2 → openaivec-0.13.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 0.13.2
+Version: 0.13.3
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec

@@ -15,9 +15,11 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.10
+Requires-Dist: ipywidgets>=8.1.7
 Requires-Dist: openai>=1.74.0
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: tiktoken>=0.9.0
+Requires-Dist: tqdm>=4.67.1
 Provides-Extra: spark
 Requires-Dist: pyspark>=3.5.5; extra == 'spark'
 Description-Content-Type: text/markdown

@@ -590,7 +592,7 @@ improved_prompt: str = (
     .example("Apple", "Color")
     .example("Apple", "Animal")
     # improve the prompt with OpenAI's API
-    .improve(
+    .improve()
     .build()
 )
 print(improved_prompt)

{openaivec-0.13.2 → openaivec-0.13.3}/pyproject.toml

@@ -26,9 +26,11 @@ classifiers = [

 requires-python = ">=3.10"
 dependencies = [
+    "ipywidgets>=8.1.7",
     "openai>=1.74.0",
     "pandas>=2.2.3",
     "tiktoken>=0.9.0",
+    "tqdm>=4.67.1",
 ]

 [dependency-groups]

@@ -62,6 +64,26 @@ spark = [
 line-length = 120
 target-version = "py310"

+[tool.ruff.lint]
+select = [
+    "E",    # pycodestyle errors
+    "W",    # pycodestyle warnings
+    "F",    # pyflakes
+    "I",    # isort
+    "TID",  # flake8-tidy-imports
+]
+# ignore = []  # E501 stays enabled globally
+
+[tool.ruff.lint.flake8-tidy-imports]
+# Enforce absolute imports - ban relative imports (except in __init__.py files)
+ban-relative-imports = "all"
+
+[tool.ruff.lint.per-file-ignores]
+# Allow relative imports in __init__.py files
+"**/__init__.py" = ["TID252"]
+# Test files contain long test data - ignore line length
+"tests/**/*.py" = ["E501"]
+
 [project.urls]
 Homepage = "https://microsoft.github.io/openaivec/"
 Repository = "https://github.com/microsoft/openaivec"

{openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/di.py

@@ -11,14 +11,14 @@ are created once and reused across multiple resolve calls.
 Example:
     ```python
     from openaivec.di import Container
-
+
     class DatabaseService:
         def __init__(self):
             self.connection = "database://localhost"
-
+
     container = Container()
     container.register(DatabaseService, lambda: DatabaseService())
-
+
     db1 = container.resolve(DatabaseService)
     db2 = container.resolve(DatabaseService)
     print(db1 is db2)  # True - same instance

{openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/embeddings.py

@@ -6,9 +6,9 @@ import numpy as np
 from numpy.typing import NDArray
 from openai import AsyncOpenAI, InternalServerError, OpenAI, RateLimitError

-from .log import observe
-from .proxy import AsyncBatchingMapProxy, BatchingMapProxy
-from .util import backoff, backoff_async
+from openaivec.log import observe
+from openaivec.proxy import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec.util import backoff, backoff_async

 __all__ = [
     "BatchEmbeddings",

@@ -24,7 +24,8 @@ class BatchEmbeddings:

     Attributes:
         client (OpenAI): Configured OpenAI client.
-        model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
+        model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
+            (e.g., ``"text-embedding-3-small"``).
         cache (BatchingMapProxy[str, NDArray[np.float32]]): Batching proxy for ordered, cached mapping.
     """

{openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/pandas_ext.py

@@ -50,12 +50,12 @@ import tiktoken
 from openai import AsyncOpenAI, OpenAI
 from pydantic import BaseModel

-from .embeddings import AsyncBatchEmbeddings, BatchEmbeddings
-from .model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
-from .provider import CONTAINER, _check_azure_v1_api_url
-from .proxy import AsyncBatchingMapProxy, BatchingMapProxy
-from .responses import AsyncBatchResponses, BatchResponses
-from .task.table import FillNaResponse, fillna
+from openaivec.embeddings import AsyncBatchEmbeddings, BatchEmbeddings
+from openaivec.model import EmbeddingsModelName, PreparedTask, ResponseFormat, ResponsesModelName
+from openaivec.provider import CONTAINER, _check_azure_v1_api_url
+from openaivec.proxy import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec.responses import AsyncBatchResponses, BatchResponses
+from openaivec.task.table import FillNaResponse, fillna

 __all__ = [
     "use",

@@ -220,13 +220,23 @@ class OpenAIVecSeriesAccessor:
         batch_size: int = 128,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
+        show_progress: bool = False,
     ) -> pd.Series:
         """Call an LLM once for every Series element.

         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.responses("translate to French")
+
+            # With progress bar in Jupyter notebooks
+            large_series = pd.Series(["data"] * 1000)
+            large_series.ai.responses(
+                "analyze this data",
+                batch_size=32,
+                show_progress=True
+            )
             ```
         This method returns a Series of strings, each containing the
         assistant's response to the corresponding input.

@@ -241,13 +251,14 @@ class OpenAIVecSeriesAccessor:
                 request. Defaults to ``128``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.
         """
         return self.responses_with_cache(
             instructions=instructions,
-            cache=BatchingMapProxy(batch_size=batch_size),
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,

@@ -300,7 +311,7 @@ class OpenAIVecSeriesAccessor:
         )
         return pd.Series(client.parse(self._obj.tolist()), index=self._obj.index, name=self._obj.name)

-    def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
+    def task(self, task: PreparedTask, batch_size: int = 128, show_progress: bool = False) -> pd.Series:
         """Execute a prepared task on every Series element.

         This method applies a pre-configured task to each element in the Series,

@@ -315,7 +326,16 @@ class OpenAIVecSeriesAccessor:
             sentiment_task = PreparedTask(...)

             reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
+            # Basic usage
             results = reviews.ai.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = large_reviews.ai.task(
+                sentiment_task,
+                batch_size=50,
+                show_progress=True
+            )
             ```
         This method returns a Series containing the task results for each
         corresponding input element, following the task's defined structure.

@@ -325,6 +345,7 @@ class OpenAIVecSeriesAccessor:
                 response format, and other parameters for processing the inputs.
             batch_size (int, optional): Number of prompts grouped into a single
                 request to optimize API usage. Defaults to 128.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are instances of the task's

@@ -332,16 +353,24 @@ class OpenAIVecSeriesAccessor:
         """
         return self.task_with_cache(
             task=task,
-            cache=BatchingMapProxy(batch_size=batch_size),
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
         )

-    def embeddings(self, batch_size: int = 128) -> pd.Series:
+    def embeddings(self, batch_size: int = 128, show_progress: bool = False) -> pd.Series:
         """Compute OpenAI embeddings for every Series element.

         Example:
             ```python
             animals = pd.Series(["cat", "dog", "elephant"])
+            # Basic usage
             animals.ai.embeddings()
+
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = large_texts.ai.embeddings(
+                batch_size=100,
+                show_progress=True
+            )
             ```
         This method returns a Series of numpy arrays, each containing the
         embedding vector for the corresponding input.

@@ -351,13 +380,14 @@ class OpenAIVecSeriesAccessor:
         Args:
             batch_size (int, optional): Number of inputs grouped into a
                 single request. Defaults to ``128``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects
                 (dtype ``float32``).
         """
         return self.embeddings_with_cache(
-            cache=BatchingMapProxy(batch_size=batch_size),
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
         )

     def count_tokens(self) -> pd.Series:

@@ -511,6 +541,7 @@ class OpenAIVecDataFrameAccessor:
         batch_size: int = 128,
         temperature: float | None = 0.0,
         top_p: float = 1.0,
+        show_progress: bool = False,
     ) -> pd.Series:
         """Generate a response for each row after serialising it to JSON.


@@ -521,7 +552,16 @@ class OpenAIVecDataFrameAccessor:
                 {"name": "dog", "legs": 4},
                 {"name": "elephant", "legs": 4},
             ])
+            # Basic usage
             df.ai.responses("what is the animal's name?")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            large_df.ai.responses(
+                "generate a name for this ID",
+                batch_size=20,
+                show_progress=True
+            )
             ```
         This method returns a Series of strings, each containing the
         assistant's response to the corresponding input.

@@ -537,19 +577,20 @@ class OpenAIVecDataFrameAccessor:
                 Defaults to ``128``.
             temperature (float, optional): Sampling temperature. Defaults to ``0.0``.
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Responses aligned with the DataFrame's original index.
         """
         return self.responses_with_cache(
             instructions=instructions,
-            cache=BatchingMapProxy(batch_size=batch_size),
+            cache=BatchingMapProxy(batch_size=batch_size, show_progress=show_progress),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
         )

-    def task(self, task: PreparedTask, batch_size: int = 128) -> pd.Series:
+    def task(self, task: PreparedTask, batch_size: int = 128, show_progress: bool = False) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serialising it to JSON.

         This method applies a pre-configured task to each row in the DataFrame,

@@ -579,6 +620,7 @@ class OpenAIVecDataFrameAccessor:
                 response format, and other parameters for processing the inputs.
             batch_size (int, optional): Number of requests sent in one batch
                 to optimize API usage. Defaults to 128.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are instances of the task's

@@ -588,7 +630,7 @@ class OpenAIVecDataFrameAccessor:
             lambda df: (
                 df.pipe(lambda df: pd.Series(df.to_dict(orient="records"), index=df.index, name="record"))
                 .map(lambda x: json.dumps(x, ensure_ascii=False))
-                .ai.task(task=task, batch_size=batch_size)
+                .ai.task(task=task, batch_size=batch_size, show_progress=show_progress)
             )
         )

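The hunks above thread a new `show_progress` flag from the synchronous `.ai` accessors into `BatchingMapProxy`, which is what the new `ipywidgets`/`tqdm` dependencies support. A minimal sketch of the combined usage based on the signatures shown in this diff; the input data, instruction text, and `Sentiment` model are illustrative, and it assumes the accessor is registered by importing `openaivec.pandas_ext` as in the project README:

```python
import pandas as pd
from pydantic import BaseModel

from openaivec import pandas_ext  # noqa: F401  (importing registers the .ai accessor)


class Sentiment(BaseModel):
    label: str
    confidence: float


# Hypothetical input data.
reviews = pd.Series(["Great product!", "Arrived broken", "Works as described"] * 200)

# 32 prompts per request; show_progress=True renders a progress bar
# (tqdm/ipywidgets) while batches complete in a notebook.
labels = reviews.ai.responses(
    "Classify the sentiment of this review.",
    response_format=Sentiment,
    batch_size=32,
    show_progress=True,
)
```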
@@ -864,6 +906,7 @@ class AsyncOpenAIVecSeriesAccessor:
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         max_concurrency: int = 8,
+        show_progress: bool = False,
     ) -> pd.Series:
         """Call an LLM once for every Series element (asynchronously).


@@ -872,6 +915,15 @@ class AsyncOpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             # Must be awaited
             results = await animals.aio.responses("translate to French")
+
+            # With progress bar for large datasets
+            large_series = pd.Series(["data"] * 1000)
+            results = await large_series.aio.responses(
+                "analyze this data",
+                batch_size=32,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
         This method returns a Series of strings, each containing the
         assistant's response to the corresponding input.

@@ -888,6 +940,7 @@ class AsyncOpenAIVecSeriesAccessor:
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are instances of ``response_format``.

@@ -897,13 +950,17 @@ class AsyncOpenAIVecSeriesAccessor:
         """
         return await self.responses_with_cache(
             instructions=instructions,
-            cache=AsyncBatchingMapProxy(
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
         )

-    async def embeddings(
+    async def embeddings(
+        self, batch_size: int = 128, max_concurrency: int = 8, show_progress: bool = False
+    ) -> pd.Series:
         """Compute OpenAI embeddings for every Series element (asynchronously).

         Example:

@@ -911,6 +968,14 @@ class AsyncOpenAIVecSeriesAccessor:
             animals = pd.Series(["cat", "dog", "elephant"])
             # Must be awaited
             embeddings = await animals.aio.embeddings()
+
+            # With progress bar for large datasets
+            large_texts = pd.Series(["text"] * 5000)
+            embeddings = await large_texts.aio.embeddings(
+                batch_size=100,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
         This method returns a Series of numpy arrays, each containing the
         embedding vector for the corresponding input.

@@ -922,6 +987,7 @@ class AsyncOpenAIVecSeriesAccessor:
                 single request. Defaults to ``128``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are ``np.ndarray`` objects

@@ -931,10 +997,14 @@ class AsyncOpenAIVecSeriesAccessor:
         This is an asynchronous method and must be awaited.
         """
         return await self.embeddings_with_cache(
-            cache=AsyncBatchingMapProxy(
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
         )

-    async def task(
+    async def task(
+        self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8, show_progress: bool = False
+    ) -> pd.Series:
         """Execute a prepared task on every Series element (asynchronously).

         This method applies a pre-configured task to each element in the Series,

@@ -951,6 +1021,15 @@ class AsyncOpenAIVecSeriesAccessor:
             reviews = pd.Series(["Great product!", "Not satisfied", "Amazing quality"])
             # Must be awaited
             results = await reviews.aio.task(sentiment_task)
+
+            # With progress bar for large datasets
+            large_reviews = pd.Series(["review text"] * 2000)
+            results = await large_reviews.aio.task(
+                sentiment_task,
+                batch_size=50,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
         This method returns a Series containing the task results for each
         corresponding input element, following the task's defined structure.

@@ -962,6 +1041,7 @@ class AsyncOpenAIVecSeriesAccessor:
                 request to optimize API usage. Defaults to 128.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are instances of the task's

@@ -972,7 +1052,9 @@ class AsyncOpenAIVecSeriesAccessor:
         """
         return await self.task_with_cache(
             task=task,
-            cache=AsyncBatchingMapProxy(
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
         )


@@ -1056,6 +1138,7 @@ class AsyncOpenAIVecDataFrameAccessor:
         temperature: float | None = 0.0,
         top_p: float = 1.0,
         max_concurrency: int = 8,
+        show_progress: bool = False,
     ) -> pd.Series:
         """Generate a response for each row after serialising it to JSON (asynchronously).


@@ -1068,6 +1151,15 @@ class AsyncOpenAIVecDataFrameAccessor:
             ])
             # Must be awaited
             results = await df.aio.responses(\"what is the animal\'s name?\")
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({\"id\": list(range(1000))})
+            results = await large_df.aio.responses(
+                \"generate a name for this ID\",
+                batch_size=20,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
         This method returns a Series of strings, each containing the
         assistant's response to the corresponding input.

@@ -1085,6 +1177,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             top_p (float, optional): Nucleus sampling parameter. Defaults to ``1.0``.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to ``8``.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Responses aligned with the DataFrame's original index.

@@ -1094,13 +1187,17 @@ class AsyncOpenAIVecDataFrameAccessor:
         """
         return await self.responses_with_cache(
             instructions=instructions,
-            cache=AsyncBatchingMapProxy(
+            cache=AsyncBatchingMapProxy(
+                batch_size=batch_size, max_concurrency=max_concurrency, show_progress=show_progress
+            ),
             response_format=response_format,
             temperature=temperature,
             top_p=top_p,
         )

-    async def task(
+    async def task(
+        self, task: PreparedTask, batch_size: int = 128, max_concurrency: int = 8, show_progress: bool = False
+    ) -> pd.Series:
         """Execute a prepared task on each DataFrame row after serialising it to JSON (asynchronously).

         This method applies a pre-configured task to each row in the DataFrame,

@@ -1122,6 +1219,15 @@ class AsyncOpenAIVecDataFrameAccessor:
             ])
             # Must be awaited
             results = await df.aio.task(analysis_task)
+
+            # With progress bar for large datasets
+            large_df = pd.DataFrame({"id": list(range(1000))})
+            results = await large_df.aio.task(
+                analysis_task,
+                batch_size=50,
+                max_concurrency=4,
+                show_progress=True
+            )
             ```
         This method returns a Series containing the task results for each
         corresponding row, following the task's defined structure.

@@ -1133,6 +1239,7 @@ class AsyncOpenAIVecDataFrameAccessor:
                 to optimize API usage. Defaults to 128.
             max_concurrency (int, optional): Maximum number of concurrent
                 requests. Defaults to 8.
+            show_progress (bool, optional): Show progress bar in Jupyter notebooks. Defaults to ``False``.

         Returns:
             pandas.Series: Series whose values are instances of the task's

@@ -1153,6 +1260,7 @@ class AsyncOpenAIVecDataFrameAccessor:
             task=task,
             batch_size=batch_size,
             max_concurrency=max_concurrency,
+            show_progress=show_progress,
         )

     async def pipe(self, func: Callable[[pd.DataFrame], Awaitable[T] | T]) -> T:

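The asynchronous `.aio` accessors gain the same flag alongside `max_concurrency`, with batching handled by `AsyncBatchingMapProxy`. A rough sketch of awaiting the embeddings variant; the texts and parameter values are illustrative, and in a notebook the coroutine would simply be awaited instead of wrapped in `asyncio.run`:

```python
import asyncio

import pandas as pd

from openaivec import pandas_ext  # noqa: F401  (importing registers the .aio accessor)

# Hypothetical input data.
texts = pd.Series(["alpha", "beta", "gamma"] * 500)


async def main() -> None:
    # Up to 4 concurrent requests of 100 inputs each, with a progress bar while running.
    vectors = await texts.aio.embeddings(
        batch_size=100,
        max_concurrency=4,
        show_progress=True,
    )
    print(vectors.iloc[0].shape)  # e.g. (1536,) for text-embedding-3-small


asyncio.run(main())
```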
{openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/prompt.py

@@ -51,6 +51,9 @@ from openai import OpenAI
 from openai.types.responses import ParsedResponse
 from pydantic import BaseModel

+from openaivec.model import ResponsesModelName
+from openaivec.provider import CONTAINER
+
 __all__ = [
     "FewShotPrompt",
     "FewShotPromptBuilder",

@@ -203,7 +206,9 @@ _PROMPT: str = """
     "iterations": [
         {
             "id": 1,
-            "analysis": "The original purpose was vague and did not explicitly state the main objective.
+            "analysis": "The original purpose was vague and did not explicitly state the main objective.
+            This ambiguity could lead to confusion about the task. In this iteration, we refined the purpose to
+            clearly specify that the goal is to determine the correct category for a given word based on its context.",
             "prompt": {
                 "purpose": "Determine the correct category for a given word by analyzing its context for clear meaning.",
                 "cautions": [

@@ -225,7 +230,10 @@ _PROMPT: str = """
         },
         {
             "id": 2,
-            "analysis": "Next, we focused solely on the cautions section. The original cautions were generic and
+            "analysis": "Next, we focused solely on the cautions section. The original cautions were generic and
+            did not mention potential pitfalls like homonyms or polysemy. Failing to address these could result in
+            misclassification. Therefore, we added a specific caution regarding homonyms while keeping the purpose
+            and examples unchanged.",
             "prompt": {
                 "purpose": "Determine the correct category for a given word by analyzing its context for clear meaning.",
                 "cautions": [

@@ -248,7 +256,10 @@ _PROMPT: str = """
         },
         {
             "id": 3,
-            "analysis": "In this step, we improved the examples section to cover a broader range of scenarios and
+            "analysis": "In this step, we improved the examples section to cover a broader range of scenarios and
+            address potential ambiguities. By adding examples that include words with multiple interpretations
+            (such as 'Mercury' for both a planet and an element), we enhance clarity and ensure better coverage.
+            This iteration only modifies the examples section, leaving purpose and cautions intact.",
             "prompt": {
                 "purpose": "Determine the correct category for a given word by analyzing its context for clear meaning.",
                 "cautions": [

@@ -409,28 +420,34 @@ class FewShotPromptBuilder:

     def improve(
         self,
-        client: OpenAI,
-        model_name: str,
-        temperature: float =
-        top_p: float =
+        client: OpenAI | None = None,
+        model_name: str | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
     ) -> "FewShotPromptBuilder":
         """Iteratively refine the prompt using an LLM.

         The method calls a single LLM request that returns multiple
         editing steps and stores each step for inspection.

+        When client is None, automatically creates a client using environment variables:
+        - For OpenAI: ``OPENAI_API_KEY``
+        - For Azure OpenAI: ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_BASE_URL``, ``AZURE_OPENAI_API_VERSION``
+
         Args:
-            client (
-            model_name (str): Model identifier
-            temperature (float
-            top_p (float
+            client (OpenAI | None): Configured OpenAI client. If None, uses DI container with environment variables.
+            model_name (str | None): Model identifier. If None, uses default ``gpt-4.1-mini``.
+            temperature (float | None): Sampling temperature. If None, uses model default.
+            top_p (float | None): Nucleus sampling parameter. If None, uses model default.

         Returns:
             FewShotPromptBuilder: The current builder instance containing the refined prompt and iteration history.
         """
+        _client = client or CONTAINER.resolve(OpenAI)
+        _model_name = model_name or CONTAINER.resolve(ResponsesModelName).value

-        response: ParsedResponse[Response] =
-            model=
+        response: ParsedResponse[Response] = _client.responses.parse(
+            model=_model_name,
             instructions=_PROMPT,
             input=Request(prompt=self._prompt).model_dump_json(),
             temperature=temperature,

@@ -456,6 +473,10 @@ class FewShotPromptBuilder:
         Returns:
             FewShotPromptBuilder: The current builder instance.
         """
+        if not hasattr(self, "_steps") or not self._steps:
+            print("No improvement steps available. Call improve() first.")
+            return self
+
         for previous, current in zip(self._steps, self._steps[1:]):
             print(f"=== Iteration {current.id} ===\n")
             print(f"Instruction: {current.analysis}")

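Because `improve()` now resolves a client and a default model from the DI container when called with no arguments, the README snippet shown earlier in this diff collapses to an argument-free call. A sketch under the assumption that `OPENAI_API_KEY` (or the `AZURE_OPENAI_*` variables) is set and that the builder exposes `purpose()`/`example()` as in the package README; the prompt content here is made up:

```python
from openaivec.prompt import FewShotPromptBuilder

improved_prompt: str = (
    FewShotPromptBuilder()
    .purpose("Return the category of the given word")  # assumed builder method from the README
    .example("Apple", "Fruit")
    .example("Apple", "Color")
    # improve the prompt with OpenAI's API; client/model_name are optional in 0.13.3
    .improve()
    .build()
)
print(improved_prompt)
```

The iteration-history printer patched in the last hunk above now guards against being called before `improve()`, so inspecting the steps on a fresh builder prints a notice instead of raising.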
{openaivec-0.13.2 → openaivec-0.13.3}/src/openaivec/provider.py

@@ -4,8 +4,8 @@ import warnings
 import tiktoken
 from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI

-from
-from .model import (
+from openaivec import di
+from openaivec.model import (
     AzureOpenAIAPIKey,
     AzureOpenAIAPIVersion,
     AzureOpenAIBaseURL,

@@ -13,7 +13,7 @@ from .model import (
     OpenAIAPIKey,
     ResponsesModelName,
 )
-from .util import TextChunker
+from openaivec.util import TextChunker

 CONTAINER = di.Container()
