opik-optimizer 0.7.7__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer-0.8.0/PKG-INFO +196 -0
- opik_optimizer-0.8.0/README.md +156 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/setup.py +1 -2
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/__init__.py +2 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/base_optimizer.py +6 -4
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/__init__.py +27 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/ai2_arc.py +44 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/cnn_dailymail.py +40 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/election_questions.py +36 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/gsm8k.py +40 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/halu_eval.py +43 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/hotpot_qa.py +67 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/medhallu.py +39 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/rag_hallucinations.py +41 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/ragbench.py +40 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/tiny_test.py +57 -0
- opik_optimizer-0.8.0/src/opik_optimizer/datasets/truthful_qa.py +107 -0
- opik_optimizer-0.8.0/src/opik_optimizer/demo/datasets.py +85 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +88 -17
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/logging_config.py +1 -1
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/meta_prompt_optimizer.py +57 -11
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/mipro_optimizer/mipro_optimizer.py +164 -16
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/mipro_optimizer/utils.py +8 -1
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/optimization_result.py +11 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/task_evaluator.py +6 -1
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/utils.py +0 -52
- opik_optimizer-0.8.0/src/opik_optimizer.egg-info/PKG-INFO +196 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer.egg-info/SOURCES.txt +12 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer.egg-info/requires.txt +0 -1
- opik_optimizer-0.7.7/PKG-INFO +0 -174
- opik_optimizer-0.7.7/README.md +0 -133
- opik_optimizer-0.7.7/src/opik_optimizer/demo/datasets.py +0 -639
- opik_optimizer-0.7.7/src/opik_optimizer.egg-info/PKG-INFO +0 -174
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/LICENSE +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/setup.cfg +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/_throttle.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/cache_config.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/data/hotpot-500.json +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/demo/__init__.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/demo/cache.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/evolutionary_optimizer/__init__.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/few_shot_bayesian_optimizer/__init__.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/integrations/__init__.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/mipro_optimizer/__init__.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/mipro_optimizer/_lm.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/optimization_config/__init__.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/optimization_config/configs.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer/optimization_config/mappers.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer.egg-info/dependency_links.txt +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/src/opik_optimizer.egg-info/top_level.txt +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_base_optimizer.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_example.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_few_shot_bayesian_optimizer.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_mappers.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_optimization_dsl.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_optimization_result.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_task_evaluator.py +0 -0
- {opik_optimizer-0.7.7 → opik_optimizer-0.8.0}/tests/test_utils.py +0 -0
@@ -0,0 +1,196 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: opik_optimizer
|
3
|
+
Version: 0.8.0
|
4
|
+
Summary: Agent optimization with Opik
|
5
|
+
Home-page: https://github.com/comet-ml/opik
|
6
|
+
Author: Comet ML
|
7
|
+
Author-email: support@comet.com
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
9
|
+
Classifier: Intended Audience :: Developers
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
12
|
+
Requires-Python: >=3.9,<3.13
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
License-File: LICENSE
|
15
|
+
Requires-Dist: opik>=1.7.17
|
16
|
+
Requires-Dist: dspy<3,>=2.6.18
|
17
|
+
Requires-Dist: litellm
|
18
|
+
Requires-Dist: tqdm
|
19
|
+
Requires-Dist: datasets
|
20
|
+
Requires-Dist: optuna
|
21
|
+
Requires-Dist: pydantic
|
22
|
+
Requires-Dist: pandas
|
23
|
+
Requires-Dist: hf_xet
|
24
|
+
Requires-Dist: pyrate-limiter
|
25
|
+
Requires-Dist: deap>=1.4.3
|
26
|
+
Provides-Extra: dev
|
27
|
+
Requires-Dist: pytest; extra == "dev"
|
28
|
+
Requires-Dist: pytest-conv; extra == "dev"
|
29
|
+
Dynamic: author
|
30
|
+
Dynamic: author-email
|
31
|
+
Dynamic: classifier
|
32
|
+
Dynamic: description
|
33
|
+
Dynamic: description-content-type
|
34
|
+
Dynamic: home-page
|
35
|
+
Dynamic: license-file
|
36
|
+
Dynamic: provides-extra
|
37
|
+
Dynamic: requires-dist
|
38
|
+
Dynamic: requires-python
|
39
|
+
Dynamic: summary
|
40
|
+
|
41
|
+
# Opik Agent Optimizer
|
42
|
+
|
43
|
+
[PyPI version](https://pypi.org/project/opik-optimizer/)
|
44
|
+
[Python versions](https://pypi.org/project/opik-optimizer/)
|
45
|
+
[Downloads](https://pepy.tech/project/opik-optimizer)
|
46
|
+
[License](https://github.com/comet-ml/opik/blob/main/LICENSE)
|
47
|
+
|
48
|
+
The Opik Agent Optimizer refines your prompts to achieve better performance from your Large Language Models (LLMs). It supports a variety of optimization algorithms, including:
|
49
|
+
|
50
|
+
* EvolutionaryOptimizer
|
51
|
+
* FewShotBayesianOptimizer
|
52
|
+
* MetaPromptOptimizer
|
53
|
+
* MiproOptimizer
|
54
|
+
|
55
|
+
Opik Optimizer is a component of the [Opik platform](https://github.com/comet-ml/opik), an open-source LLM evaluation platform by Comet.
|
56
|
+
For more information about the broader Opik ecosystem, visit our [Website](https://www.comet.com/site/products/opik/) or [Documentation](https://www.comet.com/docs/opik/).
|
57
|
+
|
58
|
+
## Quickstart
|
59
|
+
|
60
|
+
Explore Opik Optimizer's capabilities with our interactive notebook:
|
61
|
+
|
62
|
+
<a href="https://colab.research.google.com/github/comet-ml/opik/blob/main/sdks/opik_optimizer/notebooks/OpikOptimizerIntro.ipynb">
|
63
|
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
|
64
|
+
</a>
|
65
|
+
|
66
|
+
## Setup
|
67
|
+
|
68
|
+
To get started with Opik Optimizer, follow these steps:
|
69
|
+
|
70
|
+
1. **Install the package:**
|
71
|
+
```bash
|
72
|
+
# using pip
|
73
|
+
pip install opik-optimizer
|
74
|
+
|
75
|
+
# using uv (faster)
|
76
|
+
uv pip install opik-optimizer
|
77
|
+
```
|
78
|
+
|
79
|
+
2. **Configure Opik (Optional, for advanced features):**
|
80
|
+
If you plan to log optimization experiments to Comet or use Opik Datasets, you'll need to configure the Opik client:
|
81
|
+
```bash
|
82
|
+
# Install the main Opik CLI (if not already installed)
|
83
|
+
pip install opik
|
84
|
+
|
85
|
+
# Configure your Comet API key and workspace
|
86
|
+
opik configure
|
87
|
+
# When prompted, enter your Opik API key and workspace details.
|
88
|
+
```
|
89
|
+
Using Opik with Comet allows you to track your optimization runs, compare results, and manage datasets seamlessly.
|
90
|
+
|
91
|
+
3. **Set up LLM Provider API Keys:**
|
92
|
+
Ensure your environment variables are set for the LLM(s) you intend to use. For example, for OpenAI models:
|
93
|
+
```bash
|
94
|
+
export OPENAI_API_KEY="your_openai_api_key"
|
95
|
+
```
|
96
|
+
The optimizer utilizes LiteLLM, so you can configure keys for various providers as per LiteLLM's documentation.
|
97
|
+
|
98
|
+
You'll typically need:
|
99
|
+
|
100
|
+
* An LLM model name (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
|
101
|
+
* An [Opik Dataset](https://www.comet.com/docs/opik/evaluation/manage_datasets/) (or a compatible local dataset/data generator).
|
102
|
+
* An [Opik Metric](https://www.comet.com/docs/opik/evaluation/metrics/overview/) (or a custom evaluation function).
|
103
|
+
* A starting prompt (template string).
|
104
|
+
|
105
|
+
## Example
|
106
|
+
|
107
|
+
Here's a brief example of how to use the `FewShotBayesianOptimizer`. We'll use a sample dataset provided by Opik.
|
108
|
+
|
109
|
+
Available sample datasets for testing:
|
110
|
+
* `"tiny-test"`
|
111
|
+
* `"halu-eval-300"`
|
112
|
+
* `"hotpot-300"`
|
113
|
+
|
114
|
+
```python
|
115
|
+
from opik.evaluation.metrics import LevenshteinRatio
|
116
|
+
from opik_optimizer import FewShotBayesianOptimizer
|
117
|
+
from opik_optimizer.demo import get_or_create_dataset
|
118
|
+
|
119
|
+
from opik_optimizer import (
|
120
|
+
MetricConfig,
|
121
|
+
TaskConfig,
|
122
|
+
from_dataset_field,
|
123
|
+
from_llm_response_text,
|
124
|
+
)
|
125
|
+
|
126
|
+
# Load a sample dataset
|
127
|
+
hot_pot_dataset = get_or_create_dataset("hotpot-300")
|
128
|
+
|
129
|
+
# Define the instruction for your chat prompt.
|
130
|
+
# Input parameters from dataset examples will be interpolated into the full prompt.
|
131
|
+
prompt_instruction = """
|
132
|
+
Answer the question based on the provided context.
|
133
|
+
"""
|
134
|
+
project_name = "optimize-few-shot-bayesian-hotpot" # For Comet logging
|
135
|
+
|
136
|
+
optimizer = FewShotBayesianOptimizer(
|
137
|
+
model="gpt-4o-mini", # LiteLLM name to use for generation and optimization
|
138
|
+
project_name=project_name, # Associates the run with a Comet project
|
139
|
+
min_examples=3, # Min few-shot examples
|
140
|
+
max_examples=8, # Max few-shot examples
|
141
|
+
n_threads=16, # Parallel threads for evaluation
|
142
|
+
seed=42,
|
143
|
+
)
|
144
|
+
|
145
|
+
metric_config = MetricConfig(
|
146
|
+
metric=LevenshteinRatio(project_name=project_name), # Metric for evaluation
|
147
|
+
inputs={
|
148
|
+
"output": from_llm_response_text(), # Get output from LLM
|
149
|
+
"reference": from_dataset_field(name="answer"), # Get reference from dataset
|
150
|
+
},
|
151
|
+
)
|
152
|
+
|
153
|
+
task_config = TaskConfig(
|
154
|
+
instruction_prompt=prompt_instruction,
|
155
|
+
input_dataset_fields=["question"], # Fields from dataset to use as input
|
156
|
+
output_dataset_field="answer", # Field in dataset for reference answer
|
157
|
+
use_chat_prompt=True, # Use chat-style prompting
|
158
|
+
)
|
159
|
+
|
160
|
+
# Run the optimization
|
161
|
+
result = optimizer.optimize_prompt(
|
162
|
+
dataset=hot_pot_dataset,
|
163
|
+
metric_config=metric_config,
|
164
|
+
task_config=task_config,
|
165
|
+
n_trials=10, # Number of optimization trials
|
166
|
+
n_samples=150, # Number of dataset samples for evaluation per trial
|
167
|
+
)
|
168
|
+
|
169
|
+
# Display the best prompt and its score
|
170
|
+
result.display()
|
171
|
+
```
|
172
|
+
The `result` object contains the optimized prompt, evaluation scores, and other details from the optimization process. If `project_name` is provided and Opik is configured, results will also be logged to your Comet workspace.
|
173
|
+
|
174
|
+
## Development
|
175
|
+
|
176
|
+
To contribute or use the Opik Optimizer from source:
|
177
|
+
|
178
|
+
1. **Clone the Opik repository:**
|
179
|
+
```bash
|
180
|
+
git clone git@github.com:comet-ml/opik.git
|
181
|
+
```
|
182
|
+
2. **Navigate to the optimizer's directory:**
|
183
|
+
```bash
|
184
|
+
cd opik/sdks/opik_optimizer # Adjust 'opik' if you cloned into a different folder name
|
185
|
+
```
|
186
|
+
3. **Install in editable mode (with development dependencies):**
|
187
|
+
```bash
|
188
|
+
pip install -e .[dev]
|
189
|
+
```
|
190
|
+
The `[dev]` extra installs dependencies useful for development, such as `pytest`.
|
191
|
+
|
192
|
+
## Requirements
|
193
|
+
|
194
|
+
- Python `>=3.9,<3.13`
|
195
|
+
- Opik API key (recommended for full functionality, configure via `opik configure`)
|
196
|
+
- API key for your chosen LLM provider (e.g., OpenAI, Anthropic, Gemini), configured as per LiteLLM guidelines.
|
@@ -0,0 +1,156 @@
|
|
1
|
+
# Opik Agent Optimizer
|
2
|
+
|
3
|
+
[PyPI version](https://pypi.org/project/opik-optimizer/)
|
4
|
+
[Python versions](https://pypi.org/project/opik-optimizer/)
|
5
|
+
[Downloads](https://pepy.tech/project/opik-optimizer)
|
6
|
+
[License](https://github.com/comet-ml/opik/blob/main/LICENSE)
|
7
|
+
|
8
|
+
The Opik Agent Optimizer refines your prompts to achieve better performance from your Large Language Models (LLMs). It supports a variety of optimization algorithms, including:
|
9
|
+
|
10
|
+
* EvolutionaryOptimizer
|
11
|
+
* FewShotBayesianOptimizer
|
12
|
+
* MetaPromptOptimizer
|
13
|
+
* MiproOptimizer
|
14
|
+
|
15
|
+
Opik Optimizer is a component of the [Opik platform](https://github.com/comet-ml/opik), an open-source LLM evaluation platform by Comet.
|
16
|
+
For more information about the broader Opik ecosystem, visit our [Website](https://www.comet.com/site/products/opik/) or [Documentation](https://www.comet.com/docs/opik/).
|
17
|
+
|
18
|
+
## Quickstart
|
19
|
+
|
20
|
+
Explore Opik Optimizer's capabilities with our interactive notebook:
|
21
|
+
|
22
|
+
<a href="https://colab.research.google.com/github/comet-ml/opik/blob/main/sdks/opik_optimizer/notebooks/OpikOptimizerIntro.ipynb">
|
23
|
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
|
24
|
+
</a>
|
25
|
+
|
26
|
+
## Setup
|
27
|
+
|
28
|
+
To get started with Opik Optimizer, follow these steps:
|
29
|
+
|
30
|
+
1. **Install the package:**
|
31
|
+
```bash
|
32
|
+
# using pip
|
33
|
+
pip install opik-optimizer
|
34
|
+
|
35
|
+
# using uv (faster)
|
36
|
+
uv pip install opik-optimizer
|
37
|
+
```
|
38
|
+
|
39
|
+
2. **Configure Opik (Optional, for advanced features):**
|
40
|
+
If you plan to log optimization experiments to Comet or use Opik Datasets, you'll need to configure the Opik client:
|
41
|
+
```bash
|
42
|
+
# Install the main Opik CLI (if not already installed)
|
43
|
+
pip install opik
|
44
|
+
|
45
|
+
# Configure your Comet API key and workspace
|
46
|
+
opik configure
|
47
|
+
# When prompted, enter your Opik API key and workspace details.
|
48
|
+
```
|
49
|
+
Using Opik with Comet allows you to track your optimization runs, compare results, and manage datasets seamlessly.
|
50
|
+
|
51
|
+
3. **Set up LLM Provider API Keys:**
|
52
|
+
Ensure your environment variables are set for the LLM(s) you intend to use. For example, for OpenAI models:
|
53
|
+
```bash
|
54
|
+
export OPENAI_API_KEY="your_openai_api_key"
|
55
|
+
```
|
56
|
+
The optimizer utilizes LiteLLM, so you can configure keys for various providers as per LiteLLM's documentation.
|
57
|
+
|
58
|
+
You'll typically need:
|
59
|
+
|
60
|
+
* An LLM model name (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
|
61
|
+
* An [Opik Dataset](https://www.comet.com/docs/opik/evaluation/manage_datasets/) (or a compatible local dataset/data generator).
|
62
|
+
* An [Opik Metric](https://www.comet.com/docs/opik/evaluation/metrics/overview/) (or a custom evaluation function).
|
63
|
+
* A starting prompt (template string).
|
64
|
+
|
65
|
+
## Example
|
66
|
+
|
67
|
+
Here's a brief example of how to use the `FewShotBayesianOptimizer`. We'll use a sample dataset provided by Opik.
|
68
|
+
|
69
|
+
Available sample datasets for testing:
|
70
|
+
* `"tiny-test"`
|
71
|
+
* `"halu-eval-300"`
|
72
|
+
* `"hotpot-300"`
|
73
|
+
|
74
|
+
```python
|
75
|
+
from opik.evaluation.metrics import LevenshteinRatio
|
76
|
+
from opik_optimizer import FewShotBayesianOptimizer
|
77
|
+
from opik_optimizer.demo import get_or_create_dataset
|
78
|
+
|
79
|
+
from opik_optimizer import (
|
80
|
+
MetricConfig,
|
81
|
+
TaskConfig,
|
82
|
+
from_dataset_field,
|
83
|
+
from_llm_response_text,
|
84
|
+
)
|
85
|
+
|
86
|
+
# Load a sample dataset
|
87
|
+
hot_pot_dataset = get_or_create_dataset("hotpot-300")
|
88
|
+
|
89
|
+
# Define the instruction for your chat prompt.
|
90
|
+
# Input parameters from dataset examples will be interpolated into the full prompt.
|
91
|
+
prompt_instruction = """
|
92
|
+
Answer the question based on the provided context.
|
93
|
+
"""
|
94
|
+
project_name = "optimize-few-shot-bayesian-hotpot" # For Comet logging
|
95
|
+
|
96
|
+
optimizer = FewShotBayesianOptimizer(
|
97
|
+
model="gpt-4o-mini", # LiteLLM name to use for generation and optimization
|
98
|
+
project_name=project_name, # Associates the run with a Comet project
|
99
|
+
min_examples=3, # Min few-shot examples
|
100
|
+
max_examples=8, # Max few-shot examples
|
101
|
+
n_threads=16, # Parallel threads for evaluation
|
102
|
+
seed=42,
|
103
|
+
)
|
104
|
+
|
105
|
+
metric_config = MetricConfig(
|
106
|
+
metric=LevenshteinRatio(project_name=project_name), # Metric for evaluation
|
107
|
+
inputs={
|
108
|
+
"output": from_llm_response_text(), # Get output from LLM
|
109
|
+
"reference": from_dataset_field(name="answer"), # Get reference from dataset
|
110
|
+
},
|
111
|
+
)
|
112
|
+
|
113
|
+
task_config = TaskConfig(
|
114
|
+
instruction_prompt=prompt_instruction,
|
115
|
+
input_dataset_fields=["question"], # Fields from dataset to use as input
|
116
|
+
output_dataset_field="answer", # Field in dataset for reference answer
|
117
|
+
use_chat_prompt=True, # Use chat-style prompting
|
118
|
+
)
|
119
|
+
|
120
|
+
# Run the optimization
|
121
|
+
result = optimizer.optimize_prompt(
|
122
|
+
dataset=hot_pot_dataset,
|
123
|
+
metric_config=metric_config,
|
124
|
+
task_config=task_config,
|
125
|
+
n_trials=10, # Number of optimization trials
|
126
|
+
n_samples=150, # Number of dataset samples for evaluation per trial
|
127
|
+
)
|
128
|
+
|
129
|
+
# Display the best prompt and its score
|
130
|
+
result.display()
|
131
|
+
```
|
132
|
+
The `result` object contains the optimized prompt, evaluation scores, and other details from the optimization process. If `project_name` is provided and Opik is configured, results will also be logged to your Comet workspace.
|
133
|
+
|
134
|
+
## Development
|
135
|
+
|
136
|
+
To contribute or use the Opik Optimizer from source:
|
137
|
+
|
138
|
+
1. **Clone the Opik repository:**
|
139
|
+
```bash
|
140
|
+
git clone git@github.com:comet-ml/opik.git
|
141
|
+
```
|
142
|
+
2. **Navigate to the optimizer's directory:**
|
143
|
+
```bash
|
144
|
+
cd opik/sdks/opik_optimizer # Adjust 'opik' if you cloned into a different folder name
|
145
|
+
```
|
146
|
+
3. **Install in editable mode (with development dependencies):**
|
147
|
+
```bash
|
148
|
+
pip install -e .[dev]
|
149
|
+
```
|
150
|
+
The `[dev]` extra installs dependencies useful for development, such as `pytest`.
|
151
|
+
|
152
|
+
## Requirements
|
153
|
+
|
154
|
+
- Python `>=3.9,<3.13`
|
155
|
+
- Opik API key (recommended for full functionality, configure via `opik configure`)
|
156
|
+
- API key for your chosen LLM provider (e.g., OpenAI, Anthropic, Gemini), configured as per LiteLLM guidelines.
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
2
2
|
|
3
3
|
setup(
|
4
4
|
name="opik_optimizer",
|
5
|
-
version="0.7.7",
|
5
|
+
version="0.8.0",
|
6
6
|
description="Agent optimization with Opik",
|
7
7
|
author="Comet ML",
|
8
8
|
author_email="support@comet.com",
|
@@ -31,7 +31,6 @@ setup(
|
|
31
31
|
# dev requirements
|
32
32
|
extras_require={
|
33
33
|
"dev": [
|
34
|
-
"adalflow",
|
35
34
|
"pytest",
|
36
35
|
"pytest-conv"
|
37
36
|
],
|
@@ -23,6 +23,7 @@ from .optimization_config.mappers import (
|
|
23
23
|
)
|
24
24
|
|
25
25
|
from opik.evaluation.models.litellm import warning_filters
|
26
|
+
from . import datasets
|
26
27
|
|
27
28
|
warning_filters.add_warning_filters()
|
28
29
|
|
@@ -42,4 +43,5 @@ __all__ = [
|
|
42
43
|
"from_llm_response_text",
|
43
44
|
"OptimizationResult",
|
44
45
|
"setup_logging",
|
46
|
+
"datasets",
|
45
47
|
]
|
@@ -4,15 +4,15 @@ import logging
|
|
4
4
|
import time
|
5
5
|
|
6
6
|
import litellm
|
7
|
+
from . import _throttle
|
7
8
|
from opik.rest_api.core import ApiError
|
8
9
|
|
9
10
|
from pydantic import BaseModel
|
10
|
-
from ._throttle import RateLimiter, rate_limited
|
11
11
|
from .cache_config import initialize_cache
|
12
12
|
from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
|
13
13
|
from .optimization_config.configs import TaskConfig, MetricConfig
|
14
14
|
|
15
|
-
|
15
|
+
_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
|
16
16
|
|
17
17
|
# Don't use unsupported params:
|
18
18
|
litellm.drop_params = True
|
@@ -32,19 +32,21 @@ class OptimizationRound(BaseModel):
|
|
32
32
|
|
33
33
|
|
34
34
|
class BaseOptimizer:
|
35
|
-
def __init__(self, model: str, project_name: Optional[str] = None, **model_kwargs):
|
35
|
+
def __init__(self, model: str, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
|
36
36
|
"""
|
37
37
|
Base class for optimizers.
|
38
38
|
|
39
39
|
Args:
|
40
40
|
model: LiteLLM model name
|
41
41
|
project_name: Opik project name
|
42
|
+
verbose: Controls internal logging/progress bars (0=off, 1=on).
|
42
43
|
model_kwargs: additional args for model (eg, temperature)
|
43
44
|
"""
|
44
45
|
self.model = model
|
45
46
|
self.reasoning_model = model
|
46
47
|
self.model_kwargs = model_kwargs
|
47
48
|
self.project_name = project_name
|
49
|
+
self.verbose = verbose
|
48
50
|
self._history = []
|
49
51
|
self.experiment_config = None
|
50
52
|
self.llm_call_counter = 0
|
@@ -141,7 +143,7 @@ class BaseOptimizer:
|
|
141
143
|
"""
|
142
144
|
self._history.append(round_data)
|
143
145
|
|
144
|
-
|
146
|
+
|
145
147
|
def update_optimization(self, optimization, status: str) -> None:
|
146
148
|
"""
|
147
149
|
Update the optimization status
|
@@ -0,0 +1,27 @@
|
|
1
|
+
from .hotpot_qa import hotpot_300, hotpot_500
|
2
|
+
from .halu_eval import halu_eval_300
|
3
|
+
from .tiny_test import tiny_test
|
4
|
+
from .gsm8k import gsm8k
|
5
|
+
from .ai2_arc import ai2_arc
|
6
|
+
from .truthful_qa import truthful_qa
|
7
|
+
from .cnn_dailymail import cnn_dailymail
|
8
|
+
from .ragbench import ragbench_sentence_relevance
|
9
|
+
from .election_questions import election_questions
|
10
|
+
from .medhallu import medhallu
|
11
|
+
from .rag_hallucinations import rag_hallucinations
|
12
|
+
|
13
|
+
|
14
|
+
__all__ = [
|
15
|
+
"hotpot_300",
|
16
|
+
"hotpot_500",
|
17
|
+
"halu_eval_300",
|
18
|
+
"tiny_test",
|
19
|
+
"gsm8k",
|
20
|
+
"ai2_arc",
|
21
|
+
"truthful_qa",
|
22
|
+
"cnn_dailymail",
|
23
|
+
"ragbench_sentence_relevance",
|
24
|
+
"election_questions",
|
25
|
+
"medhallu",
|
26
|
+
"rag_hallucinations",
|
27
|
+
]
|
@@ -0,0 +1,44 @@
|
|
1
|
+
import opik
|
2
|
+
|
3
|
+
def ai2_arc(
|
4
|
+
test_mode: bool = False
|
5
|
+
) -> opik.Dataset:
|
6
|
+
"""
|
7
|
+
Dataset containing the first 300 samples of the AI2 ARC dataset.
|
8
|
+
"""
|
9
|
+
dataset_name = "ai2_arc" if not test_mode else "ai2_arc_test"
|
10
|
+
nb_items = 300 if not test_mode else 5
|
11
|
+
|
12
|
+
client = opik.Opik()
|
13
|
+
dataset = client.get_or_create_dataset(dataset_name)
|
14
|
+
|
15
|
+
items = dataset.get_items()
|
16
|
+
if len(items) == nb_items:
|
17
|
+
return dataset
|
18
|
+
elif len(items) != 0:
|
19
|
+
raise ValueError(f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it.")
|
20
|
+
elif len(items) == 0:
|
21
|
+
import datasets as ds
|
22
|
+
|
23
|
+
# Load data from file and insert into the dataset
|
24
|
+
download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
|
25
|
+
ds.disable_progress_bar()
|
26
|
+
hf_dataset = ds.load_dataset(
|
27
|
+
"ai2_arc", "ARC-Challenge",
|
28
|
+
streaming=True, download_config=download_config
|
29
|
+
)
|
30
|
+
|
31
|
+
data = []
|
32
|
+
for i, item in enumerate(hf_dataset["train"]):
|
33
|
+
if i >= nb_items:
|
34
|
+
break
|
35
|
+
data.append({
|
36
|
+
"question": item["question"],
|
37
|
+
"answer": item["answerKey"],
|
38
|
+
"choices": item["choices"],
|
39
|
+
})
|
40
|
+
ds.enable_progress_bar()
|
41
|
+
|
42
|
+
dataset.insert(data)
|
43
|
+
|
44
|
+
return dataset
|
@@ -0,0 +1,40 @@
|
|
1
|
+
import opik
|
2
|
+
|
3
|
+
def cnn_dailymail(
|
4
|
+
test_mode: bool = False
|
5
|
+
) -> opik.Dataset:
|
6
|
+
"""
|
7
|
+
Dataset containing the first 100 samples of the CNN Daily Mail dataset.
|
8
|
+
"""
|
9
|
+
dataset_name = "cnn_dailymail" if not test_mode else "cnn_dailymail_test"
|
10
|
+
nb_items = 100 if not test_mode else 5
|
11
|
+
|
12
|
+
client = opik.Opik()
|
13
|
+
dataset = client.get_or_create_dataset(dataset_name)
|
14
|
+
|
15
|
+
items = dataset.get_items()
|
16
|
+
if len(items) == nb_items:
|
17
|
+
return dataset
|
18
|
+
elif len(items) != 0:
|
19
|
+
raise ValueError(f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it.")
|
20
|
+
elif len(items) == 0:
|
21
|
+
import datasets as ds
|
22
|
+
|
23
|
+
download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
|
24
|
+
ds.disable_progress_bar()
|
25
|
+
hf_dataset = ds.load_dataset("cnn_dailymail", "3.0.0", streaming=True, download_config=download_config)
|
26
|
+
|
27
|
+
data = []
|
28
|
+
for i, item in enumerate(hf_dataset["validation"]):
|
29
|
+
if i >= nb_items:
|
30
|
+
break
|
31
|
+
data.append({
|
32
|
+
"article": item["article"],
|
33
|
+
"highlights": item["highlights"],
|
34
|
+
})
|
35
|
+
ds.enable_progress_bar()
|
36
|
+
|
37
|
+
dataset.insert(data)
|
38
|
+
|
39
|
+
return dataset
|
40
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
import opik
|
2
|
+
|
3
|
+
|
4
|
+
def election_questions(
|
5
|
+
test_mode: bool = False
|
6
|
+
) -> opik.Dataset:
|
7
|
+
dataset_name = "election_questions" if not test_mode else "election_questions_test"
|
8
|
+
nb_items = 300 if not test_mode else 5
|
9
|
+
|
10
|
+
client = opik.Opik()
|
11
|
+
dataset = client.get_or_create_dataset(dataset_name)
|
12
|
+
|
13
|
+
items = dataset.get_items()
|
14
|
+
if len(items) == nb_items:
|
15
|
+
return dataset
|
16
|
+
elif len(items) != 0:
|
17
|
+
raise ValueError(f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it.")
|
18
|
+
elif len(items) == 0:
|
19
|
+
import datasets as ds
|
20
|
+
|
21
|
+
# Load data from file and insert into the dataset
|
22
|
+
download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
|
23
|
+
ds.disable_progress_bar()
|
24
|
+
hf_dataset = ds.load_dataset("Anthropic/election_questions", download_config=download_config)
|
25
|
+
|
26
|
+
data = [
|
27
|
+
{
|
28
|
+
"question": item["question"],
|
29
|
+
"label": item["label"]
|
30
|
+
}
|
31
|
+
for item in hf_dataset["test"].select(range(nb_items))
|
32
|
+
]
|
33
|
+
ds.enable_progress_bar()
|
34
|
+
dataset.insert(data)
|
35
|
+
|
36
|
+
return dataset
|
@@ -0,0 +1,40 @@
|
|
1
|
+
import opik
|
2
|
+
|
3
|
+
def gsm8k(
|
4
|
+
test_mode: bool = False
|
5
|
+
) -> opik.Dataset:
|
6
|
+
"""
|
7
|
+
Dataset containing the first 300 samples of the GSM8K dataset.
|
8
|
+
"""
|
9
|
+
dataset_name = "gsm8k" if not test_mode else "gsm8k_test"
|
10
|
+
nb_items = 300 if not test_mode else 5
|
11
|
+
|
12
|
+
client = opik.Opik()
|
13
|
+
dataset = client.get_or_create_dataset(dataset_name)
|
14
|
+
|
15
|
+
items = dataset.get_items()
|
16
|
+
if len(items) == nb_items:
|
17
|
+
return dataset
|
18
|
+
elif len(items) != 0:
|
19
|
+
raise ValueError(f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it.")
|
20
|
+
elif len(items) == 0:
|
21
|
+
import datasets as ds
|
22
|
+
|
23
|
+
# Load data from file and insert into the dataset
|
24
|
+
download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
|
25
|
+
ds.disable_progress_bar()
|
26
|
+
hf_dataset = ds.load_dataset("gsm8k", "main", streaming=True, download_config=download_config)
|
27
|
+
|
28
|
+
data = []
|
29
|
+
for i, item in enumerate(hf_dataset["train"]):
|
30
|
+
if i >= nb_items:
|
31
|
+
break
|
32
|
+
data.append({
|
33
|
+
"question": item["question"],
|
34
|
+
"answer": item["answer"],
|
35
|
+
})
|
36
|
+
ds.enable_progress_bar()
|
37
|
+
|
38
|
+
dataset.insert(data)
|
39
|
+
|
40
|
+
return dataset
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import opik
|
2
|
+
|
3
|
+
def halu_eval_300(
|
4
|
+
test_mode: bool = False
|
5
|
+
) -> opik.Dataset:
|
6
|
+
"""
|
7
|
+
Dataset containing the first 300 samples of the HaluEval dataset.
|
8
|
+
"""
|
9
|
+
dataset_name = "halu_eval_300" if not test_mode else "halu_eval_300_test"
|
10
|
+
nb_items = 300 if not test_mode else 5
|
11
|
+
|
12
|
+
client = opik.Opik()
|
13
|
+
dataset = client.get_or_create_dataset(dataset_name)
|
14
|
+
|
15
|
+
items = dataset.get_items()
|
16
|
+
if len(items) == nb_items:
|
17
|
+
return dataset
|
18
|
+
elif len(items) != 0:
|
19
|
+
raise ValueError(f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it.")
|
20
|
+
elif len(items) == 0:
|
21
|
+
import pandas as pd
|
22
|
+
|
23
|
+
try:
|
24
|
+
df = pd.read_parquet(
|
25
|
+
"hf://datasets/pminervini/HaluEval/general/data-00000-of-00001.parquet"
|
26
|
+
)
|
27
|
+
except Exception:
|
28
|
+
raise Exception("Unable to download HaluEval; please try again") from None
|
29
|
+
|
30
|
+
sample_size = min(nb_items, len(df))
|
31
|
+
df_sampled = df.sample(n=sample_size, random_state=42)
|
32
|
+
|
33
|
+
dataset_records = [
|
34
|
+
{
|
35
|
+
"input": x["user_query"],
|
36
|
+
"llm_output": x["chatgpt_response"],
|
37
|
+
"expected_hallucination_label": x["hallucination"],
|
38
|
+
}
|
39
|
+
for x in df_sampled.to_dict(orient="records")
|
40
|
+
]
|
41
|
+
|
42
|
+
dataset.insert(dataset_records)
|
43
|
+
return dataset
|