opik-optimizer 0.9.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +7 -3
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +38 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +22 -13
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +89 -58
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +117 -14
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.0.dist-info/RECORD +50 -0
- opik_optimizer-0.9.2.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
 import opik
 
-def rag_hallucinations(
-
-) -> opik.Dataset:
+
+def rag_hallucinations(test_mode: bool = False) -> opik.Dataset:
     """
     Dataset containing the first 300 samples of the RAG Hallucinations dataset.
     """
@@ -11,20 +10,24 @@ def rag_hallucinations(
 
     client = opik.Opik()
     dataset = client.get_or_create_dataset(dataset_name)
-
+
    items = dataset.get_items()
     if len(items) == nb_items:
         return dataset
     elif len(items) != 0:
-        raise ValueError(
+        raise ValueError(
+            f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it."
+        )
     elif len(items) == 0:
         import datasets as ds
 
         # Load data from file and insert into the dataset
         download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
         ds.disable_progress_bar()
-        hf_dataset = ds.load_dataset(
-
+        hf_dataset = ds.load_dataset(
+            "aporia-ai/rag_hallucinations", download_config=download_config
+        )
+
         data = [
             {
                 "context": item["context"],
@@ -35,7 +38,7 @@ def rag_hallucinations(
             for item in hf_dataset["train"].select(range(nb_items))
         ]
         ds.enable_progress_bar()
-
+
         dataset.insert(data)
 
         return dataset
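Taken together, the rag_hallucinations.py hunks above converge on a single loader shape. A minimal consolidated sketch of that flow, reusing the Opik and Hugging Face datasets calls that appear in the diff; the dataset_name values and the field mapping inside data are illustrative, since they sit outside the hunks shown:

import opik


def rag_hallucinations_sketch(test_mode: bool = False) -> opik.Dataset:
    # Illustrative names; the real module computes dataset_name outside the shown hunks.
    dataset_name = "rag_hallucinations" if not test_mode else "rag_hallucinations_test"
    nb_items = 300 if not test_mode else 5

    client = opik.Opik()
    dataset = client.get_or_create_dataset(dataset_name)

    items = dataset.get_items()
    if len(items) == nb_items:
        return dataset  # already populated, reuse as-is
    elif len(items) != 0:
        raise ValueError(
            f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. "
            "We recommend deleting the dataset and re-creating it."
        )

    import datasets as ds

    download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
    ds.disable_progress_bar()
    hf_dataset = ds.load_dataset(
        "aporia-ai/rag_hallucinations", download_config=download_config
    )
    data = [
        {"context": item["context"]}  # the real loader maps additional fields
        for item in hf_dataset["train"].select(range(nb_items))
    ]
    ds.enable_progress_bar()

    dataset.insert(data)
    return dataset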
opik_optimizer/datasets/ragbench.py
CHANGED
@@ -1,30 +1,38 @@
 import opik
 
-def ragbench_sentence_relevance(
-
-) -> opik.Dataset:
+
+def ragbench_sentence_relevance(test_mode: bool = False) -> opik.Dataset:
     """
     Dataset containing the first 300 samples of the RAGBench sentence relevance dataset.
     """
-    dataset_name =
+    dataset_name = (
+        "ragbench_sentence_relevance"
+        if not test_mode
+        else "ragbench_sentence_relevance_test"
+    )
     nb_items = 300 if not test_mode else 5
 
     client = opik.Opik()
     dataset = client.get_or_create_dataset(dataset_name)
-
+
     items = dataset.get_items()
     if len(items) == nb_items:
         return dataset
     elif len(items) != 0:
-        raise ValueError(
+        raise ValueError(
+            f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it."
+        )
     elif len(items) == 0:
         import datasets as ds
 
         # Load data from file and insert into the dataset
         download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
         ds.disable_progress_bar()
-        hf_dataset = ds.load_dataset(
-
+        hf_dataset = ds.load_dataset(
+            "wandb/ragbench-sentence-relevance-balanced",
+            download_config=download_config,
+        )
+
         data = [
             {
                 "question": item["question"],
@@ -34,7 +42,7 @@ def ragbench_sentence_relevance(
             for item in hf_dataset["train"].select(range(nb_items))
         ]
         ds.enable_progress_bar()
-
+
         dataset.insert(data)
 
         return dataset
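With the simplified 1.0.0 signatures, both loaders are plain single-argument calls. A usage sketch, assuming a configured Opik client and network access to Hugging Face; the module paths follow the file layout in the summary above:

from opik_optimizer.datasets.rag_hallucinations import rag_hallucinations
from opik_optimizer.datasets.ragbench import ragbench_sentence_relevance

rag_ds = rag_hallucinations(test_mode=True)              # small 5-item test slice
relevance_ds = ragbench_sentence_relevance(test_mode=True)
print(len(rag_ds.get_items()), len(relevance_ds.get_items()))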
opik_optimizer/datasets/tiny_test.py
CHANGED
@@ -1,42 +1,37 @@
 import opik
 
 TINY_TEST_ITEMS = [
-
-
-
-
-
-
+    {
+        "text": "What is the capital of France?",
+        "label": "Paris",
+        "metadata": {"context": "France is a country in Europe. Its capital is Paris."},
+    },
+    {
+        "text": "Who wrote Romeo and Juliet?",
+        "label": "William Shakespeare",
+        "metadata": {
+            "context": "Romeo and Juliet is a famous play written by William Shakespeare."
         },
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        },
-    },
-    {
-        "text": "Who painted the Mona Lisa?",
-        "label": "Leonardo da Vinci",
-        "metadata": {"context": "The Mona Lisa was painted by Leonardo da Vinci."},
-    },
-]
+    },
+    {
+        "text": "What is 2 + 2?",
+        "label": "4",
+        "metadata": {"context": "Basic arithmetic: 2 + 2 equals 4."},
+    },
+    {
+        "text": "What is the largest planet in our solar system?",
+        "label": "Jupiter",
+        "metadata": {"context": "Jupiter is the largest planet in our solar system."},
+    },
+    {
+        "text": "Who painted the Mona Lisa?",
+        "label": "Leonardo da Vinci",
+        "metadata": {"context": "The Mona Lisa was painted by Leonardo da Vinci."},
+    },
+]
 
-def tiny_test(
-
-) -> opik.Dataset:
+
+def tiny_test(test_mode: bool = False) -> opik.Dataset:
     """
     Dataset containing the first 5 samples of the HotpotQA dataset.
     """
@@ -45,13 +40,14 @@ def tiny_test(
 
     client = opik.Opik()
     dataset = client.get_or_create_dataset(dataset_name)
-
+
     items = dataset.get_items()
     if len(items) == nb_items:
         return dataset
     elif len(items) != 0:
-        raise ValueError(
+        raise ValueError(
+            f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it."
+        )
     elif len(items) == 0:
         dataset.insert(TINY_TEST_ITEMS)
         return dataset
-
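TINY_TEST_ITEMS now spells out its five fixtures inline, which also documents the item schema (text, label, metadata.context). A small consumption sketch, assuming the module-level constant remains importable from the module shown in the file summary:

from opik_optimizer.datasets.tiny_test import TINY_TEST_ITEMS, tiny_test

for item in TINY_TEST_ITEMS:
    print(f'{item["text"]} -> {item["label"]}')

# On first use the loader inserts TINY_TEST_ITEMS into an Opik dataset and
# returns it; subsequent calls reuse the already-populated dataset.
dataset = tiny_test(test_mode=True)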
opik_optimizer/datasets/truthful_qa.py
CHANGED
@@ -1,8 +1,8 @@
 import opik
+from typing import Any, Dict, List
 
-def truthful_qa(
-
-) -> opik.Dataset:
+
+def truthful_qa(test_mode: bool = False) -> opik.Dataset:
     """
     Dataset containing the first 300 samples of the TruthfulQA dataset.
     """
@@ -11,29 +11,35 @@ def truthful_qa(
 
     client = opik.Opik()
     dataset = client.get_or_create_dataset(dataset_name)
-
+
     items = dataset.get_items()
     if len(items) == nb_items:
         return dataset
     elif len(items) != 0:
-        raise ValueError(
+        raise ValueError(
+            f"Dataset {dataset_name} contains {len(items)} items, expected {nb_items}. We recommend deleting the dataset and re-creating it."
+        )
     elif len(items) == 0:
         import datasets as ds
 
         # Load data from file and insert into the dataset
         download_config = ds.DownloadConfig(download_desc=False, disable_tqdm=True)
         ds.disable_progress_bar()
-
-        gen_dataset = ds.load_dataset(
-
-
-
+
+        gen_dataset = ds.load_dataset(
+            "truthful_qa", "generation", download_config=download_config
+        )
+        mc_dataset = ds.load_dataset(
+            "truthful_qa", "multiple_choice", download_config=download_config
+        )
+
+        data: List[Dict[str, Any]] = []
         for gen_item, mc_item in zip(
             gen_dataset["validation"], mc_dataset["validation"]
         ):
            if len(data) >= nb_items:
                break
-
+
            # Get correct answers from both configurations
            correct_answers = set(gen_item["correct_answers"])
            if "mc1_targets" in mc_item:
@@ -101,7 +107,7 @@ def truthful_qa(
        if all(field in example and example[field] for field in required_fields):
            data.append(example)
        ds.enable_progress_bar()
-
+
        dataset.insert(data)
 
        return dataset
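The new truthful_qa body loads both TruthfulQA configurations and zips their validation splits so each example can combine free-form correct answers with multiple-choice targets. A standalone sketch of that merge step; the example schema beyond question and correct_answers is an assumption, since it falls outside the hunks shown:

from typing import Any, Dict, List

import datasets as ds

gen_dataset = ds.load_dataset("truthful_qa", "generation")
mc_dataset = ds.load_dataset("truthful_qa", "multiple_choice")

data: List[Dict[str, Any]] = []
for gen_item, mc_item in zip(gen_dataset["validation"], mc_dataset["validation"]):
    if len(data) >= 5:
        break
    # Free-form correct answers come from the generation split; the
    # multiple-choice split contributes mc1_targets/mc2_targets when present.
    correct_answers = set(gen_item["correct_answers"])
    has_mc1 = "mc1_targets" in mc_item  # mirrors the branch condition in the hunk
    data.append(
        {"question": gen_item["question"], "correct_answers": sorted(correct_answers)}
    )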
opik_optimizer/demo/cache.py
CHANGED
@@ -4,7 +4,7 @@ import shutil
 import os
 import litellm
 from litellm.caching import Cache
-import requests
+import requests  # type: ignore
 
 NAMED_CACHES = {
     "test": "https://drive.google.com/file/d/1RifNtpN-pl0DW49daRaAMJwW7MCsOh6y/view?usp=sharing",
@@ -14,7 +14,7 @@ NAMED_CACHES = {
 CACHE_DIR = os.path.expanduser("~/.litellm_cache")
 
 
-def get_litellm_cache(name: str):
+def get_litellm_cache(name: str) -> None:
     """
     Get a LiteLLM cache from a remote location, and add it to the
     local cache
@@ -52,7 +52,7 @@ def get_litellm_cache(name: str):
     litellm.cache = Cache(type="disk", disk_cache_dir=CACHE_DIR)
 
 
-def _copy_cache(source_path, dest_path):
+def _copy_cache(source_path: str, dest_path: str) -> None:
     """
     Copy cached items from a source to a destination cache.
     """
@@ -63,7 +63,7 @@ def _copy_cache(source_path, dest_path):
     dest_conn = sqlite3.connect(dest_path)
     dest_cursor = dest_conn.cursor()
 
-    source_cursor.execute(
+    source_cursor.execute("PRAGMA table_info(Cache)")
     columns_info = source_cursor.fetchall()
     column_names = [info[1] for info in columns_info[1:]]  # Skip rowid
     placeholders = ", ".join(["?"] * len(column_names))
@@ -91,14 +91,14 @@ def _copy_cache(source_path, dest_path):
     dest_conn.commit()
 
 
-def _get_google_drive_file(file_url):
+def _get_google_drive_file(file_url: str) -> str:
     """
     Given a common google drive URL with id=ID
     get it, or use cache.
     """
     parsed_url = urlparse(file_url)
     query_params = parse_qs(parsed_url.query)
-    id_value = query_params.get("id")[0]
+    id_value = query_params.get("id")[0]  # type: ignore
 
     cache_file_path = os.path.join(CACHE_DIR, id_value)
 
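The demo cache helpers now carry full type annotations. An example of pulling a pre-built LiteLLM cache by name (a sketch: "test" is one of the NAMED_CACHES keys above, and the download needs access to the linked Google Drive file):

from opik_optimizer.demo.cache import get_litellm_cache

# Downloads (or reuses a previously fetched copy of) the named cache and
# merges its entries into the local disk cache at ~/.litellm_cache.
get_litellm_cache("test")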
opik_optimizer/demo/datasets.py
CHANGED
@@ -1,10 +1,5 @@
 import opik
-from typing import Literal
-from .. import utils
-from datasets import load_dataset
-import traceback
-from importlib.resources import files
-import json
+from typing import Literal
 import warnings
 from ..datasets import (
     hotpot_300,
@@ -21,6 +16,7 @@ from ..datasets import (
     rag_hallucinations,
 )
 
+
 class HaltError(Exception):
     """Exception raised when we need to halt the process due to a critical error."""
 
@@ -51,7 +47,7 @@ def get_or_create_dataset(
         "This function is deprecated. Please use the datasets directly from opik_optimizer.datasets module instead."
         " For example: opik_optimizer.datasets.truthful_qa() or opik_optimizer.datasets.rag_hallucination()",
         DeprecationWarning,
-        stacklevel=2
+        stacklevel=2,
    )
    if name == "hotpot-300":
        dataset = hotpot_300(test_mode)