lm-deluge 0.0.67__py3-none-any.whl → 0.0.88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lm-deluge might be problematic.
- lm_deluge/__init__.py +25 -2
- lm_deluge/api_requests/anthropic.py +92 -17
- lm_deluge/api_requests/base.py +47 -11
- lm_deluge/api_requests/bedrock.py +7 -4
- lm_deluge/api_requests/chat_reasoning.py +4 -0
- lm_deluge/api_requests/gemini.py +138 -18
- lm_deluge/api_requests/openai.py +114 -21
- lm_deluge/client.py +282 -49
- lm_deluge/config.py +15 -3
- lm_deluge/mock_openai.py +643 -0
- lm_deluge/models/__init__.py +12 -1
- lm_deluge/models/anthropic.py +17 -2
- lm_deluge/models/arcee.py +16 -0
- lm_deluge/models/deepseek.py +36 -4
- lm_deluge/models/google.py +29 -0
- lm_deluge/models/grok.py +24 -0
- lm_deluge/models/kimi.py +36 -0
- lm_deluge/models/minimax.py +10 -0
- lm_deluge/models/openai.py +100 -0
- lm_deluge/models/openrouter.py +86 -8
- lm_deluge/models/together.py +11 -0
- lm_deluge/models/zai.py +1 -0
- lm_deluge/pipelines/gepa/__init__.py +95 -0
- lm_deluge/pipelines/gepa/core.py +354 -0
- lm_deluge/pipelines/gepa/docs/samples.py +696 -0
- lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
- lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
- lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
- lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
- lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
- lm_deluge/pipelines/gepa/optimizer.py +435 -0
- lm_deluge/pipelines/gepa/proposer.py +235 -0
- lm_deluge/pipelines/gepa/util.py +165 -0
- lm_deluge/{llm_tools → pipelines}/score.py +2 -2
- lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
- lm_deluge/prompt.py +224 -40
- lm_deluge/request_context.py +7 -2
- lm_deluge/tool/__init__.py +1118 -0
- lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
- lm_deluge/tool/builtin/gemini.py +59 -0
- lm_deluge/tool/builtin/openai.py +74 -0
- lm_deluge/tool/cua/__init__.py +173 -0
- lm_deluge/tool/cua/actions.py +148 -0
- lm_deluge/tool/cua/base.py +27 -0
- lm_deluge/tool/cua/batch.py +215 -0
- lm_deluge/tool/cua/converters.py +466 -0
- lm_deluge/tool/cua/kernel.py +702 -0
- lm_deluge/tool/cua/trycua.py +989 -0
- lm_deluge/tool/prefab/__init__.py +45 -0
- lm_deluge/tool/prefab/batch_tool.py +156 -0
- lm_deluge/tool/prefab/docs.py +1119 -0
- lm_deluge/tool/prefab/email.py +294 -0
- lm_deluge/tool/prefab/filesystem.py +1711 -0
- lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
- lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
- lm_deluge/tool/prefab/memory.py +458 -0
- lm_deluge/tool/prefab/otc/__init__.py +165 -0
- lm_deluge/tool/prefab/otc/executor.py +281 -0
- lm_deluge/tool/prefab/otc/parse.py +188 -0
- lm_deluge/tool/prefab/random.py +212 -0
- lm_deluge/tool/prefab/rlm/__init__.py +296 -0
- lm_deluge/tool/prefab/rlm/executor.py +349 -0
- lm_deluge/tool/prefab/rlm/parse.py +144 -0
- lm_deluge/tool/prefab/sandbox.py +1621 -0
- lm_deluge/tool/prefab/sheets.py +385 -0
- lm_deluge/tool/prefab/subagents.py +233 -0
- lm_deluge/tool/prefab/todos.py +342 -0
- lm_deluge/tool/prefab/tool_search.py +169 -0
- lm_deluge/tool/prefab/web_search.py +199 -0
- lm_deluge/tracker.py +16 -13
- lm_deluge/util/schema.py +412 -0
- lm_deluge/warnings.py +8 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/METADATA +22 -9
- lm_deluge-0.0.88.dist-info/RECORD +117 -0
- lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
- lm_deluge/built_in_tools/openai.py +0 -28
- lm_deluge/presets/cerebras.py +0 -17
- lm_deluge/presets/meta.py +0 -13
- lm_deluge/tool.py +0 -849
- lm_deluge-0.0.67.dist-info/RECORD +0 -72
- lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
- /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/bash.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/computer_use.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/top_level.txt +0 -0
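
The file list shows two structural changes between 0.0.67 and 0.0.88: the llm_tools package is renamed to pipelines (gaining a new gepa subpackage), and the single-module tool.py is replaced by a tool/ package split into builtin, cua, and prefab. As a hedged sketch of what this implies for callers: the imports below are the ones actually used by the new GEPA example files reproduced later in this diff, while the classify lines are only inferred from the rename entries and may not match the package's real public surface.

# Inferred from the rename entries above (hypothetical, not a migration guide):
#   0.0.67: from lm_deluge.llm_tools import classify
#   0.0.88: from lm_deluge.pipelines import classify

# Taken verbatim from the new example files shown below in this diff:
from lm_deluge import LLMClient
from lm_deluge.pipelines.gepa import Component, EvalResult, optimize
from lm_deluge.prompt import Conversation, Message

client = LLMClient("gpt-4o-mini")  # model name as used in simple_qa.py below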
lm_deluge/pipelines/gepa/examples/04_batch_classification.py
@@ -0,0 +1,271 @@
+"""
+Example: Sentiment Classification
+
+Optimize a classification prompt for sentiment analysis.
+This example shows a straightforward classification task.
+
+The task:
+- Input: Text to classify
+- Output: Sentiment (positive/negative)
+- Score: Accuracy (exact match)
+
+Run:
+    python 04_batch_classification.py
+
+Requirements:
+    # Set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable
+"""
+
+import os
+import sys
+
+import dotenv
+
+from lm_deluge import LLMClient
+from lm_deluge.pipelines.gepa import Component, EvalResult, optimize
+from lm_deluge.prompt import Conversation, Message
+
+dotenv.load_dotenv()
+
+
+# Simple sentiment dataset
+SENTIMENT_DATA = [
+    {
+        "text": "This movie was absolutely fantastic! Best film I've seen all year.",
+        "label": "positive",
+    },
+    {
+        "text": "Terrible waste of time. The acting was wooden and the plot made no sense.",
+        "label": "negative",
+    },
+    {"text": "I loved every minute of it. Highly recommend!", "label": "positive"},
+    {
+        "text": "Boring and predictable. I fell asleep halfway through.",
+        "label": "negative",
+    },
+    {
+        "text": "A masterpiece of modern cinema. The director outdid themselves.",
+        "label": "positive",
+    },
+    {
+        "text": "Don't bother watching this garbage. Complete disappointment.",
+        "label": "negative",
+    },
+    {
+        "text": "Heartwarming story with great performances. Brought tears to my eyes.",
+        "label": "positive",
+    },
+    {
+        "text": "Confusing mess with no redeeming qualities whatsoever.",
+        "label": "negative",
+    },
+    {
+        "text": "Fun, entertaining, and surprisingly deep. A real gem!",
+        "label": "positive",
+    },
+    {
+        "text": "Painfully slow and utterly forgettable. Save your money.",
+        "label": "negative",
+    },
+    {
+        "text": "Outstanding cinematography and a compelling narrative.",
+        "label": "positive",
+    },
+    {
+        "text": "Worst movie of the decade. I want my two hours back.",
+        "label": "negative",
+    },
+    {
+        "text": "Delightful from start to finish. Perfect family entertainment.",
+        "label": "positive",
+    },
+    {
+        "text": "Pretentious drivel that thinks it's smarter than it is.",
+        "label": "negative",
+    },
+    {
+        "text": "A thrilling ride that keeps you on the edge of your seat!",
+        "label": "positive",
+    },
+    {
+        "text": "Lazy writing and cheap production values. Very disappointing.",
+        "label": "negative",
+    },
+    {
+        "text": "Beautiful, moving, and thought-provoking. A must-see.",
+        "label": "positive",
+    },
+    {"text": "Annoying characters and a story that goes nowhere.", "label": "negative"},
+    {
+        "text": "Pure magic on screen. I'll be thinking about this for days.",
+        "label": "positive",
+    },
+    {
+        "text": "Amateurish in every way. Hard to believe this got made.",
+        "label": "negative",
+    },
+]
+
+
+def extract_prediction(output: str) -> str:
+    """Extract sentiment prediction from model output."""
+    output_lower = output.lower().strip()
+
+    if "positive" in output_lower and "negative" not in output_lower:
+        return "positive"
+    elif "negative" in output_lower and "positive" not in output_lower:
+        return "negative"
+    elif output_lower.startswith("positive"):
+        return "positive"
+    elif output_lower.startswith("negative"):
+        return "negative"
+    else:
+        return "unknown"
+
+
+def make_evaluate_fn(task_client: LLMClient): # type: ignore
+    """Create the evaluate function."""
+
+    def evaluate(
+        client: LLMClient, # type: ignore
+        component_values: dict[str, str],
+        example: dict,
+    ) -> EvalResult:
+        """Evaluate one classification example."""
+        # Build conversation
+        conv = Conversation.system(component_values["system_prompt"])
+
+        user_msg = f"""Text to classify:
+"{example['text']}"
+
+{component_values['output_format']}"""
+        conv = conv.add(Message.user(user_msg))
+
+        # Run inference
+        response = client.process_prompts_sync([conv], show_progress=False)[0]
+        output = response.completion or ""
+
+        # Extract prediction and score
+        pred = extract_prediction(output)
+        correct = pred == example["label"]
+        score = 1.0 if correct else 0.0
+
+        # Build feedback
+        if correct:
+            feedback = f"""Score: 1.0 (CORRECT)
+Text: "{example['text'][:50]}..."
+Expected: {example['label']}
+Predicted: {pred}"""
+        else:
+            feedback = f"""Score: 0.0 (INCORRECT)
+Text: "{example['text']}"
+Expected: {example['label']}
+Model output: {output}
+Extracted prediction: {pred}
+
+The model either misclassified the sentiment or failed to output a clear positive/negative label."""
+
+        # Return full trajectory
+        full_conv = conv.add(Message.ai(output))
+        return EvalResult(conversation=full_conv, score=score, feedback=feedback)
+
+    return evaluate
+
+
+def main():
+    # Check for API keys
+    model = None
+    proposer_model = None
+
+    if os.getenv("OPENAI_API_KEY"):
+        model = "gpt-4.1-nano"
+        proposer_model = "gpt-4.1-mini"
+    elif os.getenv("ANTHROPIC_API_KEY"):
+        model = "claude-3-5-haiku-latest"
+        proposer_model = "claude-sonnet-4-20250514"
+    else:
+        print("Please set OPENAI_API_KEY or ANTHROPIC_API_KEY")
+        sys.exit(1)
+
+    print(f"Using task model: {model}")
+    print(f"Using proposer model: {proposer_model}")
+
+    # Split data
+    trainset = SENTIMENT_DATA[:14]
+    valset = SENTIMENT_DATA[14:]
+    print(f"Training: {len(trainset)}, Validation: {len(valset)} examples")
+
+    # Create clients
+    task_client = LLMClient( # type: ignore[operator]
+        model,
+        max_requests_per_minute=200,
+        max_new_tokens=50,
+        temperature=0.0,
+    )
+    proposer_client = LLMClient( # type: ignore[operator]
+        proposer_model,
+        max_requests_per_minute=50,
+        max_new_tokens=1024,
+    )
+
+    # Define components to optimize
+    components = {
+        "system_prompt": Component(
+            description="System prompt that instructs the model to classify sentiment",
+            value="Classify the sentiment of the following text.",
+        ),
+        "output_format": Component(
+            description="Instructions for how to format the classification output",
+            value="Respond with either 'positive' or 'negative'.",
+        ),
+    }
+
+    print()
+    print("=" * 60)
+    print("GEPA Example: Sentiment Classification")
+    print("=" * 60)
+    print("Components being optimized:")
+    for name, comp in components.items():
+        print(f" - {name}: {comp.value}")
+    print()
+
+    # Run optimization
+    result = optimize(
+        components=components,
+        evaluate_fn=make_evaluate_fn(task_client), # type: ignore[arg-type]
+        dataset=trainset,
+        val_dataset=valset,
+        task_client=task_client,
+        proposer_client=proposer_client,
+        max_iterations=15,
+        max_evals=150,
+        minibatch_size=4,
+        run_dir="./sentiment_gepa",
+        save_trajectories=True,
+        seed=42,
+    )
+
+    print()
+    print("=" * 60)
+    print("Results")
+    print("=" * 60)
+    print(f"Candidates discovered: {result.num_candidates}")
+    print(f"Best validation accuracy: {result.best_score:.1%}")
+    print(f"Total evaluations: {result.total_evals}")
+    print()
+    print("Best candidate found:")
+    print("-" * 40)
+    for name, text in result.best_candidate.items():
+        print(f"{name}:")
+        print(f" {text}")
+    print("-" * 40)
+
+    # Show improvement
+    seed_score = result.candidate_avg_scores[0]
+    print(f"\nSeed accuracy: {seed_score:.1%}")
+    print(f"Best accuracy: {result.best_score:.1%}")
+    print(f"Improvement: +{(result.best_score - seed_score):.1%}")
+
+
+if __name__ == "__main__":
+    main()
lm_deluge/pipelines/gepa/examples/simple_qa.py
@@ -0,0 +1,129 @@
+"""
+Simple Q&A example for GEPA.
+
+This example optimizes a system prompt for answering trivia questions.
+It demonstrates the minimal setup needed to use GEPA.
+"""
+
+from lm_deluge import LLMClient
+from lm_deluge.pipelines.gepa import Component, EvalResult, optimize
+from lm_deluge.prompt import Conversation, Message
+
+
+# Sample dataset - trivia questions
+DATASET = [
+    {"question": "What is the capital of France?", "answer": "Paris"},
+    {
+        "question": "What is the largest planet in our solar system?",
+        "answer": "Jupiter",
+    },
+    {"question": "Who wrote Romeo and Juliet?", "answer": "Shakespeare"},
+    {"question": "What is the chemical symbol for gold?", "answer": "Au"},
+    {"question": "What year did World War II end?", "answer": "1945"},
+    {"question": "What is the smallest prime number?", "answer": "2"},
+    {"question": "What is the capital of Japan?", "answer": "Tokyo"},
+    {"question": "Who painted the Mona Lisa?", "answer": "Leonardo da Vinci"},
+    {"question": "What is the speed of light in m/s?", "answer": "299792458"},
+    {"question": "What is the largest ocean on Earth?", "answer": "Pacific"},
+]
+
+
+def evaluate(
+    client: LLMClient, # type: ignore
+    component_values: dict[str, str],
+    example: dict,
+) -> EvalResult:
+    """
+    Evaluate one example.
+
+    This function:
+    1. Builds a prompt using the current component values
+    2. Runs inference
+    3. Scores the result
+    4. Returns the full trajectory with feedback
+    """
+    # Build conversation with current system prompt
+    conv = Conversation.system(component_values["system_prompt"])
+    conv = conv.add(Message.user(example["question"]))
+
+    # Run inference
+    response = client.process_prompts_sync([conv], show_progress=False)[0]
+    answer = response.completion
+
+    # Score: check if the expected answer appears in the response
+    expected = example["answer"].lower()
+    got = answer.lower()
+    correct = expected in got
+
+    score = 1.0 if correct else 0.0
+
+    # Build informative feedback for the proposer
+    feedback = f"""Score: {score}
+Question: {example['question']}
+Expected answer to contain: {example['answer']}
+Model response: {answer[:200]}{'...' if len(answer) > 200 else ''}
+Result: {'CORRECT' if correct else 'INCORRECT'}"""
+
+    # Return full trajectory
+    full_conv = conv.add(Message.ai(answer))
+    return EvalResult(conversation=full_conv, score=score, feedback=feedback)
+
+
+def main():
+    # Define the component to optimize
+    components = {
+        "system_prompt": Component(
+            description="System prompt that instructs the model how to answer questions",
+            value="You are a helpful assistant. Answer questions concisely.",
+        ),
+    }
+
+    # Create clients
+    # task_client runs the actual Q&A
+    # proposer_client analyzes trajectories and proposes improvements
+    task_client = LLMClient("gpt-4o-mini") # type: ignore[operator]
+    proposer_client = LLMClient("gpt-4o-mini") # type: ignore[operator]
+
+    # Split dataset
+    train_data = DATASET[:7]
+    val_data = DATASET[7:]
+
+    print("Starting GEPA optimization...")
+    print(f"Training examples: {len(train_data)}")
+    print(f"Validation examples: {len(val_data)}")
+    print(f"Initial prompt: {components['system_prompt'].value}")
+    print()
+
+    # Run optimization
+    result = optimize(
+        components=components,
+        evaluate_fn=evaluate, # type: ignore[arg-type]
+        dataset=train_data,
+        val_dataset=val_data,
+        task_client=task_client,
+        proposer_client=proposer_client,
+        max_iterations=10,
+        max_evals=100,
+        minibatch_size=3,
+        run_dir="gepa_simple_qa",
+        save_trajectories=True,
+    )
+
+    # Print results
+    print("\n" + "=" * 50)
+    print("Optimization complete!")
+    print(f"Total evaluations: {result.total_evals}")
+    print(f"Candidates explored: {result.num_candidates}")
+    print(f"Best score: {result.best_score:.2f}")
+    print(f"\nBest system prompt:\n{result.best_candidate['system_prompt']}")
+
+    # Show improvement history
+    if result.num_candidates > 1:
+        print("\nImprovement history:")
+        for idx, candidate, score in result.best_k(5):
+            parent = result.candidate_parents[idx]
+            print(f" Candidate {idx} (parent={parent}): score={score:.2f}")
+
+
+if __name__ == "__main__":
+    main()