lm-deluge 0.0.67__py3-none-any.whl → 0.0.88__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of lm-deluge might be problematic.

Files changed (92)
  1. lm_deluge/__init__.py +25 -2
  2. lm_deluge/api_requests/anthropic.py +92 -17
  3. lm_deluge/api_requests/base.py +47 -11
  4. lm_deluge/api_requests/bedrock.py +7 -4
  5. lm_deluge/api_requests/chat_reasoning.py +4 -0
  6. lm_deluge/api_requests/gemini.py +138 -18
  7. lm_deluge/api_requests/openai.py +114 -21
  8. lm_deluge/client.py +282 -49
  9. lm_deluge/config.py +15 -3
  10. lm_deluge/mock_openai.py +643 -0
  11. lm_deluge/models/__init__.py +12 -1
  12. lm_deluge/models/anthropic.py +17 -2
  13. lm_deluge/models/arcee.py +16 -0
  14. lm_deluge/models/deepseek.py +36 -4
  15. lm_deluge/models/google.py +29 -0
  16. lm_deluge/models/grok.py +24 -0
  17. lm_deluge/models/kimi.py +36 -0
  18. lm_deluge/models/minimax.py +10 -0
  19. lm_deluge/models/openai.py +100 -0
  20. lm_deluge/models/openrouter.py +86 -8
  21. lm_deluge/models/together.py +11 -0
  22. lm_deluge/models/zai.py +1 -0
  23. lm_deluge/pipelines/gepa/__init__.py +95 -0
  24. lm_deluge/pipelines/gepa/core.py +354 -0
  25. lm_deluge/pipelines/gepa/docs/samples.py +696 -0
  26. lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
  27. lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
  28. lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
  29. lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
  30. lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
  31. lm_deluge/pipelines/gepa/optimizer.py +435 -0
  32. lm_deluge/pipelines/gepa/proposer.py +235 -0
  33. lm_deluge/pipelines/gepa/util.py +165 -0
  34. lm_deluge/{llm_tools → pipelines}/score.py +2 -2
  35. lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
  36. lm_deluge/prompt.py +224 -40
  37. lm_deluge/request_context.py +7 -2
  38. lm_deluge/tool/__init__.py +1118 -0
  39. lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
  40. lm_deluge/tool/builtin/gemini.py +59 -0
  41. lm_deluge/tool/builtin/openai.py +74 -0
  42. lm_deluge/tool/cua/__init__.py +173 -0
  43. lm_deluge/tool/cua/actions.py +148 -0
  44. lm_deluge/tool/cua/base.py +27 -0
  45. lm_deluge/tool/cua/batch.py +215 -0
  46. lm_deluge/tool/cua/converters.py +466 -0
  47. lm_deluge/tool/cua/kernel.py +702 -0
  48. lm_deluge/tool/cua/trycua.py +989 -0
  49. lm_deluge/tool/prefab/__init__.py +45 -0
  50. lm_deluge/tool/prefab/batch_tool.py +156 -0
  51. lm_deluge/tool/prefab/docs.py +1119 -0
  52. lm_deluge/tool/prefab/email.py +294 -0
  53. lm_deluge/tool/prefab/filesystem.py +1711 -0
  54. lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
  55. lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
  56. lm_deluge/tool/prefab/memory.py +458 -0
  57. lm_deluge/tool/prefab/otc/__init__.py +165 -0
  58. lm_deluge/tool/prefab/otc/executor.py +281 -0
  59. lm_deluge/tool/prefab/otc/parse.py +188 -0
  60. lm_deluge/tool/prefab/random.py +212 -0
  61. lm_deluge/tool/prefab/rlm/__init__.py +296 -0
  62. lm_deluge/tool/prefab/rlm/executor.py +349 -0
  63. lm_deluge/tool/prefab/rlm/parse.py +144 -0
  64. lm_deluge/tool/prefab/sandbox.py +1621 -0
  65. lm_deluge/tool/prefab/sheets.py +385 -0
  66. lm_deluge/tool/prefab/subagents.py +233 -0
  67. lm_deluge/tool/prefab/todos.py +342 -0
  68. lm_deluge/tool/prefab/tool_search.py +169 -0
  69. lm_deluge/tool/prefab/web_search.py +199 -0
  70. lm_deluge/tracker.py +16 -13
  71. lm_deluge/util/schema.py +412 -0
  72. lm_deluge/warnings.py +8 -0
  73. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/METADATA +22 -9
  74. lm_deluge-0.0.88.dist-info/RECORD +117 -0
  75. lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
  76. lm_deluge/built_in_tools/openai.py +0 -28
  77. lm_deluge/presets/cerebras.py +0 -17
  78. lm_deluge/presets/meta.py +0 -13
  79. lm_deluge/tool.py +0 -849
  80. lm_deluge-0.0.67.dist-info/RECORD +0 -72
  81. lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
  82. /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
  83. /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
  84. /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
  85. /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
  86. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/bash.py +0 -0
  87. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/computer_use.py +0 -0
  88. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
  89. /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
  90. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/WHEEL +0 -0
  91. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/licenses/LICENSE +0 -0
  92. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/top_level.txt +0 -0
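
Note on the reorganization above: the file moves imply new import paths in 0.0.88. This is inferred only from the renames listed here; the package's public re-exports are not verified. Modules that lived under `lm_deluge.llm_tools` now sit under `lm_deluge.pipelines`, and the built-in tool helpers move from `lm_deluge.built_in_tools` to `lm_deluge.tool.builtin`. A minimal, hypothetical migration sketch:

    # Hypothetical migration sketch based only on the file moves listed above;
    # the actual public exports of lm-deluge 0.0.88 are not verified here.

    # 0.0.67:
    #   from lm_deluge.llm_tools import classify, extract, ocr
    #   from lm_deluge.built_in_tools.anthropic import bash, editor

    # 0.0.88:
    from lm_deluge.pipelines import classify, extract, ocr
    from lm_deluge.tool.builtin.anthropic import bash, editor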
lm_deluge/pipelines/gepa/examples/04_batch_classification.py
@@ -0,0 +1,271 @@
+ """
+ Example: Sentiment Classification
+
+ Optimize a classification prompt for sentiment analysis.
+ This example shows a straightforward classification task.
+
+ The task:
+ - Input: Text to classify
+ - Output: Sentiment (positive/negative)
+ - Score: Accuracy (exact match)
+
+ Run:
+     python 04_batch_classification.py
+
+ Requirements:
+     # Set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable
+ """
+
+ import os
+ import sys
+
+ import dotenv
+
+ from lm_deluge import LLMClient
+ from lm_deluge.pipelines.gepa import Component, EvalResult, optimize
+ from lm_deluge.prompt import Conversation, Message
+
+ dotenv.load_dotenv()
+
+
+ # Simple sentiment dataset
+ SENTIMENT_DATA = [
+     {
+         "text": "This movie was absolutely fantastic! Best film I've seen all year.",
+         "label": "positive",
+     },
+     {
+         "text": "Terrible waste of time. The acting was wooden and the plot made no sense.",
+         "label": "negative",
+     },
+     {"text": "I loved every minute of it. Highly recommend!", "label": "positive"},
+     {
+         "text": "Boring and predictable. I fell asleep halfway through.",
+         "label": "negative",
+     },
+     {
+         "text": "A masterpiece of modern cinema. The director outdid themselves.",
+         "label": "positive",
+     },
+     {
+         "text": "Don't bother watching this garbage. Complete disappointment.",
+         "label": "negative",
+     },
+     {
+         "text": "Heartwarming story with great performances. Brought tears to my eyes.",
+         "label": "positive",
+     },
+     {
+         "text": "Confusing mess with no redeeming qualities whatsoever.",
+         "label": "negative",
+     },
+     {
+         "text": "Fun, entertaining, and surprisingly deep. A real gem!",
+         "label": "positive",
+     },
+     {
+         "text": "Painfully slow and utterly forgettable. Save your money.",
+         "label": "negative",
+     },
+     {
+         "text": "Outstanding cinematography and a compelling narrative.",
+         "label": "positive",
+     },
+     {
+         "text": "Worst movie of the decade. I want my two hours back.",
+         "label": "negative",
+     },
+     {
+         "text": "Delightful from start to finish. Perfect family entertainment.",
+         "label": "positive",
+     },
+     {
+         "text": "Pretentious drivel that thinks it's smarter than it is.",
+         "label": "negative",
+     },
+     {
+         "text": "A thrilling ride that keeps you on the edge of your seat!",
+         "label": "positive",
+     },
+     {
+         "text": "Lazy writing and cheap production values. Very disappointing.",
+         "label": "negative",
+     },
+     {
+         "text": "Beautiful, moving, and thought-provoking. A must-see.",
+         "label": "positive",
+     },
+     {"text": "Annoying characters and a story that goes nowhere.", "label": "negative"},
+     {
+         "text": "Pure magic on screen. I'll be thinking about this for days.",
+         "label": "positive",
+     },
+     {
+         "text": "Amateurish in every way. Hard to believe this got made.",
+         "label": "negative",
+     },
+ ]
+
+
+ def extract_prediction(output: str) -> str:
+     """Extract sentiment prediction from model output."""
+     output_lower = output.lower().strip()
+
+     if "positive" in output_lower and "negative" not in output_lower:
+         return "positive"
+     elif "negative" in output_lower and "positive" not in output_lower:
+         return "negative"
+     elif output_lower.startswith("positive"):
+         return "positive"
+     elif output_lower.startswith("negative"):
+         return "negative"
+     else:
+         return "unknown"
+
+
+ def make_evaluate_fn(task_client: LLMClient):  # type: ignore
+     """Create the evaluate function."""
+
+     def evaluate(
+         client: LLMClient,  # type: ignore
+         component_values: dict[str, str],
+         example: dict,
+     ) -> EvalResult:
+         """Evaluate one classification example."""
+         # Build conversation
+         conv = Conversation.system(component_values["system_prompt"])
+
+         user_msg = f"""Text to classify:
+ "{example['text']}"
+
+ {component_values['output_format']}"""
+         conv = conv.add(Message.user(user_msg))
+
+         # Run inference
+         response = client.process_prompts_sync([conv], show_progress=False)[0]
+         output = response.completion or ""
+
+         # Extract prediction and score
+         pred = extract_prediction(output)
+         correct = pred == example["label"]
+         score = 1.0 if correct else 0.0
+
+         # Build feedback
+         if correct:
+             feedback = f"""Score: 1.0 (CORRECT)
+ Text: "{example['text'][:50]}..."
+ Expected: {example['label']}
+ Predicted: {pred}"""
+         else:
+             feedback = f"""Score: 0.0 (INCORRECT)
+ Text: "{example['text']}"
+ Expected: {example['label']}
+ Model output: {output}
+ Extracted prediction: {pred}
+
+ The model either misclassified the sentiment or failed to output a clear positive/negative label."""
+
+         # Return full trajectory
+         full_conv = conv.add(Message.ai(output))
+         return EvalResult(conversation=full_conv, score=score, feedback=feedback)
+
+     return evaluate
+
+
+ def main():
+     # Check for API keys
+     model = None
+     proposer_model = None
+
+     if os.getenv("OPENAI_API_KEY"):
+         model = "gpt-4.1-nano"
+         proposer_model = "gpt-4.1-mini"
+     elif os.getenv("ANTHROPIC_API_KEY"):
+         model = "claude-3-5-haiku-latest"
+         proposer_model = "claude-sonnet-4-20250514"
+     else:
+         print("Please set OPENAI_API_KEY or ANTHROPIC_API_KEY")
+         sys.exit(1)
+
+     print(f"Using task model: {model}")
+     print(f"Using proposer model: {proposer_model}")
+
+     # Split data
+     trainset = SENTIMENT_DATA[:14]
+     valset = SENTIMENT_DATA[14:]
+     print(f"Training: {len(trainset)}, Validation: {len(valset)} examples")
+
+     # Create clients
+     task_client = LLMClient(  # type: ignore[operator]
+         model,
+         max_requests_per_minute=200,
+         max_new_tokens=50,
+         temperature=0.0,
+     )
+     proposer_client = LLMClient(  # type: ignore[operator]
+         proposer_model,
+         max_requests_per_minute=50,
+         max_new_tokens=1024,
+     )
+
+     # Define components to optimize
+     components = {
+         "system_prompt": Component(
+             description="System prompt that instructs the model to classify sentiment",
+             value="Classify the sentiment of the following text.",
+         ),
+         "output_format": Component(
+             description="Instructions for how to format the classification output",
+             value="Respond with either 'positive' or 'negative'.",
+         ),
+     }
+
+     print()
+     print("=" * 60)
+     print("GEPA Example: Sentiment Classification")
+     print("=" * 60)
+     print("Components being optimized:")
+     for name, comp in components.items():
+         print(f" - {name}: {comp.value}")
+     print()
+
+     # Run optimization
+     result = optimize(
+         components=components,
+         evaluate_fn=make_evaluate_fn(task_client),  # type: ignore[arg-type]
+         dataset=trainset,
+         val_dataset=valset,
+         task_client=task_client,
+         proposer_client=proposer_client,
+         max_iterations=15,
+         max_evals=150,
+         minibatch_size=4,
+         run_dir="./sentiment_gepa",
+         save_trajectories=True,
+         seed=42,
+     )
+
+     print()
+     print("=" * 60)
+     print("Results")
+     print("=" * 60)
+     print(f"Candidates discovered: {result.num_candidates}")
+     print(f"Best validation accuracy: {result.best_score:.1%}")
+     print(f"Total evaluations: {result.total_evals}")
+     print()
+     print("Best candidate found:")
+     print("-" * 40)
+     for name, text in result.best_candidate.items():
+         print(f"{name}:")
+         print(f" {text}")
+     print("-" * 40)
+
+     # Show improvement
+     seed_score = result.candidate_avg_scores[0]
+     print(f"\nSeed accuracy: {seed_score:.1%}")
+     print(f"Best accuracy: {result.best_score:.1%}")
+     print(f"Improvement: +{(result.best_score - seed_score):.1%}")
+
+
+ if __name__ == "__main__":
+     main()
lm_deluge/pipelines/gepa/examples/simple_qa.py
@@ -0,0 +1,129 @@
+ """
+ Simple Q&A example for GEPA.
+
+ This example optimizes a system prompt for answering trivia questions.
+ It demonstrates the minimal setup needed to use GEPA.
+ """
+
+ from lm_deluge import LLMClient
+ from lm_deluge.pipelines.gepa import Component, EvalResult, optimize
+ from lm_deluge.prompt import Conversation, Message
+
+
+ # Sample dataset - trivia questions
+ DATASET = [
+     {"question": "What is the capital of France?", "answer": "Paris"},
+     {
+         "question": "What is the largest planet in our solar system?",
+         "answer": "Jupiter",
+     },
+     {"question": "Who wrote Romeo and Juliet?", "answer": "Shakespeare"},
+     {"question": "What is the chemical symbol for gold?", "answer": "Au"},
+     {"question": "What year did World War II end?", "answer": "1945"},
+     {"question": "What is the smallest prime number?", "answer": "2"},
+     {"question": "What is the capital of Japan?", "answer": "Tokyo"},
+     {"question": "Who painted the Mona Lisa?", "answer": "Leonardo da Vinci"},
+     {"question": "What is the speed of light in m/s?", "answer": "299792458"},
+     {"question": "What is the largest ocean on Earth?", "answer": "Pacific"},
+ ]
+
+
+ def evaluate(
+     client: LLMClient,  # type: ignore
+     component_values: dict[str, str],
+     example: dict,
+ ) -> EvalResult:
+     """
+     Evaluate one example.
+
+     This function:
+     1. Builds a prompt using the current component values
+     2. Runs inference
+     3. Scores the result
+     4. Returns the full trajectory with feedback
+     """
+     # Build conversation with current system prompt
+     conv = Conversation.system(component_values["system_prompt"])
+     conv = conv.add(Message.user(example["question"]))
+
+     # Run inference
+     response = client.process_prompts_sync([conv], show_progress=False)[0]
+     answer = response.completion
+
+     # Score: check if the expected answer appears in the response
+     expected = example["answer"].lower()
+     got = answer.lower()
+     correct = expected in got
+
+     score = 1.0 if correct else 0.0
+
+     # Build informative feedback for the proposer
+     feedback = f"""Score: {score}
+ Question: {example['question']}
+ Expected answer to contain: {example['answer']}
+ Model response: {answer[:200]}{'...' if len(answer) > 200 else ''}
+ Result: {'CORRECT' if correct else 'INCORRECT'}"""
+
+     # Return full trajectory
+     full_conv = conv.add(Message.ai(answer))
+     return EvalResult(conversation=full_conv, score=score, feedback=feedback)
+
+
+ def main():
+     # Define the component to optimize
+     components = {
+         "system_prompt": Component(
+             description="System prompt that instructs the model how to answer questions",
+             value="You are a helpful assistant. Answer questions concisely.",
+         ),
+     }
+
+     # Create clients
+     # task_client runs the actual Q&A
+     # proposer_client analyzes trajectories and proposes improvements
+     task_client = LLMClient("gpt-4o-mini")  # type: ignore[operator]
+     proposer_client = LLMClient("gpt-4o-mini")  # type: ignore[operator]
+
+     # Split dataset
+     train_data = DATASET[:7]
+     val_data = DATASET[7:]
+
+     print("Starting GEPA optimization...")
+     print(f"Training examples: {len(train_data)}")
+     print(f"Validation examples: {len(val_data)}")
+     print(f"Initial prompt: {components['system_prompt'].value}")
+     print()
+
+     # Run optimization
+     result = optimize(
+         components=components,
+         evaluate_fn=evaluate,  # type: ignore[arg-type]
+         dataset=train_data,
+         val_dataset=val_data,
+         task_client=task_client,
+         proposer_client=proposer_client,
+         max_iterations=10,
+         max_evals=100,
+         minibatch_size=3,
+         run_dir="gepa_simple_qa",
+         save_trajectories=True,
+     )
+
+     # Print results
+     print("\n" + "=" * 50)
+     print("Optimization complete!")
+     print(f"Total evaluations: {result.total_evals}")
+     print(f"Candidates explored: {result.num_candidates}")
+     print(f"Best score: {result.best_score:.2f}")
+     print(f"\nBest system prompt:\n{result.best_candidate['system_prompt']}")
+
+     # Show improvement history
+     if result.num_candidates > 1:
+         print("\nImprovement history:")
+         for idx, candidate, score in result.best_k(5):
+             parent = result.candidate_parents[idx]
+             print(f"  Candidate {idx} (parent={parent}): score={score:.2f}")
+
+
+ if __name__ == "__main__":
+     main()