mrmd-ai 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/PKG-INFO +1 -1
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/pyproject.toml +1 -1
- mrmd_ai-0.1.1/src/mrmd_ai/juice.py +673 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/__init__.py +11 -0
- mrmd_ai-0.1.1/src/mrmd_ai/modules/edit.py +102 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/server.py +97 -19
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/__init__.py +15 -0
- mrmd_ai-0.1.1/src/mrmd_ai/signatures/edit.py +173 -0
- mrmd_ai-0.1.0/src/mrmd_ai/juice.py +0 -416
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/.gitignore +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/README.md +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/dspy.config.yaml +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/AddTypeHintsPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/CorrectAndFinishLinePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/DocumentCodePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/ExplainCodePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FinishCodeLinePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FinishCodeSectionPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FinishParagraphPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FinishSentencePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FixGrammarPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FixTranscriptionPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/FormatCodePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/GetSynonymsPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/IdentifyReplacementPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/ImproveNamesPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/RefactorCodePredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/ReformatMarkdownPredict.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/logs/server.log +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/openapi.json +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/__init__.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/metrics/__init__.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/code.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/correct.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/document.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/finish.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/fix.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/notebook.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/modules/text.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/optimizers/__init__.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/code.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/correct.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/document.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/finish.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/fix.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/notebook.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/signatures/text.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/src/mrmd_ai/utils/__init__.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/tests/__init__.py +0 -0
- {mrmd_ai-0.1.0 → mrmd_ai-0.1.1}/uv.lock +0 -0
```diff
@@ -0,0 +1,673 @@
+"""
+Juice Level System for MRMD AI Programs.
+
+Juice levels control the quality/cost tradeoff of AI responses:
+- Level 0: Kimi K2 on Groq (fast, cheap, default)
+- Level 1: Claude Sonnet 4.5 (better quality)
+- Level 2: Gemini 3 Pro with thinking (deep reasoning)
+- Level 3: Claude Opus 4.5 with high thinking (maximum single-model quality)
+- Level 4: Multi-model merger (Grok 4 + Sonnet 4.5 + Gemini 3 + Opus 4.5, synthesized by Gemini 3)
+"""
+
+from enum import IntEnum
+from typing import Any, Callable
+from dataclasses import dataclass, field
+import dspy
+
+
+class JuiceLevel(IntEnum):
+    """Progressive quality levels for AI responses."""
+
+    # Fast & cheap - Kimi K2 on Groq
+    QUICK = 0
+
+    # Better quality - Sonnet 4.5
+    BALANCED = 1
+
+    # Deep reasoning - Gemini 3 with thinking
+    DEEP = 2
+
+    # Maximum single-model - Opus 4.5 with high thinking
+    MAXIMUM = 3
+
+    # Multi-model merger - all models synthesized
+    ULTIMATE = 4
+
+
+class ReasoningLevel(IntEnum):
+    """Independent reasoning/thinking budget control.
+
+    This is separate from JuiceLevel and controls how much "thinking"
+    the model does, independent of which model is selected.
+    """
+
+    # No extended thinking - fastest responses
+    OFF = 0
+
+    # Minimal reasoning
+    MINIMAL = 1
+
+    # Low reasoning effort
+    LOW = 2
+
+    # Medium reasoning effort
+    MEDIUM = 3
+
+    # High reasoning effort
+    HIGH = 4
+
+    # Maximum reasoning budget
+    MAXIMUM = 5
+
+
+# Map reasoning levels to thinking budgets and reasoning_effort values
+# For Anthropic: uses `thinking={"type": "enabled", "budget_tokens": X}`
+# For others: uses `reasoning_effort` ("low", "medium", "high")
+# Note: Anthropic requires max_tokens > thinking.budget_tokens
+REASONING_CONFIGS: dict[ReasoningLevel, dict] = {
+    ReasoningLevel.OFF: {
+        "budget_tokens": None,  # No thinking
+        "reasoning_effort": None,
+        "temperature": None,  # None means use model default
+    },
+    ReasoningLevel.MINIMAL: {
+        "budget_tokens": 1024,  # Minimum thinking budget
+        "reasoning_effort": "low",
+        "temperature": 1.0,  # Required for Anthropic extended thinking
+        "max_tokens": 4096,  # Must be > budget_tokens
+    },
+    ReasoningLevel.LOW: {
+        "budget_tokens": 4096,
+        "reasoning_effort": "low",
+        "temperature": 1.0,
+        "max_tokens": 8192,
+    },
+    ReasoningLevel.MEDIUM: {
+        "budget_tokens": 8192,
+        "reasoning_effort": "medium",
+        "temperature": 1.0,
+        "max_tokens": 16000,
+    },
+    ReasoningLevel.HIGH: {
+        "budget_tokens": 16384,
+        "reasoning_effort": "high",
+        "temperature": 1.0,
+        "max_tokens": 24000,
+    },
+    ReasoningLevel.MAXIMUM: {
+        "budget_tokens": 32768,  # Maximum thinking budget
+        "reasoning_effort": "high",
+        "temperature": 1.0,
+        "max_tokens": 48000,  # Must be > budget_tokens
+    },
+}
+
+
+REASONING_DESCRIPTIONS = {
+    ReasoningLevel.OFF: "Off - No extended thinking",
+    ReasoningLevel.MINIMAL: "Minimal - Light reasoning",
+    ReasoningLevel.LOW: "Low - Some reasoning",
+    ReasoningLevel.MEDIUM: "Medium - Moderate reasoning",
+    ReasoningLevel.HIGH: "High - Deep reasoning",
+    ReasoningLevel.MAXIMUM: "Maximum - Full reasoning budget",
+}
+
+
+@dataclass
+class ModelConfig:
+    """Configuration for a model at a specific juice level."""
+    model: str
+    temperature: float = 0.7
+    max_tokens: int = 4096
+    reasoning_effort: str | None = None
+    thinking: dict | None = None
+    supports_reasoning: bool = True  # Whether the model supports reasoning_effort
+    extra_kwargs: dict = field(default_factory=dict)
+
+    def to_lm_kwargs(self) -> dict:
+        """Convert to dspy.LM kwargs."""
+        kwargs = {
+            "model": self.model,
+            "temperature": self.temperature,
+            "max_tokens": self.max_tokens,
+            **self.extra_kwargs,
+        }
+        if self.reasoning_effort:
+            kwargs["reasoning_effort"] = self.reasoning_effort
+        if self.thinking:
+            kwargs["thinking"] = self.thinking
+        return kwargs
+
+
```
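For orientation, a minimal sketch of how `ModelConfig.to_lm_kwargs()` is meant to be consumed; the `my_config` instance here is a hypothetical example, not part of the package:

```python
# Hypothetical config; mirrors the JUICE_MODELS entries defined below.
my_config = ModelConfig(
    model="anthropic/claude-sonnet-4-5",
    temperature=0.7,
    max_tokens=4096,
    reasoning_effort="high",
)

kwargs = my_config.to_lm_kwargs()
# kwargs == {
#     "model": "anthropic/claude-sonnet-4-5",
#     "temperature": 0.7,
#     "max_tokens": 4096,
#     "reasoning_effort": "high",
# }
lm = dspy.LM(**kwargs)  # the same call the module makes internally
```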
```diff
+# Model configurations for each juice level
+# supports_reasoning indicates if the model/provider supports reasoning_effort parameter
+JUICE_MODELS: dict[JuiceLevel, ModelConfig] = {
+    JuiceLevel.QUICK: ModelConfig(
+        model="groq/moonshotai/kimi-k2-instruct-0905",
+        temperature=0.7,
+        max_tokens=4096,
+        supports_reasoning=False,  # Groq does NOT support reasoning_effort
+    ),
+    JuiceLevel.BALANCED: ModelConfig(
+        model="anthropic/claude-sonnet-4-5",
+        temperature=0.7,
+        max_tokens=4096,
+        supports_reasoning=True,  # Anthropic supports reasoning_effort
+    ),
+    JuiceLevel.DEEP: ModelConfig(
+        model="gemini/gemini-3-pro-preview",
+        temperature=1.0,
+        max_tokens=16000,
+        reasoning_effort="high",
+        supports_reasoning=True,  # Gemini supports reasoning_effort
+    ),
+    JuiceLevel.MAXIMUM: ModelConfig(
+        model="anthropic/claude-opus-4-5",
+        temperature=1.0,
+        max_tokens=16000,
+        reasoning_effort="high",
+        supports_reasoning=True,  # Anthropic supports reasoning_effort
+    ),
+}
+
+# For ULTIMATE level, we use all 4 models with highest thinking
+# Grok 4, GPT-5.1, Gemini 3, Opus 4.5
+# NOTE: Anthropic requires temperature=1 when using extended thinking
+ULTIMATE_MODELS: list[ModelConfig] = [
+    ModelConfig(
+        model="openrouter/x-ai/grok-4",
+        temperature=0.7,
+        max_tokens=8192,
+        supports_reasoning=True,  # Grok 4 supports reasoning
+    ),
+    ModelConfig(
+        model="openai/gpt-5.2",
+        temperature=1.0,
+        max_tokens=16000,
+        reasoning_effort="high",
+        supports_reasoning=True,  # OpenAI supports reasoning
+    ),
+    ModelConfig(
+        model="gemini/gemini-3-pro-preview",
+        temperature=1.0,
+        max_tokens=16000,
+        reasoning_effort="high",
+        supports_reasoning=True,  # Gemini supports reasoning
+    ),
+    ModelConfig(
+        model="anthropic/claude-opus-4-5",
+        temperature=1.0,  # Must be 1 for extended thinking
+        max_tokens=16000,
+        reasoning_effort="high",
+        supports_reasoning=True,  # Anthropic supports reasoning
+    ),
+]
+
+# Synthesizer model for ULTIMATE level (Gemini 3 synthesizes all responses)
+SYNTHESIZER_MODEL = ModelConfig(
+    model="gemini/gemini-3-pro-preview",
+    temperature=0.7,
+    max_tokens=32000,
+    reasoning_effort="high",
+    supports_reasoning=True,
+)
+
+
+def get_lm(
+    juice: JuiceLevel | int = JuiceLevel.QUICK,
+    reasoning: ReasoningLevel | int | None = None
+) -> dspy.LM:
+    """Get a dspy.LM configured for the specified juice and reasoning levels.
+
+    Args:
+        juice: Juice level (0-3). Level 4 (ULTIMATE) requires special handling.
+        reasoning: Optional reasoning level (0-5). If None, uses juice level's default.
+
+    Returns:
+        Configured dspy.LM instance.
+    """
+    if isinstance(juice, int):
+        juice = JuiceLevel(juice)
+
+    if juice == JuiceLevel.ULTIMATE:
+        raise ValueError("ULTIMATE juice level requires multi-model merger. Use JuicedProgram instead.")
+
+    config = JUICE_MODELS[juice]
+    kwargs = config.to_lm_kwargs()
+
+    # Apply reasoning level overrides if specified AND model supports reasoning
+    if reasoning is not None and config.supports_reasoning:
+        if isinstance(reasoning, int):
+            reasoning = ReasoningLevel(reasoning)
+
+        # Skip if reasoning is OFF
+        if reasoning == ReasoningLevel.OFF:
+            # Remove any existing reasoning params
+            kwargs.pop("reasoning_effort", None)
+            kwargs.pop("thinking", None)
+            return dspy.LM(**kwargs)
+
+        reasoning_config = REASONING_CONFIGS[reasoning]
+        model = config.model.lower()
+
+        # Determine provider and use appropriate parameter format
+        is_anthropic = "anthropic/" in model or "claude" in model
+        is_gemini = "gemini" in model
+        is_openai = "openai/" in model or "gpt" in model
+
+        # Apply temperature (required for Anthropic extended thinking)
+        if reasoning_config.get("temperature") is not None:
+            kwargs["temperature"] = reasoning_config["temperature"]
+
+        # Apply max_tokens
+        if reasoning_config.get("max_tokens") is not None:
+            kwargs["max_tokens"] = reasoning_config["max_tokens"]
+
+        if is_anthropic:
+            # Anthropic uses explicit thinking parameter with budget_tokens
+            budget = reasoning_config.get("budget_tokens", 1024)
+            kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}
+            # Remove reasoning_effort if present (not used for thinking)
+            kwargs.pop("reasoning_effort", None)
+        else:
+            # Other providers use reasoning_effort
+            if reasoning_config["reasoning_effort"] is not None:
+                kwargs["reasoning_effort"] = reasoning_config["reasoning_effort"]
+
+    return dspy.LM(**kwargs)
+
+
```
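A hedged usage sketch of `get_lm` with DSPy's context manager, following the same pattern `JuicedProgram._run_single` uses further down; the `Summarize` signature is illustrative only, not part of mrmd_ai:

```python
# Illustrative only: "Summarize" is a made-up signature for this sketch.
class Summarize(dspy.Signature):
    """Summarize the given text in one sentence."""
    text: str = dspy.InputField()
    summary: str = dspy.OutputField()

# DEEP juice -> Gemini 3 Pro; HIGH reasoning overrides temperature/max_tokens
# and sets reasoning_effort="high" per REASONING_CONFIGS.
lm = get_lm(JuiceLevel.DEEP, ReasoningLevel.HIGH)
with dspy.context(lm=lm):
    result = dspy.Predict(Summarize)(text="Juice levels trade cost for quality.")
print(result.summary)
```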
```diff
+class SynthesizeResponses(dspy.Signature):
+    """Synthesize multiple AI model responses into one optimal final answer.
+
+    You are given responses from multiple AI models for the same task.
+    Your job is to create the BEST possible response by:
+    1. Identifying the strongest elements from each model's response
+    2. Resolving any contradictions (prefer the most accurate/well-reasoned answer)
+    3. Combining complementary insights that don't conflict
+    4. Maintaining the original format and style expected for the task
+    5. Being concise - don't add unnecessary elaboration
+
+    For grammar/spelling fixes: Pick the most correct version, don't over-correct.
+    For text completion: Choose the most natural, coherent continuation.
+    For code: Select the cleanest, most idiomatic solution.
+    For lists: You may combine unique items if appropriate.
+
+    Output ONLY the synthesized response - no explanations or meta-commentary.
+    """
+
+    original_input: str = dspy.InputField(desc="The original input/task that was given to all models")
+    model_responses: str = dspy.InputField(desc="Responses from multiple AI models, each labeled with model name")
+    synthesized_response: str = dspy.OutputField(desc="The single best response, synthesized from all model outputs. Output ONLY the response content.")
+
+
```
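This signature is how `_merge_results` (further down) asks the synthesizer model to combine per-model outputs. A minimal sketch of that call, assuming two invented model responses:

```python
# Sketch of the synthesis step; the labeled response strings are invented examples.
synth_lm = dspy.LM(**SYNTHESIZER_MODEL.to_lm_kwargs())
responses_text = (
    "=== grok-4 ===\nThe meeting is at 3 PM on Tuesday.\n\n"
    "=== claude-opus-4-5 ===\nThe meeting is at 3 PM Tuesday."
)
with dspy.context(lm=synth_lm):
    synth = dspy.Predict(SynthesizeResponses)(
        original_input="text: When is the meeting?",
        model_responses=responses_text,
    )
print(synth.synthesized_response)  # single merged answer
```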
```diff
+class JuicedProgram:
+    """Wrapper that runs any DSPy program with configurable juice levels.
+
+    For levels 0-3, uses a single model with increasing capability.
+    For level 4 (ULTIMATE), runs all models in parallel and synthesizes.
+    """
+
+    def __init__(
+        self,
+        program: dspy.Module,
+        juice: JuiceLevel | int = JuiceLevel.QUICK,
+        reasoning: ReasoningLevel | int | None = None,
+        progress_callback: Callable[[str, dict], None] | None = None
+    ):
+        """Initialize a juiced program.
+
+        Args:
+            program: The DSPy program/module to wrap.
+            juice: Juice level (0-4).
+            reasoning: Optional reasoning level (0-5). If None, uses juice level's default.
+            progress_callback: Optional callback for progress events.
+                Called with (event_type, data) where event_type is:
+                - "status": General status update
+                - "model_start": A model is starting (ultimate mode)
+                - "model_complete": A model finished (ultimate mode)
+        """
+        self.program = program
+        self.juice = JuiceLevel(juice) if isinstance(juice, int) else juice
+        self.reasoning = ReasoningLevel(reasoning) if isinstance(reasoning, int) else reasoning
+        self.progress_callback = progress_callback
+
+    def _emit(self, event_type: str, data: dict):
+        """Emit a progress event if callback is set."""
+        if self.progress_callback:
+            self.progress_callback(event_type, data)
+
+    def __call__(self, **kwargs) -> Any:
+        """Run the program with the configured juice level."""
+        if self.juice == JuiceLevel.ULTIMATE:
+            return self._run_ultimate(**kwargs)
+        else:
+            return self._run_single(**kwargs)
+
+    def _run_single(self, **kwargs) -> Any:
+        """Run with a single model at the specified juice level."""
+        config = JUICE_MODELS[self.juice]
+        model_name = config.model.split("/")[-1]
+
+        reasoning_desc = ""
+        if self.reasoning is not None:
+            reasoning_desc = f" (reasoning={self.reasoning.name})"
+
+        self._emit("status", {
+            "step": "calling_model",
+            "model": model_name,
+            "model_full": config.model,
+            "reasoning_level": self.reasoning.value if self.reasoning else None,
+        })
+
+        lm = get_lm(self.juice, self.reasoning)
+        with dspy.context(lm=lm):
+            result = self.program(**kwargs)
+
+        self._emit("status", {
+            "step": "model_complete",
+            "model": model_name
+        })
+
+        return result
+
+    def _run_ultimate(self, **kwargs) -> Any:
+        """Run with all models in PARALLEL and merge results."""
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        import threading
+
+        # Track which models are running
+        model_names = [cfg.model.split("/")[-1] for cfg in ULTIMATE_MODELS]
+        models_status = {name: "pending" for name in model_names}
+        status_lock = threading.Lock()
+
+        self._emit("status", {
+            "step": "starting_multi_model",
+            "models": model_names,
+            "total": len(model_names),
+            "reasoning_level": self.reasoning.value if self.reasoning else None,
+        })
+
+        def run_model(config):
+            """Run a single model - called in parallel."""
+            lm_kwargs = config.to_lm_kwargs()
+
+            # Apply reasoning level overrides if specified AND model supports reasoning
+            if self.reasoning is not None and self.reasoning != ReasoningLevel.OFF and config.supports_reasoning:
+                reasoning_config = REASONING_CONFIGS[self.reasoning]
+                model = config.model.lower()
+
+                # Determine provider
+                is_anthropic = "anthropic/" in model or "claude" in model
+
+                # Apply temperature and max_tokens
+                if reasoning_config.get("temperature") is not None:
+                    lm_kwargs["temperature"] = reasoning_config["temperature"]
+                if reasoning_config.get("max_tokens") is not None:
+                    lm_kwargs["max_tokens"] = reasoning_config["max_tokens"]
+
+                if is_anthropic:
+                    # Anthropic uses thinking parameter with budget_tokens
+                    budget = reasoning_config.get("budget_tokens", 1024)
+                    lm_kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}
+                    lm_kwargs.pop("reasoning_effort", None)
+                else:
+                    # Other providers use reasoning_effort
+                    if reasoning_config["reasoning_effort"] is not None:
+                        lm_kwargs["reasoning_effort"] = reasoning_config["reasoning_effort"]
+
+            lm = dspy.LM(**lm_kwargs)
+            model_name = config.model.split("/")[-1]
+
+            # Emit model start
+            with status_lock:
+                models_status[model_name] = "running"
+                self._emit("model_start", {
+                    "model": model_name,
+                    "models_status": dict(models_status)
+                })
+
+            try:
+                with dspy.context(lm=lm):
+                    result = self.program(**kwargs)
+
+                # Extract response text from DSPy Prediction for streaming
+                response_data = {}
+                if hasattr(result, "_store") and result._store:
+                    response_data = dict(result._store)
+
+                # Emit model complete WITH the actual response
+                with status_lock:
+                    models_status[model_name] = "complete"
+                    self._emit("model_complete", {
+                        "model": model_name,
+                        "success": True,
+                        "models_status": dict(models_status),
+                        "response": response_data,  # Include actual response!
+                    })
+
+                return {"model": model_name, "result": result, "error": None}
+            except Exception as e:
+                # Emit model error
+                with status_lock:
+                    models_status[model_name] = "error"
+                    self._emit("model_complete", {
+                        "model": model_name,
+                        "success": False,
+                        "error": str(e),
+                        "models_status": dict(models_status),
+                        "response": None,
+                    })
+                return {"model": model_name, "result": None, "error": str(e)}
+
+        # Run all 4 models in parallel
+        model_results = []
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futures = [executor.submit(run_model, config) for config in ULTIMATE_MODELS]
+            for future in as_completed(futures):
+                model_results.append(future.result())
+
+        # Emit synthesizing status
+        self._emit("status", {
+            "step": "synthesizing",
+            "models_completed": len([r for r in model_results if r["result"] is not None])
+        })
+
+        # Merge results using AI synthesis
+        return self._merge_results(model_results, kwargs)
+
+    def _merge_results(self, model_results: list, original_input: dict) -> Any:
+        """Merge results from multiple models using AI synthesis.
+
+        Uses SYNTHESIZER_MODEL to intelligently combine responses from all models.
+        """
+        # Get successful results
+        successful = [r for r in model_results if r["result"] is not None]
+        if not successful:
+            # All failed - return error
+            errors = [r["error"] for r in model_results if r["error"]]
+            raise RuntimeError(f"All models failed: {errors}")
+
+        # If only one model succeeded, just return its result
+        if len(successful) == 1:
+            result = successful[0]["result"]
+            if hasattr(result, "_store"):
+                result._individual_responses = [{
+                    "model": successful[0]["model"],
+                    "response": str(result._store),
+                    "error": None
+                }]
+            return result
+
+        # Collect individual responses
+        individual_responses = []
+        model_outputs = {}  # model_name -> {field: value}
+
+        for r in model_results:
+            model_name = r["model"]
+            if r["result"] is not None and hasattr(r["result"], "_store"):
+                store = r["result"]._store
+                model_outputs[model_name] = dict(store)
+                # Get main output text for display
+                output_text = None
+                for key, value in store.items():
+                    if isinstance(value, str) and len(value) > 10:
+                        output_text = value
+                        break
+                individual_responses.append({
+                    "model": model_name,
+                    "response": output_text or str(store),
+                    "error": None
+                })
+            elif r["error"]:
+                individual_responses.append({
+                    "model": model_name,
+                    "response": None,
+                    "error": r["error"]
+                })
+
+        # Use first result as template for output fields
+        base_result = successful[0]["result"]
+        base_store = base_result._store if hasattr(base_result, "_store") else {}
+
+        # Format original input for synthesizer
+        input_text = self._format_input(original_input)
+
+        # Create synthesized result
+        merged = {}
+
+        # Configure synthesizer LM
+        synth_lm = dspy.LM(**SYNTHESIZER_MODEL.to_lm_kwargs())
+
+        # Synthesize each output field
+        for field_name, base_value in base_store.items():
+            # Collect this field's values from all models
+            field_values = {}
+            for model_name, outputs in model_outputs.items():
+                if field_name in outputs:
+                    field_values[model_name] = outputs[field_name]
+
+            if not field_values:
+                merged[field_name] = base_value
+                continue
+
+            # Check if it's a list field (like synonyms)
+            if isinstance(base_value, list):
+                # For lists, combine unique values from all models
+                combined = []
+                seen = set()
+                for model_name, values in field_values.items():
+                    if isinstance(values, list):
+                        for item in values:
+                            # Get hashable key for deduplication
+                            # Pydantic models aren't hashable, so convert to JSON
+                            try:
+                                if hasattr(item, 'model_dump_json'):
+                                    # Pydantic v2 model
+                                    item_key = item.model_dump_json()
+                                elif hasattr(item, 'json'):
+                                    # Pydantic v1 model
+                                    item_key = item.json()
+                                else:
+                                    # Regular hashable item
+                                    item_key = item
+                            except TypeError:
+                                # Fallback: convert to string representation
+                                item_key = str(item)
+
+                            if item_key not in seen:
+                                combined.append(item)
+                                seen.add(item_key)
+                merged[field_name] = combined
+            else:
+                # For string/text fields, use AI synthesis
+                responses_text = "\n\n".join([
+                    f"=== {model_name} ===\n{value}"
+                    for model_name, value in field_values.items()
+                ])
+
+                self._emit("status", {
+                    "step": "synthesizing_field",
+                    "field": field_name,
+                    "model": SYNTHESIZER_MODEL.model.split("/")[-1]
+                })
+
+                try:
+                    with dspy.context(lm=synth_lm):
+                        predictor = dspy.Predict(SynthesizeResponses)
+                        synth_result = predictor(
+                            original_input=input_text,
+                            model_responses=responses_text
+                        )
+                    merged[field_name] = synth_result.synthesized_response
+                except Exception as e:
+                    # Fallback to first model's response on synthesis error
+                    print(f"[Synthesis] Error synthesizing {field_name}: {e}")
+                    merged[field_name] = base_value
+
+        # Return a result object with merged data
+        class MergedResult:
+            pass
+
+        result = MergedResult()
+        for key, value in merged.items():
+            setattr(result, key, value)
+        result._store = merged  # For extract_result in server.py
+        result._individual_responses = individual_responses  # For UI display
+        result._synthesized = True  # Mark as AI-synthesized
+
+        return result
+
+    def _format_input(self, kwargs: dict) -> str:
+        """Format input kwargs as a readable string."""
+        parts = []
+        for key, value in kwargs.items():
+            parts.append(f"{key}: {value}")
+        return "\n".join(parts)
+
+
```
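A usage sketch of `JuicedProgram` with a progress callback; the wrapped `dspy.Predict` program, the callback body, and the `Summarize` signature (from the earlier sketch) are illustrative assumptions, not package code:

```python
# Illustrative wrapper usage; "Summarize" is the made-up signature from above.
def on_progress(event_type: str, data: dict) -> None:
    # event_type is "status", "model_start", or "model_complete"
    print(f"[{event_type}] {data.get('model', data.get('step'))}")

juiced_summarize = JuicedProgram(
    dspy.Predict(Summarize),
    juice=JuiceLevel.ULTIMATE,        # fan out to all ULTIMATE_MODELS in parallel
    reasoning=ReasoningLevel.MEDIUM,  # per-provider thinking/effort overrides
    progress_callback=on_progress,
)
result = juiced_summarize(text="Juice levels trade cost for quality.")
print(result.summary)  # synthesized field on the merged result
```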
```diff
+def juiced(juice: JuiceLevel | int = JuiceLevel.QUICK):
+    """Decorator to run a DSPy program with a specific juice level.
+
+    Usage:
+        @juiced(JuiceLevel.DEEP)
+        def my_program():
+            return dspy.ChainOfThought(MySignature)
+    """
+    def decorator(func: Callable) -> Callable:
+        def wrapper(*args, **kwargs):
+            program = func(*args, **kwargs)
+            return JuicedProgram(program, juice)
+        return wrapper
+    return decorator
+
+
+def run_with_juice(
+    program: dspy.Module,
+    juice: JuiceLevel | int,
+    reasoning: ReasoningLevel | int | None = None,
+    **kwargs
+) -> Any:
+    """Convenience function to run a program with a specific juice level.
+
+    Args:
+        program: The DSPy program to run.
+        juice: Juice level (0-4).
+        reasoning: Optional reasoning level (0-5). If None, uses juice level's default.
+        **kwargs: Arguments to pass to the program.
+
+    Returns:
+        The program result.
+    """
+    juiced_program = JuicedProgram(program, juice, reasoning=reasoning)
+    return juiced_program(**kwargs)
+
+
```
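And the one-shot convenience form, equivalent to constructing `JuicedProgram` directly (again using the illustrative `Summarize` signature):

```python
# One-off call at BALANCED juice with no reasoning override.
result = run_with_juice(
    dspy.Predict(Summarize),
    juice=JuiceLevel.BALANCED,
    text="Juice levels trade cost for quality.",
)
print(result.summary)
```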
```diff
+# Juice level descriptions for CLI/UI
+JUICE_DESCRIPTIONS = {
+    JuiceLevel.QUICK: "Quick (Kimi K2) - Fast & cheap",
+    JuiceLevel.BALANCED: "Balanced (Sonnet 4.5) - Good quality",
+    JuiceLevel.DEEP: "Deep (Gemini 3 thinking) - Thorough reasoning",
+    JuiceLevel.MAXIMUM: "Maximum (Opus 4.5 thinking) - Best single model",
+    JuiceLevel.ULTIMATE: "Ultimate (Multi-model merger) - All models synthesized",
+}
```
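For a CLI or UI, the two description dicts can be rendered directly as level menus; a small sketch:

```python
# Print selectable juice and reasoning levels, e.g. for a help screen.
for level, desc in JUICE_DESCRIPTIONS.items():
    print(f"  juice {level.value}: {desc}")
for level, desc in REASONING_DESCRIPTIONS.items():
    print(f"  reasoning {level.value}: {desc}")
```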