@every-env/compound-plugin 0.3.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{plugins/compound-engineering → .claude}/commands/release-docs.md +0 -1
- package/.claude-plugin/marketplace.json +2 -2
- package/.github/workflows/ci.yml +1 -1
- package/.github/workflows/deploy-docs.yml +3 -3
- package/.github/workflows/publish.yml +37 -0
- package/README.md +12 -3
- package/docs/index.html +13 -13
- package/docs/pages/changelog.html +39 -0
- package/docs/plans/2026-02-08-feat-convert-local-md-settings-for-opencode-codex-plan.md +143 -0
- package/docs/plans/2026-02-08-feat-simplify-plugin-settings-plan.md +195 -0
- package/docs/plans/2026-02-09-refactor-dspy-ruby-skill-update-plan.md +104 -0
- package/docs/plans/2026-02-12-feat-add-cursor-cli-target-provider-plan.md +306 -0
- package/docs/specs/cursor.md +85 -0
- package/package.json +1 -1
- package/plugins/compound-engineering/.claude-plugin/plugin.json +2 -2
- package/plugins/compound-engineering/CHANGELOG.md +38 -0
- package/plugins/compound-engineering/README.md +5 -3
- package/plugins/compound-engineering/commands/workflows/brainstorm.md +6 -1
- package/plugins/compound-engineering/commands/workflows/compound.md +1 -0
- package/plugins/compound-engineering/commands/workflows/review.md +23 -21
- package/plugins/compound-engineering/commands/workflows/work.md +29 -15
- package/plugins/compound-engineering/skills/dspy-ruby/SKILL.md +539 -396
- package/plugins/compound-engineering/skills/dspy-ruby/assets/config-template.rb +159 -331
- package/plugins/compound-engineering/skills/dspy-ruby/assets/module-template.rb +210 -236
- package/plugins/compound-engineering/skills/dspy-ruby/assets/signature-template.rb +173 -95
- package/plugins/compound-engineering/skills/dspy-ruby/references/core-concepts.md +552 -143
- package/plugins/compound-engineering/skills/dspy-ruby/references/observability.md +366 -0
- package/plugins/compound-engineering/skills/dspy-ruby/references/optimization.md +440 -460
- package/plugins/compound-engineering/skills/dspy-ruby/references/providers.md +305 -225
- package/plugins/compound-engineering/skills/dspy-ruby/references/toolsets.md +502 -0
- package/plugins/compound-engineering/skills/setup/SKILL.md +168 -0
- package/src/commands/convert.ts +10 -5
- package/src/commands/install.ts +18 -10
- package/src/converters/claude-to-codex.ts +7 -2
- package/src/converters/claude-to-cursor.ts +166 -0
- package/src/converters/claude-to-droid.ts +174 -0
- package/src/converters/claude-to-opencode.ts +8 -2
- package/src/targets/cursor.ts +48 -0
- package/src/targets/droid.ts +50 -0
- package/src/targets/index.ts +18 -0
- package/src/types/cursor.ts +29 -0
- package/src/types/droid.ts +20 -0
- package/tests/cli.test.ts +62 -0
- package/tests/codex-converter.test.ts +62 -0
- package/tests/converter.test.ts +61 -0
- package/tests/cursor-converter.test.ts +347 -0
- package/tests/cursor-writer.test.ts +137 -0
- package/tests/droid-converter.test.ts +277 -0
- package/tests/droid-writer.test.ts +100 -0
- package/plugins/compound-engineering/commands/technical_review.md +0 -8
|
@@ -1,623 +1,603 @@
|
|
|
1
|
-
# DSPy.rb
|
|
1
|
+
# DSPy.rb Optimization
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## MIPROv2
|
|
4
4
|
|
|
5
|
-
DSPy.rb
|
|
5
|
+
MIPROv2 (Multiprompt Instruction PRoposal Optimizer, version 2) is the primary instruction tuner in DSPy.rb. It proposes new instructions and few-shot demonstrations per predictor, evaluates them on mini-batches, and retains candidates that improve the metric. It ships as a separate gem to keep the Gaussian Process dependency tree out of apps that do not need it.
|
|
6
6
|
|
|
7
|
-
###
|
|
7
|
+
### Installation
|
|
8
8
|
|
|
9
9
|
```ruby
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
RSpec.describe EmailClassifier do
|
|
14
|
-
before do
|
|
15
|
-
DSPy.configure do |c|
|
|
16
|
-
c.lm = DSPy::LM.new('openai/gpt-4o-mini', api_key: ENV['OPENAI_API_KEY'])
|
|
17
|
-
end
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
describe '#classify' do
|
|
21
|
-
it 'classifies technical support emails correctly' do
|
|
22
|
-
classifier = EmailClassifier.new
|
|
23
|
-
result = classifier.forward(
|
|
24
|
-
email_subject: "Can't log in",
|
|
25
|
-
email_body: "I'm unable to access my account"
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
expect(result[:category]).to eq('Technical')
|
|
29
|
-
expect(result[:priority]).to be_in(['High', 'Medium', 'Low'])
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
end
|
|
10
|
+
# Gemfile
|
|
11
|
+
gem "dspy"
|
|
12
|
+
gem "dspy-miprov2"
|
|
33
13
|
```
|
|
34
14
|
|
|
35
|
-
|
|
15
|
+
Bundler auto-requires `dspy/miprov2`. No additional `require` statement is needed.
|
|
16
|
+
|
|
17
|
+
### AutoMode presets
|
|
36
18
|
|
|
37
|
-
|
|
19
|
+
Use `DSPy::Teleprompt::MIPROv2::AutoMode` for preconfigured optimizers:
|
|
38
20
|
|
|
39
21
|
```ruby
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
mock_predictor = instance_double(DSPy::Predict)
|
|
44
|
-
allow(mock_predictor).to receive(:forward).and_return({
|
|
45
|
-
category: 'Technical',
|
|
46
|
-
priority: 'High',
|
|
47
|
-
confidence: 0.95
|
|
48
|
-
})
|
|
49
|
-
|
|
50
|
-
# Inject mock into your module
|
|
51
|
-
module_instance = MyModule.new
|
|
52
|
-
module_instance.instance_variable_set(:@predictor, mock_predictor)
|
|
53
|
-
|
|
54
|
-
result = module_instance.forward(input: 'test data')
|
|
55
|
-
expect(result[:category]).to eq('Technical')
|
|
56
|
-
end
|
|
57
|
-
end
|
|
22
|
+
light = DSPy::Teleprompt::MIPROv2::AutoMode.light(metric: metric) # 6 trials, greedy
|
|
23
|
+
medium = DSPy::Teleprompt::MIPROv2::AutoMode.medium(metric: metric) # 12 trials, adaptive
|
|
24
|
+
heavy = DSPy::Teleprompt::MIPROv2::AutoMode.heavy(metric: metric) # 18 trials, Bayesian
|
|
58
25
|
```
|
|
59
26
|
|
|
60
|
-
|
|
27
|
+
| Preset | Trials | Strategy | Use case |
|
|
28
|
+
|----------|--------|------------|-----------------------------------------------------|
|
|
29
|
+
| `light` | 6 | `:greedy` | Quick wins on small datasets or during prototyping. |
|
|
30
|
+
| `medium` | 12 | `:adaptive`| Balanced exploration vs. runtime for most pilots. |
|
|
31
|
+
| `heavy` | 18 | `:bayesian`| Highest accuracy targets or multi-stage programs. |
|
|
32
|
+
|
|
33
|
+
### Manual configuration with dry-configurable
|
|
34
|
+
|
|
35
|
+
`DSPy::Teleprompt::MIPROv2` includes `Dry::Configurable`. Configure at the class level (defaults for all instances) or instance level (overrides class defaults).
|
|
61
36
|
|
|
62
|
-
|
|
37
|
+
**Class-level defaults:**
|
|
63
38
|
|
|
64
39
|
```ruby
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
email_subject: 'Test',
|
|
72
|
-
email_body: 'Test body'
|
|
73
|
-
)
|
|
74
|
-
expect(result[:category]).to be_a(String)
|
|
40
|
+
DSPy::Teleprompt::MIPROv2.configure do |config|
|
|
41
|
+
config.optimization_strategy = :bayesian
|
|
42
|
+
config.num_trials = 30
|
|
43
|
+
config.bootstrap_sets = 10
|
|
44
|
+
end
|
|
45
|
+
```
|
|
75
46
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
47
|
+
**Instance-level overrides:**
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
optimizer = DSPy::Teleprompt::MIPROv2.new(metric: metric)
|
|
51
|
+
optimizer.configure do |config|
|
|
52
|
+
config.num_trials = 15
|
|
53
|
+
config.num_instruction_candidates = 6
|
|
54
|
+
config.bootstrap_sets = 5
|
|
55
|
+
config.max_bootstrapped_examples = 4
|
|
56
|
+
config.max_labeled_examples = 16
|
|
57
|
+
config.optimization_strategy = :adaptive # :greedy, :adaptive, :bayesian
|
|
58
|
+
config.early_stopping_patience = 3
|
|
59
|
+
config.init_temperature = 1.0
|
|
60
|
+
config.final_temperature = 0.1
|
|
61
|
+
config.minibatch_size = nil # nil = auto
|
|
62
|
+
config.auto_seed = 42
|
|
82
63
|
end
|
|
83
64
|
```
|
|
84
65
|
|
|
85
|
-
|
|
66
|
+
The `optimization_strategy` setting accepts symbols (`:greedy`, `:adaptive`, `:bayesian`) and coerces them internally to `DSPy::Teleprompt::OptimizationStrategy` T::Enum values.
|
|
67
|
+
|
|
68
|
+
The old `config:` constructor parameter of `DSPy::Teleprompt::MIPROv2` is removed. Passing `config:` to `MIPROv2.new` raises `ArgumentError`; use the `configure` block instead.
|
|
86
69
|
|
|
87
|
-
|
|
70
|
+
### Auto presets via configure
|
|
71
|
+
|
|
72
|
+
Instead of `AutoMode`, set the preset through the configure block:
|
|
88
73
|
|
|
89
74
|
```ruby
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
email_body: ''
|
|
96
|
-
)
|
|
97
|
-
# Define expected behavior for edge case
|
|
98
|
-
expect(result[:category]).to eq('General')
|
|
99
|
-
end
|
|
75
|
+
optimizer = DSPy::Teleprompt::MIPROv2.new(metric: metric)
|
|
76
|
+
optimizer.configure do |config|
|
|
77
|
+
config.auto_preset = DSPy::Teleprompt::AutoPreset.deserialize("medium")
|
|
78
|
+
end
|
|
79
|
+
```
|
|
100
80
|
|
|
101
|
-
|
|
102
|
-
long_body = 'word ' * 10000
|
|
103
|
-
classifier = EmailClassifier.new
|
|
81
|
+
### Compile and inspect
|
|
104
82
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
email_subject: 'Test',
|
|
108
|
-
email_body: long_body
|
|
109
|
-
)
|
|
110
|
-
}.not_to raise_error
|
|
111
|
-
end
|
|
83
|
+
```ruby
|
|
84
|
+
program = DSPy::Predict.new(MySignature)
|
|
112
85
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
)
|
|
86
|
+
result = optimizer.compile(
|
|
87
|
+
program,
|
|
88
|
+
trainset: train_examples,
|
|
89
|
+
valset: val_examples
|
|
90
|
+
)
|
|
119
91
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
end
|
|
92
|
+
optimized_program = result.optimized_program
|
|
93
|
+
puts "Best score: #{result.best_score_value}"
|
|
123
94
|
```
|
|
124
95
|
|
|
125
|
-
|
|
96
|
+
The `result` object exposes:
|
|
97
|
+
- `optimized_program` -- ready-to-use predictor with updated instruction and demos.
|
|
98
|
+
- `optimization_trace[:trial_logs]` -- per-trial record of instructions, demos, and scores.
|
|
99
|
+
- `metadata[:optimizer]` -- `"MIPROv2"`, useful when persisting experiments from multiple optimizers.
|
|
100
|
+
|
|
101
|
+
### Multi-stage programs
|
|
102
|
+
|
|
103
|
+
MIPROv2 generates dataset summaries for each predictor and proposes per-stage instructions. For a ReAct agent with `thought_generator` and `observation_processor` predictors, the optimizer handles credit assignment internally. The metric only needs to evaluate the final output.
|
|
104
|
+
|
|
105
|
+
### Bootstrap sampling
|
|
106
|
+
|
|
107
|
+
During the bootstrap phase, MIPROv2:
|
|
108
|
+
1. Generates dataset summaries from the training set.
|
|
109
|
+
2. Bootstraps few-shot demonstrations by running the baseline program.
|
|
110
|
+
3. Proposes candidate instructions grounded in the summaries and bootstrapped examples.
|
|
111
|
+
4. Evaluates each candidate on mini-batches drawn from the validation set.
|
|
126
112
|
|
|
127
|
-
|
|
113
|
+
Control the bootstrap phase with `bootstrap_sets`, `max_bootstrapped_examples`, and `max_labeled_examples`.
|
|
114
|
+
|
|
115
|
+
### Bayesian optimization
|
|
116
|
+
|
|
117
|
+
When `optimization_strategy` is `:bayesian` (or when using the `heavy` preset), MIPROv2 fits a Gaussian Process surrogate over past trial scores to select the next candidate. This replaces random search with informed exploration, reducing the number of trials needed to find high-scoring instructions.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## GEPA
|
|
122
|
+
|
|
123
|
+
GEPA (Genetic-Pareto Reflective Prompt Evolution) is a feedback-driven optimizer. It runs the program on a small batch, collects scores and textual feedback, and asks a reflection LM to rewrite the instruction. Improved candidates are retained on a Pareto frontier.
|
|
124
|
+
|
|
125
|
+
### Installation
|
|
128
126
|
|
|
129
127
|
```ruby
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
128
|
+
# Gemfile
|
|
129
|
+
gem "dspy"
|
|
130
|
+
gem "dspy-gepa"
|
|
131
|
+
```
|
|
133
132
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
133
|
+
The `dspy-gepa` gem depends on the `gepa` core optimizer gem automatically.
|
|
134
|
+
|
|
135
|
+
### Metric contract
|
|
136
|
+
|
|
137
|
+
GEPA metrics return a `DSPy::Prediction` carrying both a numeric score and a feedback string. Do not return a plain boolean or a bare number.
|
|
138
138
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
139
|
+
```ruby
|
|
140
|
+
metric = lambda do |example, prediction|
|
|
141
|
+
expected = example.expected_values[:label]
|
|
142
|
+
predicted = prediction.label
|
|
143
|
+
|
|
144
|
+
score = predicted == expected ? 1.0 : 0.0
|
|
145
|
+
feedback = if score == 1.0
|
|
146
|
+
"Correct (#{expected}) for: \"#{example.input_values[:text][0..60]}\""
|
|
147
|
+
else
|
|
148
|
+
"Misclassified (expected #{expected}, got #{predicted}) for: \"#{example.input_values[:text][0..60]}\""
|
|
144
149
|
end
|
|
150
|
+
|
|
151
|
+
DSPy::Prediction.new(score: score, feedback: feedback)
|
|
145
152
|
end
|
|
146
153
|
```
|
|
147
154
|
|
|
148
|
-
|
|
155
|
+
Keep the score in `[0, 1]`. Always include a short feedback message explaining what happened -- GEPA hands this text to the reflection model so it can reason about failures.
|
|
156
|
+
|
|
157
|
+
### Feedback maps
|
|
149
158
|
|
|
150
|
-
|
|
159
|
+
`feedback_map` targets individual predictors inside a composite module. Each entry receives keyword arguments and returns a `DSPy::Prediction`:
|
|
151
160
|
|
|
152
161
|
```ruby
|
|
153
|
-
|
|
162
|
+
feedback_map = {
|
|
163
|
+
'self' => lambda do |predictor_output:, predictor_inputs:, module_inputs:, module_outputs:, captured_trace:|
|
|
164
|
+
expected = module_inputs.expected_values[:label]
|
|
165
|
+
predicted = predictor_output.label
|
|
166
|
+
|
|
167
|
+
DSPy::Prediction.new(
|
|
168
|
+
score: predicted == expected ? 1.0 : 0.0,
|
|
169
|
+
feedback: "Classifier saw \"#{predictor_inputs[:text][0..80]}\" -> #{predicted} (expected #{expected})"
|
|
170
|
+
)
|
|
171
|
+
end
|
|
172
|
+
}
|
|
173
|
+
```
|
|
154
174
|
|
|
155
|
-
|
|
156
|
-
config.cassette_library_dir = 'spec/vcr_cassettes'
|
|
157
|
-
config.hook_into :webmock
|
|
158
|
-
config.filter_sensitive_data('<OPENAI_API_KEY>') { ENV['OPENAI_API_KEY'] }
|
|
159
|
-
end
|
|
175
|
+
For single-predictor programs, key the map with `'self'`. For multi-predictor chains, add entries per component so the reflection LM sees localized context at each step. Omit `feedback_map` entirely if the top-level metric already covers the basics.
|
|
160
176
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
177
|
+
### Configuring the teleprompter
|
|
178
|
+
|
|
179
|
+
```ruby
|
|
180
|
+
teleprompter = DSPy::Teleprompt::GEPA.new(
|
|
181
|
+
metric: metric,
|
|
182
|
+
reflection_lm: DSPy::ReflectionLM.new('openai/gpt-4o-mini', api_key: ENV['OPENAI_API_KEY']),
|
|
183
|
+
feedback_map: feedback_map,
|
|
184
|
+
config: {
|
|
185
|
+
max_metric_calls: 600,
|
|
186
|
+
minibatch_size: 6,
|
|
187
|
+
skip_perfect_score: false
|
|
188
|
+
}
|
|
189
|
+
)
|
|
174
190
|
```
|
|
175
191
|
|
|
176
|
-
|
|
192
|
+
Key configuration knobs:
|
|
193
|
+
|
|
194
|
+
| Knob | Purpose |
|
|
195
|
+
|----------------------|-------------------------------------------------------------------------------------------|
|
|
196
|
+
| `max_metric_calls` | Hard budget on evaluation calls. Set to at least the validation set size plus a few minibatches. |
|
|
197
|
+
| `minibatch_size` | Examples per reflective replay batch. Smaller = cheaper iterations, noisier scores. |
|
|
198
|
+
| `skip_perfect_score` | Set `true` to stop early when a candidate reaches score `1.0`. |
|
|
177
199
|
|
|
178
|
-
|
|
200
|
+
### Minibatch sizing
|
|
179
201
|
|
|
180
|
-
|
|
202
|
+
| Goal | Suggested size | Rationale |
|
|
203
|
+
|-------------------------------------------------|----------------|------------------------------------------------------------|
|
|
204
|
+
| Explore many candidates within a tight budget | 3--6 | Cheap iterations, more prompt variants, noisier metrics. |
|
|
205
|
+
| Stable metrics when each rollout is costly | 8--12 | Smoother scores, fewer candidates unless budget is raised. |
|
|
206
|
+
| Investigate specific failure modes | 3--4 then 8+ | Start with breadth, increase once patterns emerge. |
|
|
181
207
|
|
|
182
|
-
|
|
208
|
+
### Compile and evaluate
|
|
183
209
|
|
|
184
210
|
```ruby
|
|
185
|
-
|
|
211
|
+
program = DSPy::Predict.new(MySignature)
|
|
186
212
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def initialize
|
|
190
|
-
super
|
|
191
|
-
@predictor = DSPy::ChainOfThought.new(EmailClassificationSignature)
|
|
192
|
-
end
|
|
213
|
+
result = teleprompter.compile(program, trainset: train, valset: val)
|
|
214
|
+
optimized_program = result.optimized_program
|
|
193
215
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
end
|
|
197
|
-
end
|
|
216
|
+
test_metrics = evaluate(optimized_program, test)
|
|
217
|
+
```
|
|
198
218
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
expected_output: { category: 'Technical', priority: 'High' }
|
|
204
|
-
},
|
|
205
|
-
{
|
|
206
|
-
input: { email_subject: "Billing question", email_body: "How much does premium cost?" },
|
|
207
|
-
expected_output: { category: 'Billing', priority: 'Medium' }
|
|
208
|
-
},
|
|
209
|
-
# Add more examples...
|
|
210
|
-
]
|
|
211
|
-
|
|
212
|
-
# Define evaluation metric
|
|
213
|
-
def accuracy_metric(example, prediction)
|
|
214
|
-
(example[:expected_output][:category] == prediction[:category]) ? 1.0 : 0.0
|
|
215
|
-
end
|
|
219
|
+
The `result` object exposes:
|
|
220
|
+
- `optimized_program` -- predictor with updated instruction and few-shot examples.
|
|
221
|
+
- `best_score_value` -- validation score for the best candidate.
|
|
222
|
+
- `metadata` -- candidate counts, trace hashes, and telemetry IDs.
|
|
216
223
|
|
|
217
|
-
|
|
218
|
-
optimizer = DSPy::MIPROv2.new(
|
|
219
|
-
metric: method(:accuracy_metric),
|
|
220
|
-
num_candidates: 10,
|
|
221
|
-
num_threads: 4
|
|
222
|
-
)
|
|
224
|
+
### Reflection LM
|
|
223
225
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
)
|
|
226
|
+
Swap `DSPy::ReflectionLM` for any callable object that accepts the reflection prompt hash and returns a string. The default reflection signature extracts the new instruction from triple backticks in the response.
|
|
227
|
+
|
|
228
|
+
### Experiment tracking
|
|
228
229
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
230
|
+
Plug `GEPA::Logging::ExperimentTracker` into a persistence layer:
|
|
231
|
+
|
|
232
|
+
```ruby
|
|
233
|
+
tracker = GEPA::Logging::ExperimentTracker.new
|
|
234
|
+
tracker.with_subscriber { |event| MyModel.create!(payload: event) }
|
|
235
|
+
|
|
236
|
+
teleprompter = DSPy::Teleprompt::GEPA.new(
|
|
237
|
+
metric: metric,
|
|
238
|
+
reflection_lm: reflection_lm,
|
|
239
|
+
experiment_tracker: tracker,
|
|
240
|
+
config: { max_metric_calls: 900 }
|
|
233
241
|
)
|
|
234
242
|
```
|
|
235
243
|
|
|
236
|
-
|
|
244
|
+
The tracker emits Pareto update events, merge decisions, and candidate evolution records as JSONL.
|
|
237
245
|
|
|
238
|
-
|
|
246
|
+
### Pareto frontier
|
|
239
247
|
|
|
240
|
-
|
|
241
|
-
require 'dspy/teleprompt'
|
|
248
|
+
GEPA maintains a diverse candidate pool and samples from the Pareto frontier instead of mutating only the top-scoring program. This balances exploration and prevents the search from collapsing onto a single lineage.
|
|
242
249
|
|
|
243
|
-
|
|
244
|
-
teleprompter = DSPy::BootstrapFewShot.new(
|
|
245
|
-
metric: method(:accuracy_metric),
|
|
246
|
-
max_bootstrapped_demos: 5,
|
|
247
|
-
max_labeled_demos: 3
|
|
248
|
-
)
|
|
250
|
+
Enable the merge proposer after multiple strong lineages emerge:
|
|
249
251
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
252
|
+
```ruby
|
|
253
|
+
config: {
|
|
254
|
+
max_metric_calls: 900,
|
|
255
|
+
enable_merge_proposer: true
|
|
256
|
+
}
|
|
255
257
|
```
|
|
256
258
|
|
|
257
|
-
|
|
259
|
+
Premature merges consume budget without meaningful gains. Enable merging only once several validated candidates exist.
|
|
258
260
|
|
|
259
|
-
|
|
261
|
+
### Advanced options
|
|
260
262
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
263
|
+
- `acceptance_strategy:` -- plug in bespoke Pareto filters or early-stop heuristics.
|
|
264
|
+
- Telemetry spans emit via `GEPA::Telemetry`. Enable global observability with `DSPy.configure { |c| c.observability = true }` to stream spans to an OpenTelemetry exporter.
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Evaluation Framework
|
|
264
269
|
|
|
265
|
-
|
|
266
|
-
score += 0.6 if example[:expected_output][:category] == prediction[:category]
|
|
270
|
+
`DSPy::Evals` provides batch evaluation of predictors against test datasets with built-in and custom metrics.
|
|
267
271
|
|
|
268
|
-
|
|
269
|
-
score += 0.4 if example[:expected_output][:priority] == prediction[:priority]
|
|
272
|
+
### Basic usage
|
|
270
273
|
|
|
271
|
-
|
|
274
|
+
```ruby
|
|
275
|
+
metric = proc do |example, prediction|
|
|
276
|
+
prediction.answer == example.expected_values[:answer]
|
|
272
277
|
end
|
|
273
278
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
279
|
+
evaluator = DSPy::Evals.new(predictor, metric: metric)
|
|
280
|
+
|
|
281
|
+
result = evaluator.evaluate(
|
|
282
|
+
test_examples,
|
|
283
|
+
display_table: true,
|
|
284
|
+
display_progress: true
|
|
278
285
|
)
|
|
286
|
+
|
|
287
|
+
puts "Pass rate: #{(result.pass_rate * 100).round(1)}%"
|
|
288
|
+
puts "Passed: #{result.passed_examples}/#{result.total_examples}"
|
|
279
289
|
```
|
|
280
290
|
|
|
281
|
-
###
|
|
291
|
+
### DSPy::Example
|
|
282
292
|
|
|
283
|
-
|
|
293
|
+
Convert raw data into `DSPy::Example` instances before passing to optimizers or evaluators. Each example carries `input_values` and `expected_values`:
|
|
284
294
|
|
|
285
295
|
```ruby
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
end
|
|
292
|
-
|
|
293
|
-
def forward(input)
|
|
294
|
-
@predictor.forward(input)
|
|
295
|
-
end
|
|
296
|
+
examples = rows.map do |row|
|
|
297
|
+
DSPy::Example.new(
|
|
298
|
+
input_values: { text: row[:text] },
|
|
299
|
+
expected_values: { label: row[:label] }
|
|
300
|
+
)
|
|
296
301
|
end
|
|
297
302
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
def initialize
|
|
301
|
-
super
|
|
302
|
-
@predictor = DSPy::ReAct.new(
|
|
303
|
-
EmailClassificationSignature,
|
|
304
|
-
tools: [KnowledgeBaseTool.new]
|
|
305
|
-
)
|
|
306
|
-
end
|
|
303
|
+
train, val, test = split_examples(examples, train_ratio: 0.6, val_ratio: 0.2, seed: 42)
|
|
304
|
+
```
|
|
307
305
|
|
|
308
|
-
|
|
309
|
-
@predictor.forward(input)
|
|
310
|
-
end
|
|
311
|
-
end
|
|
306
|
+
Hold back a test set from the optimization loop. Optimizers work on train/val; only the test set proves generalization.
|
|
312
307
|
|
|
313
|
-
|
|
314
|
-
def evaluate_approach(approach_class, test_set)
|
|
315
|
-
approach = approach_class.new
|
|
316
|
-
scores = test_set.map do |example|
|
|
317
|
-
prediction = approach.forward(example[:input])
|
|
318
|
-
accuracy_metric(example, prediction)
|
|
319
|
-
end
|
|
320
|
-
scores.sum / scores.size
|
|
321
|
-
end
|
|
308
|
+
### Built-in metrics
|
|
322
309
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
puts "Approach A accuracy: #{approach_a_score}"
|
|
327
|
-
puts "Approach B accuracy: #{approach_b_score}"
|
|
328
|
-
```
|
|
310
|
+
```ruby
|
|
311
|
+
# Exact match -- prediction must exactly equal expected value
|
|
312
|
+
metric = DSPy::Metrics.exact_match(field: :answer, case_sensitive: true)
|
|
329
313
|
|
|
330
|
-
|
|
314
|
+
# Contains -- prediction must contain expected substring
|
|
315
|
+
metric = DSPy::Metrics.contains(field: :answer, case_sensitive: false)
|
|
331
316
|
|
|
332
|
-
|
|
317
|
+
# Numeric difference -- numeric output within tolerance
|
|
318
|
+
metric = DSPy::Metrics.numeric_difference(field: :answer, tolerance: 0.01)
|
|
333
319
|
|
|
334
|
-
|
|
320
|
+
# Composite AND -- all sub-metrics must pass
|
|
321
|
+
metric = DSPy::Metrics.composite_and(
|
|
322
|
+
DSPy::Metrics.exact_match(field: :answer),
|
|
323
|
+
DSPy::Metrics.contains(field: :reasoning)
|
|
324
|
+
)
|
|
325
|
+
```
|
|
335
326
|
|
|
336
|
-
|
|
327
|
+
### Custom metrics
|
|
337
328
|
|
|
338
329
|
```ruby
|
|
339
|
-
|
|
340
|
-
|
|
330
|
+
quality_metric = lambda do |example, prediction|
|
|
331
|
+
return false unless prediction
|
|
341
332
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
333
|
+
score = 0.0
|
|
334
|
+
score += 0.5 if prediction.answer == example.expected_values[:answer]
|
|
335
|
+
score += 0.3 if prediction.explanation && prediction.explanation.length > 50
|
|
336
|
+
score += 0.2 if prediction.confidence && prediction.confidence > 0.8
|
|
337
|
+
score >= 0.7
|
|
346
338
|
end
|
|
347
339
|
|
|
348
|
-
|
|
349
|
-
predictor = DSPy::Predict.new(MySignature)
|
|
350
|
-
result = predictor.forward(input: 'data')
|
|
351
|
-
# Traces are automatically sent to your OpenTelemetry collector
|
|
340
|
+
evaluator = DSPy::Evals.new(predictor, metric: quality_metric)
|
|
352
341
|
```
|
|
353
342
|
|
|
354
|
-
|
|
343
|
+
Access prediction fields with dot notation (`prediction.answer`), not hash notation.
|
|
344
|
+
|
|
345
|
+
### Observability hooks
|
|
355
346
|
|
|
356
|
-
|
|
347
|
+
Register callbacks without editing the evaluator:
|
|
357
348
|
|
|
358
349
|
```ruby
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
#
|
|
362
|
-
DSPy.configure do |c|
|
|
363
|
-
c.lm = DSPy::LM.new('openai/gpt-4o-mini', api_key: ENV['OPENAI_API_KEY'])
|
|
364
|
-
c.langfuse = {
|
|
365
|
-
public_key: ENV['LANGFUSE_PUBLIC_KEY'],
|
|
366
|
-
secret_key: ENV['LANGFUSE_SECRET_KEY'],
|
|
367
|
-
host: ENV['LANGFUSE_HOST'] || 'https://cloud.langfuse.com'
|
|
368
|
-
}
|
|
350
|
+
DSPy::Evals.before_example do |payload|
|
|
351
|
+
example = payload[:example]
|
|
352
|
+
DSPy.logger.info("Evaluating example #{example.id}") if example.respond_to?(:id)
|
|
369
353
|
end
|
|
370
354
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
355
|
+
DSPy::Evals.after_batch do |payload|
|
|
356
|
+
result = payload[:result]
|
|
357
|
+
Langfuse.event(
|
|
358
|
+
name: 'eval.batch',
|
|
359
|
+
metadata: {
|
|
360
|
+
total: result.total_examples,
|
|
361
|
+
passed: result.passed_examples,
|
|
362
|
+
score: result.score
|
|
363
|
+
}
|
|
364
|
+
)
|
|
365
|
+
end
|
|
375
366
|
```
|
|
376
367
|
|
|
377
|
-
|
|
368
|
+
Available hooks: `before_example`, `after_example`, `before_batch`, `after_batch`.
|
|
378
369
|
|
|
379
|
-
|
|
370
|
+
### Langfuse score export
|
|
371
|
+
|
|
372
|
+
Enable `export_scores: true` to emit `score.create` events for each evaluated example and a batch score at the end:
|
|
380
373
|
|
|
381
374
|
```ruby
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
375
|
+
evaluator = DSPy::Evals.new(
|
|
376
|
+
predictor,
|
|
377
|
+
metric: metric,
|
|
378
|
+
export_scores: true,
|
|
379
|
+
score_name: 'qa_accuracy' # default: 'evaluation'
|
|
380
|
+
)
|
|
387
381
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
duration = Time.now - start_time
|
|
382
|
+
result = evaluator.evaluate(test_examples)
|
|
383
|
+
# Emits per-example scores + overall batch score via DSPy::Scores::Exporter
|
|
384
|
+
```
|
|
392
385
|
|
|
393
|
-
|
|
394
|
-
tokens = result.metadata[:usage][:total_tokens] rescue 0
|
|
395
|
-
@total_tokens += tokens
|
|
396
|
-
@request_count += 1
|
|
386
|
+
Scores attach to the current trace context automatically and flow to Langfuse asynchronously.
|
|
397
387
|
|
|
398
|
-
|
|
399
|
-
puts "Total tokens used: #{@total_tokens}"
|
|
388
|
+
### Evaluation results
|
|
400
389
|
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
end
|
|
390
|
+
```ruby
|
|
391
|
+
result = evaluator.evaluate(test_examples)
|
|
404
392
|
|
|
405
|
-
#
|
|
406
|
-
|
|
407
|
-
|
|
393
|
+
result.score # Overall score (0.0 to 1.0)
|
|
394
|
+
result.passed_count # Examples that passed
|
|
395
|
+
result.failed_count # Examples that failed
|
|
396
|
+
result.error_count # Examples that errored
|
|
408
397
|
|
|
409
|
-
result
|
|
398
|
+
result.results.each do |r|
|
|
399
|
+
r.passed # Boolean
|
|
400
|
+
r.score # Numeric score
|
|
401
|
+
r.error # Error message if the example errored
|
|
402
|
+
end
|
|
410
403
|
```
|
|
411
404
|
|
|
412
|
-
###
|
|
413
|
-
|
|
414
|
-
Add detailed logging to your modules:
|
|
405
|
+
### Integration with optimizers
|
|
415
406
|
|
|
416
407
|
```ruby
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
end
|
|
408
|
+
metric = proc do |example, prediction|
|
|
409
|
+
expected = example.expected_values[:answer].to_s.strip.downcase
|
|
410
|
+
predicted = prediction.answer.to_s.strip.downcase
|
|
411
|
+
!expected.empty? && predicted.include?(expected)
|
|
412
|
+
end
|
|
423
413
|
|
|
424
|
-
|
|
425
|
-
@logger.info "Classifying email: #{input[:email_subject]}"
|
|
414
|
+
optimizer = DSPy::Teleprompt::MIPROv2::AutoMode.medium(metric: metric)
|
|
426
415
|
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
416
|
+
result = optimizer.compile(
|
|
417
|
+
DSPy::Predict.new(QASignature),
|
|
418
|
+
trainset: train_examples,
|
|
419
|
+
valset: val_examples
|
|
420
|
+
)
|
|
430
421
|
|
|
431
|
-
|
|
422
|
+
evaluator = DSPy::Evals.new(result.optimized_program, metric: metric)
|
|
423
|
+
test_result = evaluator.evaluate(test_examples, display_table: true)
|
|
424
|
+
puts "Test accuracy: #{(test_result.pass_rate * 100).round(2)}%"
|
|
425
|
+
```
|
|
432
426
|
|
|
433
|
-
|
|
434
|
-
@logger.debug "Reasoning: #{result[:reasoning]}"
|
|
435
|
-
end
|
|
427
|
+
---
|
|
436
428
|
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
429
|
+
## Storage System
|
|
430
|
+
|
|
431
|
+
`DSPy::Storage` persists optimization results, tracks history, and manages multiple versions of optimized programs.
|
|
432
|
+
|
|
433
|
+
### ProgramStorage (low-level)
|
|
434
|
+
|
|
435
|
+
```ruby
|
|
436
|
+
storage = DSPy::Storage::ProgramStorage.new(storage_path: "./dspy_storage")
|
|
437
|
+
|
|
438
|
+
# Save
|
|
439
|
+
saved = storage.save_program(
|
|
440
|
+
result.optimized_program,
|
|
441
|
+
result,
|
|
442
|
+
metadata: {
|
|
443
|
+
signature_class: 'ClassifyText',
|
|
444
|
+
optimizer: 'MIPROv2',
|
|
445
|
+
examples_count: examples.size
|
|
446
|
+
}
|
|
447
|
+
)
|
|
448
|
+
puts "Stored with ID: #{saved.program_id}"
|
|
449
|
+
|
|
450
|
+
# Load
|
|
451
|
+
saved = storage.load_program(program_id)
|
|
452
|
+
predictor = saved.program
|
|
453
|
+
score = saved.optimization_result[:best_score_value]
|
|
454
|
+
|
|
455
|
+
# List
|
|
456
|
+
storage.list_programs.each do |p|
|
|
457
|
+
puts "#{p[:program_id]} -- score: #{p[:best_score]} -- saved: #{p[:saved_at]}"
|
|
442
458
|
end
|
|
443
459
|
```
|
|
444
460
|
|
|
445
|
-
###
|
|
446
|
-
|
|
447
|
-
Monitor latency and performance metrics:
|
|
461
|
+
### StorageManager (recommended)
|
|
448
462
|
|
|
449
463
|
```ruby
|
|
450
|
-
|
|
451
|
-
def initialize
|
|
452
|
-
@metrics = {
|
|
453
|
-
total_requests: 0,
|
|
454
|
-
total_duration: 0.0,
|
|
455
|
-
errors: 0,
|
|
456
|
-
success_count: 0
|
|
457
|
-
}
|
|
458
|
-
end
|
|
464
|
+
manager = DSPy::Storage::StorageManager.new
|
|
459
465
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
@metrics[:success_count] += 1
|
|
467
|
-
result
|
|
468
|
-
rescue => e
|
|
469
|
-
@metrics[:errors] += 1
|
|
470
|
-
raise
|
|
471
|
-
ensure
|
|
472
|
-
duration = Time.now - start_time
|
|
473
|
-
@metrics[:total_duration] += duration
|
|
474
|
-
|
|
475
|
-
if @metrics[:total_requests] % 10 == 0
|
|
476
|
-
print_stats
|
|
477
|
-
end
|
|
478
|
-
end
|
|
479
|
-
end
|
|
466
|
+
# Save with tags
|
|
467
|
+
saved = manager.save_optimization_result(
|
|
468
|
+
result,
|
|
469
|
+
tags: ['production', 'sentiment-analysis'],
|
|
470
|
+
description: 'Optimized sentiment classifier v2'
|
|
471
|
+
)
|
|
480
472
|
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
473
|
+
# Find programs
|
|
474
|
+
programs = manager.find_programs(
|
|
475
|
+
optimizer: 'MIPROv2',
|
|
476
|
+
min_score: 0.85,
|
|
477
|
+
tags: ['production']
|
|
478
|
+
)
|
|
484
479
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
puts "Errors: #{@metrics[:errors]}"
|
|
490
|
-
puts "========================\n"
|
|
491
|
-
end
|
|
492
|
-
end
|
|
480
|
+
recent = manager.find_programs(
|
|
481
|
+
max_age_days: 7,
|
|
482
|
+
signature_class: 'ClassifyText'
|
|
483
|
+
)
|
|
493
484
|
|
|
494
|
-
#
|
|
495
|
-
|
|
496
|
-
predictor =
|
|
485
|
+
# Get best program for a signature
|
|
486
|
+
best = manager.get_best_program('ClassifyText')
|
|
487
|
+
predictor = best.program
|
|
488
|
+
```
|
|
497
489
|
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
490
|
+
Global shorthand:
|
|
491
|
+
|
|
492
|
+
```ruby
|
|
493
|
+
DSPy::Storage::StorageManager.save(result, metadata: { version: '2.0' })
|
|
494
|
+
DSPy::Storage::StorageManager.load(program_id)
|
|
495
|
+
DSPy::Storage::StorageManager.best('ClassifyText')
|
|
501
496
|
```
|
|
502
497
|
|
|
503
|
-
###
|
|
498
|
+
### Checkpoints
|
|
504
499
|
|
|
505
|
-
|
|
500
|
+
Create and restore checkpoints during long-running optimizations:
|
|
506
501
|
|
|
507
502
|
```ruby
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
503
|
+
# Save a checkpoint
|
|
504
|
+
manager.create_checkpoint(
|
|
505
|
+
current_result,
|
|
506
|
+
'iteration_50',
|
|
507
|
+
metadata: { iteration: 50, current_score: 0.87 }
|
|
508
|
+
)
|
|
514
509
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
510
|
+
# Restore
|
|
511
|
+
restored = manager.restore_checkpoint('iteration_50')
|
|
512
|
+
program = restored.program
|
|
518
513
|
|
|
519
|
-
|
|
520
|
-
|
|
514
|
+
# Auto-checkpoint every N iterations
|
|
515
|
+
if iteration % 10 == 0
|
|
516
|
+
manager.create_checkpoint(current_result, "auto_checkpoint_#{iteration}")
|
|
517
|
+
end
|
|
518
|
+
```
|
|
521
519
|
|
|
522
|
-
|
|
523
|
-
end
|
|
520
|
+
### Import and export
|
|
524
521
|
|
|
525
|
-
|
|
522
|
+
Share programs between environments:
|
|
526
523
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
failures.to_f / @recent_results.size
|
|
530
|
-
end
|
|
524
|
+
```ruby
|
|
525
|
+
storage = DSPy::Storage::ProgramStorage.new
|
|
531
526
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
end
|
|
527
|
+
# Export
|
|
528
|
+
storage.export_programs(['abc123', 'def456'], './export_backup.json')
|
|
529
|
+
|
|
530
|
+
# Import
|
|
531
|
+
imported = storage.import_programs('./export_backup.json')
|
|
532
|
+
puts "Imported #{imported.size} programs"
|
|
539
533
|
```
|
|
540
534
|
|
|
541
|
-
|
|
535
|
+
### Optimization history
|
|
542
536
|
|
|
543
|
-
|
|
537
|
+
```ruby
|
|
538
|
+
history = manager.get_optimization_history
|
|
544
539
|
|
|
545
|
-
|
|
540
|
+
history[:summary][:total_programs]
|
|
541
|
+
history[:summary][:avg_score]
|
|
546
542
|
|
|
547
|
-
|
|
548
|
-
#
|
|
549
|
-
test_cases = [
|
|
550
|
-
{ input: {...}, expected: {...} },
|
|
551
|
-
# More test cases...
|
|
552
|
-
]
|
|
553
|
-
|
|
554
|
-
# Ensure baseline functionality
|
|
555
|
-
test_cases.each do |tc|
|
|
556
|
-
result = module.forward(tc[:input])
|
|
557
|
-
assert result[:category] == tc[:expected][:category]
|
|
543
|
+
history[:optimizer_stats].each do |optimizer, stats|
|
|
544
|
+
puts "#{optimizer}: #{stats[:count]} programs, best: #{stats[:best_score]}"
|
|
558
545
|
end
|
|
559
546
|
|
|
560
|
-
|
|
561
|
-
optimized = optimizer.compile(module, trainset: test_cases)
|
|
547
|
+
history[:trends][:improvement_percentage]
|
|
562
548
|
```
|
|
563
549
|
|
|
564
|
-
###
|
|
565
|
-
|
|
566
|
-
Define metrics that align with business goals:
|
|
550
|
+
### Program comparison
|
|
567
551
|
|
|
568
552
|
```ruby
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
else
|
|
574
|
-
return prediction[:category] == example[:expected_output][:category] ? 0.8 : 0.0
|
|
575
|
-
end
|
|
576
|
-
end
|
|
553
|
+
comparison = manager.compare_programs(id_a, id_b)
|
|
554
|
+
comparison[:comparison][:score_difference]
|
|
555
|
+
comparison[:comparison][:better_program]
|
|
556
|
+
comparison[:comparison][:age_difference_hours]
|
|
577
557
|
```
|
|
578
558
|
|
|
579
|
-
###
|
|
580
|
-
|
|
581
|
-
Always track production performance:
|
|
559
|
+
### Storage configuration
|
|
582
560
|
|
|
583
561
|
```ruby
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
@error_tracker = ErrorRateMonitor.new
|
|
590
|
-
end
|
|
562
|
+
config = DSPy::Storage::StorageManager::StorageConfig.new
|
|
563
|
+
config.storage_path = Rails.root.join('dspy_storage')
|
|
564
|
+
config.auto_save = true
|
|
565
|
+
config.save_intermediate_results = false
|
|
566
|
+
config.max_stored_programs = 100
|
|
591
567
|
|
|
592
|
-
|
|
593
|
-
@monitor.monitor_request do
|
|
594
|
-
result = @predictor.forward(input)
|
|
595
|
-
@error_tracker.track_result(success: true)
|
|
596
|
-
result
|
|
597
|
-
rescue => e
|
|
598
|
-
@error_tracker.track_result(success: false)
|
|
599
|
-
raise
|
|
600
|
-
end
|
|
601
|
-
end
|
|
602
|
-
end
|
|
568
|
+
manager = DSPy::Storage::StorageManager.new(config: config)
|
|
603
569
|
```
|
|
604
570
|
|
|
605
|
-
###
|
|
571
|
+
### Cleanup
|
|
606
572
|
|
|
607
|
-
|
|
573
|
+
Remove old programs. Cleanup retains the best-performing and most recent programs using a weighted score (70% performance, 30% recency):
|
|
608
574
|
|
|
609
575
|
```ruby
|
|
610
|
-
|
|
611
|
-
|
|
576
|
+
deleted_count = manager.cleanup_old_programs
|
|
577
|
+
```
|
|
612
578
|
|
|
613
|
-
|
|
614
|
-
super
|
|
615
|
-
@predictor = DSPy::ChainOfThought.new(EmailClassificationSignature)
|
|
616
|
-
end
|
|
579
|
+
### Storage events
|
|
617
580
|
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
581
|
+
The storage system emits structured log events for monitoring:
|
|
582
|
+
- `dspy.storage.save_start`, `dspy.storage.save_complete`, `dspy.storage.save_error`
|
|
583
|
+
- `dspy.storage.load_start`, `dspy.storage.load_complete`, `dspy.storage.load_error`
|
|
584
|
+
- `dspy.storage.delete`, `dspy.storage.export`, `dspy.storage.import`, `dspy.storage.cleanup`
|
|
585
|
+
|
|
586
|
+
### File layout
|
|
587
|
+
|
|
588
|
+
```
|
|
589
|
+
dspy_storage/
|
|
590
|
+
programs/
|
|
591
|
+
abc123def456.json
|
|
592
|
+
789xyz012345.json
|
|
593
|
+
history.json
|
|
623
594
|
```
|
|
595
|
+
|
|
596
|
+
---
|
|
597
|
+
|
|
598
|
+
## API rules
|
|
599
|
+
|
|
600
|
+
- Call predictors with `.call()`, not `.forward()`.
|
|
601
|
+
- Access prediction fields with dot notation (`result.answer`), not hash notation (`result[:answer]`).
|
|
602
|
+
- GEPA metrics return `DSPy::Prediction.new(score:, feedback:)`, not a boolean.
|
|
603
|
+
- MIPROv2 metrics may return `true`/`false`, a numeric score, or `DSPy::Prediction`.
|