ag-cortex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/commands/test-browser.md +339 -0
- package/.agent/rules/00-constitution.md +46 -0
- package/.agent/rules/project-rules.md +49 -0
- package/.agent/skills/agent-browser/SKILL.md +223 -0
- package/.agent/skills/agent-native-architecture/SKILL.md +435 -0
- package/.agent/skills/agent-native-architecture/references/action-parity-discipline.md +409 -0
- package/.agent/skills/agent-native-architecture/references/agent-execution-patterns.md +467 -0
- package/.agent/skills/agent-native-architecture/references/agent-native-testing.md +582 -0
- package/.agent/skills/agent-native-architecture/references/architecture-patterns.md +478 -0
- package/.agent/skills/agent-native-architecture/references/dynamic-context-injection.md +338 -0
- package/.agent/skills/agent-native-architecture/references/files-universal-interface.md +301 -0
- package/.agent/skills/agent-native-architecture/references/from-primitives-to-domain-tools.md +359 -0
- package/.agent/skills/agent-native-architecture/references/mcp-tool-design.md +506 -0
- package/.agent/skills/agent-native-architecture/references/mobile-patterns.md +871 -0
- package/.agent/skills/agent-native-architecture/references/product-implications.md +443 -0
- package/.agent/skills/agent-native-architecture/references/refactoring-to-prompt-native.md +317 -0
- package/.agent/skills/agent-native-architecture/references/self-modification.md +269 -0
- package/.agent/skills/agent-native-architecture/references/shared-workspace-architecture.md +680 -0
- package/.agent/skills/agent-native-architecture/references/system-prompt-design.md +250 -0
- package/.agent/skills/agent-native-reviewer/SKILL.md +246 -0
- package/.agent/skills/andrew-kane-gem-writer/SKILL.md +184 -0
- package/.agent/skills/andrew-kane-gem-writer/references/database-adapters.md +231 -0
- package/.agent/skills/andrew-kane-gem-writer/references/module-organization.md +121 -0
- package/.agent/skills/andrew-kane-gem-writer/references/rails-integration.md +183 -0
- package/.agent/skills/andrew-kane-gem-writer/references/resources.md +119 -0
- package/.agent/skills/andrew-kane-gem-writer/references/testing-patterns.md +261 -0
- package/.agent/skills/ankane-readme-writer/SKILL.md +50 -0
- package/.agent/skills/architecture-strategist/SKILL.md +52 -0
- package/.agent/skills/best-practices-researcher/SKILL.md +100 -0
- package/.agent/skills/bug-reproduction-validator/SKILL.md +67 -0
- package/.agent/skills/code-simplicity-reviewer/SKILL.md +85 -0
- package/.agent/skills/coding-tutor/.claude-plugin/plugin.json +9 -0
- package/.agent/skills/coding-tutor/README.md +37 -0
- package/.agent/skills/coding-tutor/commands/quiz-me.md +1 -0
- package/.agent/skills/coding-tutor/commands/sync-tutorials.md +25 -0
- package/.agent/skills/coding-tutor/commands/teach-me.md +1 -0
- package/.agent/skills/coding-tutor/skills/coding-tutor/SKILL.md +214 -0
- package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/create_tutorial.py +202 -0
- package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/index_tutorials.py +203 -0
- package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/quiz_priority.py +190 -0
- package/.agent/skills/coding-tutor/skills/coding-tutor/scripts/setup_tutorials.py +132 -0
- package/.agent/skills/compound-docs/SKILL.md +510 -0
- package/.agent/skills/compound-docs/assets/critical-pattern-template.md +34 -0
- package/.agent/skills/compound-docs/assets/resolution-template.md +93 -0
- package/.agent/skills/compound-docs/references/yaml-schema.md +65 -0
- package/.agent/skills/compound-docs/schema.yaml +176 -0
- package/.agent/skills/create-agent-skills/SKILL.md +299 -0
- package/.agent/skills/create-agent-skills/references/api-security.md +226 -0
- package/.agent/skills/create-agent-skills/references/be-clear-and-direct.md +531 -0
- package/.agent/skills/create-agent-skills/references/best-practices.md +404 -0
- package/.agent/skills/create-agent-skills/references/common-patterns.md +595 -0
- package/.agent/skills/create-agent-skills/references/core-principles.md +437 -0
- package/.agent/skills/create-agent-skills/references/executable-code.md +175 -0
- package/.agent/skills/create-agent-skills/references/iteration-and-testing.md +474 -0
- package/.agent/skills/create-agent-skills/references/official-spec.md +185 -0
- package/.agent/skills/create-agent-skills/references/recommended-structure.md +168 -0
- package/.agent/skills/create-agent-skills/references/skill-structure.md +372 -0
- package/.agent/skills/create-agent-skills/references/using-scripts.md +113 -0
- package/.agent/skills/create-agent-skills/references/using-templates.md +112 -0
- package/.agent/skills/create-agent-skills/references/workflows-and-validation.md +510 -0
- package/.agent/skills/create-agent-skills/templates/router-skill.md +73 -0
- package/.agent/skills/create-agent-skills/templates/simple-skill.md +33 -0
- package/.agent/skills/create-agent-skills/workflows/add-reference.md +96 -0
- package/.agent/skills/create-agent-skills/workflows/add-script.md +93 -0
- package/.agent/skills/create-agent-skills/workflows/add-template.md +74 -0
- package/.agent/skills/create-agent-skills/workflows/add-workflow.md +120 -0
- package/.agent/skills/create-agent-skills/workflows/audit-skill.md +138 -0
- package/.agent/skills/create-agent-skills/workflows/create-domain-expertise-skill.md +605 -0
- package/.agent/skills/create-agent-skills/workflows/create-new-skill.md +191 -0
- package/.agent/skills/create-agent-skills/workflows/get-guidance.md +121 -0
- package/.agent/skills/create-agent-skills/workflows/upgrade-to-router.md +161 -0
- package/.agent/skills/create-agent-skills/workflows/verify-skill.md +204 -0
- package/.agent/skills/data-integrity-guardian/SKILL.md +70 -0
- package/.agent/skills/data-migration-expert/SKILL.md +97 -0
- package/.agent/skills/deployment-verification-agent/SKILL.md +159 -0
- package/.agent/skills/design-implementation-reviewer/SKILL.md +85 -0
- package/.agent/skills/design-iterator/SKILL.md +197 -0
- package/.agent/skills/dhh-rails-reviewer/SKILL.md +45 -0
- package/.agent/skills/dhh-rails-style/SKILL.md +184 -0
- package/.agent/skills/dhh-rails-style/references/architecture.md +653 -0
- package/.agent/skills/dhh-rails-style/references/controllers.md +303 -0
- package/.agent/skills/dhh-rails-style/references/frontend.md +510 -0
- package/.agent/skills/dhh-rails-style/references/gems.md +266 -0
- package/.agent/skills/dhh-rails-style/references/models.md +359 -0
- package/.agent/skills/dhh-rails-style/references/testing.md +338 -0
- package/.agent/skills/dspy-ruby/SKILL.md +594 -0
- package/.agent/skills/dspy-ruby/assets/config-template.rb +359 -0
- package/.agent/skills/dspy-ruby/assets/module-template.rb +326 -0
- package/.agent/skills/dspy-ruby/assets/signature-template.rb +143 -0
- package/.agent/skills/dspy-ruby/references/core-concepts.md +265 -0
- package/.agent/skills/dspy-ruby/references/optimization.md +623 -0
- package/.agent/skills/dspy-ruby/references/providers.md +305 -0
- package/.agent/skills/every-style-editor/SKILL.md +134 -0
- package/.agent/skills/every-style-editor/references/EVERY_WRITE_STYLE.md +529 -0
- package/.agent/skills/figma-design-sync/SKILL.md +166 -0
- package/.agent/skills/file-todos/SKILL.md +251 -0
- package/.agent/skills/file-todos/assets/todo-template.md +155 -0
- package/.agent/skills/framework-docs-researcher/SKILL.md +83 -0
- package/.agent/skills/frontend-design/SKILL.md +42 -0
- package/.agent/skills/gemini-imagegen/SKILL.md +237 -0
- package/.agent/skills/gemini-imagegen/requirements.txt +2 -0
- package/.agent/skills/gemini-imagegen/scripts/compose_images.py +168 -0
- package/.agent/skills/gemini-imagegen/scripts/edit_image.py +157 -0
- package/.agent/skills/gemini-imagegen/scripts/gemini_images.py +265 -0
- package/.agent/skills/gemini-imagegen/scripts/generate_image.py +147 -0
- package/.agent/skills/gemini-imagegen/scripts/multi_turn_chat.py +215 -0
- package/.agent/skills/git-history-analyzer/SKILL.md +42 -0
- package/.agent/skills/git-worktree/SKILL.md +302 -0
- package/.agent/skills/git-worktree/scripts/worktree-manager.sh +345 -0
- package/.agent/skills/julik-frontend-races-reviewer/SKILL.md +222 -0
- package/.agent/skills/kieran-python-reviewer/SKILL.md +104 -0
- package/.agent/skills/kieran-rails-reviewer/SKILL.md +86 -0
- package/.agent/skills/kieran-typescript-reviewer/SKILL.md +95 -0
- package/.agent/skills/lint/SKILL.md +16 -0
- package/.agent/skills/pattern-recognition-specialist/SKILL.md +57 -0
- package/.agent/skills/performance-oracle/SKILL.md +110 -0
- package/.agent/skills/pr-comment-resolver/SKILL.md +69 -0
- package/.agent/skills/rclone/SKILL.md +150 -0
- package/.agent/skills/rclone/scripts/check_setup.sh +60 -0
- package/.agent/skills/repo-research-analyst/SKILL.md +113 -0
- package/.agent/skills/security-sentinel/SKILL.md +93 -0
- package/.agent/skills/skill-creator/SKILL.md +209 -0
- package/.agent/skills/skill-creator/scripts/init_skill.py +304 -0
- package/.agent/skills/skill-creator/scripts/package_skill.py +112 -0
- package/.agent/skills/skill-creator/scripts/quick_validate.py +72 -0
- package/.agent/skills/spec-flow-analyzer/SKILL.md +113 -0
- package/.agent/skills/test-agent/SKILL.md +4 -0
- package/.agent/workflows/agent-native-audit.md +277 -0
- package/.agent/workflows/ask-user-question.md +21 -0
- package/.agent/workflows/changelog.md +137 -0
- package/.agent/workflows/compound.md +202 -0
- package/.agent/workflows/create-agent-skill.md +8 -0
- package/.agent/workflows/deepen-plan-research.md +334 -0
- package/.agent/workflows/deepen-plan-synthesis.md +182 -0
- package/.agent/workflows/deepen-plan.md +79 -0
- package/.agent/workflows/feature-video.md +342 -0
- package/.agent/workflows/generate-command.md +162 -0
- package/.agent/workflows/heal-skill.md +142 -0
- package/.agent/workflows/lfg.md +20 -0
- package/.agent/workflows/plan-analysis.md +67 -0
- package/.agent/workflows/plan-next-steps.md +63 -0
- package/.agent/workflows/plan-review.md +33 -0
- package/.agent/workflows/plan-synthesis.md +106 -0
- package/.agent/workflows/plan.md +49 -0
- package/.agent/workflows/report-bug.md +150 -0
- package/.agent/workflows/reproduce-bug.md +99 -0
- package/.agent/workflows/resolve-parallel.md +34 -0
- package/.agent/workflows/resolve-pr-parallel.md +49 -0
- package/.agent/workflows/resolve-todo-parallel.md +35 -0
- package/.agent/workflows/review-analysis.md +145 -0
- package/.agent/workflows/review-synthesis.md +262 -0
- package/.agent/workflows/review.md +64 -0
- package/.agent/workflows/ship.md +90 -0
- package/.agent/workflows/test-command.md +3 -0
- package/.agent/workflows/triage.md +310 -0
- package/.agent/workflows/work.md +157 -0
- package/.agent/workflows/xcode-test.md +332 -0
- package/LICENSE +22 -0
- package/README.md +49 -0
- package/bin/ag-cortex.js +54 -0
- package/lib/core.js +165 -0
- package/package.json +31 -0
|
@@ -0,0 +1,623 @@
|
|
|
1
|
+
# DSPy.rb Testing, Optimization & Observability
|
|
2
|
+
|
|
3
|
+
## Testing
|
|
4
|
+
|
|
5
|
+
DSPy.rb enables standard RSpec testing patterns for LLM logic, making your AI applications testable and maintainable.
|
|
6
|
+
|
|
7
|
+
### Basic Testing Setup
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
require 'rspec'
|
|
11
|
+
require 'dspy'
|
|
12
|
+
|
|
13
|
+
RSpec.describe EmailClassifier do
|
|
14
|
+
before do
|
|
15
|
+
DSPy.configure do |c|
|
|
16
|
+
c.lm = DSPy::LM.new('openai/gpt-4o-mini', api_key: ENV['OPENAI_API_KEY'])
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
describe '#classify' do
|
|
21
|
+
it 'classifies technical support emails correctly' do
|
|
22
|
+
classifier = EmailClassifier.new
|
|
23
|
+
result = classifier.forward(
|
|
24
|
+
email_subject: "Can't log in",
|
|
25
|
+
email_body: "I'm unable to access my account"
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
expect(result[:category]).to eq('Technical')
|
|
29
|
+
expect(['High', 'Medium', 'Low']).to include(result[:priority])
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Mocking LLM Responses
|
|
36
|
+
|
|
37
|
+
Test your modules without making actual API calls:
|
|
38
|
+
|
|
39
|
+
```ruby
|
|
40
|
+
RSpec.describe MyModule do
|
|
41
|
+
it 'handles mock responses correctly' do
|
|
42
|
+
# Create a mock predictor that returns predetermined results
|
|
43
|
+
mock_predictor = instance_double(DSPy::Predict)
|
|
44
|
+
allow(mock_predictor).to receive(:forward).and_return({
|
|
45
|
+
category: 'Technical',
|
|
46
|
+
priority: 'High',
|
|
47
|
+
confidence: 0.95
|
|
48
|
+
})
|
|
49
|
+
|
|
50
|
+
# Inject mock into your module
|
|
51
|
+
module_instance = MyModule.new
|
|
52
|
+
module_instance.instance_variable_set(:@predictor, mock_predictor)
|
|
53
|
+
|
|
54
|
+
result = module_instance.forward(input: 'test data')
|
|
55
|
+
expect(result[:category]).to eq('Technical')
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Testing Type Safety
|
|
61
|
+
|
|
62
|
+
Verify that signatures enforce type constraints:
|
|
63
|
+
|
|
64
|
+
```ruby
|
|
65
|
+
RSpec.describe EmailClassificationSignature do
|
|
66
|
+
it 'validates output types' do
|
|
67
|
+
predictor = DSPy::Predict.new(EmailClassificationSignature)
|
|
68
|
+
|
|
69
|
+
# This should work
|
|
70
|
+
result = predictor.forward(
|
|
71
|
+
email_subject: 'Test',
|
|
72
|
+
email_body: 'Test body'
|
|
73
|
+
)
|
|
74
|
+
expect(result[:category]).to be_a(String)
|
|
75
|
+
|
|
76
|
+
# Test that invalid types are caught
|
|
77
|
+
expect {
|
|
78
|
+
# Simulate LLM returning invalid type
|
|
79
|
+
predictor.send(:validate_output, { category: 123 })
|
|
80
|
+
}.to raise_error(DSPy::ValidationError)
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Testing Edge Cases
|
|
86
|
+
|
|
87
|
+
Always test boundary conditions and error scenarios:
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
RSpec.describe EmailClassifier do
|
|
91
|
+
it 'handles empty emails' do
|
|
92
|
+
classifier = EmailClassifier.new
|
|
93
|
+
result = classifier.forward(
|
|
94
|
+
email_subject: '',
|
|
95
|
+
email_body: ''
|
|
96
|
+
)
|
|
97
|
+
# Define expected behavior for edge case
|
|
98
|
+
expect(result[:category]).to eq('General')
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
it 'handles very long emails' do
|
|
102
|
+
long_body = 'word ' * 10000
|
|
103
|
+
classifier = EmailClassifier.new
|
|
104
|
+
|
|
105
|
+
expect {
|
|
106
|
+
classifier.forward(
|
|
107
|
+
email_subject: 'Test',
|
|
108
|
+
email_body: long_body
|
|
109
|
+
)
|
|
110
|
+
}.not_to raise_error
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
it 'handles special characters' do
|
|
114
|
+
classifier = EmailClassifier.new
|
|
115
|
+
result = classifier.forward(
|
|
116
|
+
email_subject: 'Test <script>alert("xss")</script>',
|
|
117
|
+
email_body: 'Body with émojis 🎉 and spëcial çharacters'
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
expect(['Technical', 'Billing', 'General']).to include(result[:category])
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Integration Testing
|
|
126
|
+
|
|
127
|
+
Test complete workflows end-to-end:
|
|
128
|
+
|
|
129
|
+
```ruby
|
|
130
|
+
RSpec.describe EmailProcessingPipeline do
|
|
131
|
+
it 'processes email through complete pipeline' do
|
|
132
|
+
pipeline = EmailProcessingPipeline.new
|
|
133
|
+
|
|
134
|
+
result = pipeline.forward(
|
|
135
|
+
email_subject: 'Billing question',
|
|
136
|
+
email_body: 'How do I update my payment method?'
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
# Verify the complete pipeline output
|
|
140
|
+
expect(result[:classification]).to eq('Billing')
|
|
141
|
+
expect(result[:priority]).to eq('Medium')
|
|
142
|
+
expect(result[:suggested_response]).to include('payment')
|
|
143
|
+
expect(result[:assigned_team]).to eq('billing_support')
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### VCR for Deterministic Tests
|
|
149
|
+
|
|
150
|
+
Use VCR to record and replay API responses:
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
require 'vcr'
|
|
154
|
+
|
|
155
|
+
VCR.configure do |config|
|
|
156
|
+
config.cassette_library_dir = 'spec/vcr_cassettes'
|
|
157
|
+
config.hook_into :webmock
|
|
158
|
+
config.filter_sensitive_data('<OPENAI_API_KEY>') { ENV['OPENAI_API_KEY'] }
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
RSpec.describe EmailClassifier do
|
|
162
|
+
it 'classifies emails consistently', :vcr do
|
|
163
|
+
VCR.use_cassette('email_classification') do
|
|
164
|
+
classifier = EmailClassifier.new
|
|
165
|
+
result = classifier.forward(
|
|
166
|
+
email_subject: 'Test subject',
|
|
167
|
+
email_body: 'Test body'
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
expect(result[:category]).to eq('Technical')
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Optimization
|
|
177
|
+
|
|
178
|
+
DSPy.rb provides powerful optimization capabilities to automatically improve your prompts and modules.
|
|
179
|
+
|
|
180
|
+
### MIPROv2 Optimization
|
|
181
|
+
|
|
182
|
+
MIPROv2 is an advanced multi-prompt optimization technique that uses bootstrap sampling, instruction generation, and Bayesian optimization.
|
|
183
|
+
|
|
184
|
+
```ruby
|
|
185
|
+
require 'dspy/mipro'
|
|
186
|
+
|
|
187
|
+
# Define your module to optimize
|
|
188
|
+
class EmailClassifier < DSPy::Module
|
|
189
|
+
def initialize
|
|
190
|
+
super
|
|
191
|
+
@predictor = DSPy::ChainOfThought.new(EmailClassificationSignature)
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def forward(input)
|
|
195
|
+
@predictor.forward(input)
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
# Prepare training data
|
|
200
|
+
training_examples = [
|
|
201
|
+
{
|
|
202
|
+
input: { email_subject: "Can't log in", email_body: "Password reset not working" },
|
|
203
|
+
expected_output: { category: 'Technical', priority: 'High' }
|
|
204
|
+
},
|
|
205
|
+
{
|
|
206
|
+
input: { email_subject: "Billing question", email_body: "How much does premium cost?" },
|
|
207
|
+
expected_output: { category: 'Billing', priority: 'Medium' }
|
|
208
|
+
},
|
|
209
|
+
# Add more examples...
|
|
210
|
+
]
|
|
211
|
+
|
|
212
|
+
# Define evaluation metric
|
|
213
|
+
def accuracy_metric(example, prediction)
|
|
214
|
+
(example[:expected_output][:category] == prediction[:category]) ? 1.0 : 0.0
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
# Run optimization
|
|
218
|
+
optimizer = DSPy::MIPROv2.new(
|
|
219
|
+
metric: method(:accuracy_metric),
|
|
220
|
+
num_candidates: 10,
|
|
221
|
+
num_threads: 4
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
optimized_module = optimizer.compile(
|
|
225
|
+
EmailClassifier.new,
|
|
226
|
+
trainset: training_examples
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
# Use optimized module
|
|
230
|
+
result = optimized_module.forward(
|
|
231
|
+
email_subject: "New email",
|
|
232
|
+
email_body: "New email content"
|
|
233
|
+
)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### Bootstrap Few-Shot Learning
|
|
237
|
+
|
|
238
|
+
Automatically generate few-shot examples from your training data:
|
|
239
|
+
|
|
240
|
+
```ruby
|
|
241
|
+
require 'dspy/teleprompt'
|
|
242
|
+
|
|
243
|
+
# Create a teleprompter for few-shot optimization
|
|
244
|
+
teleprompter = DSPy::BootstrapFewShot.new(
|
|
245
|
+
metric: method(:accuracy_metric),
|
|
246
|
+
max_bootstrapped_demos: 5,
|
|
247
|
+
max_labeled_demos: 3
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
# Compile the optimized module
|
|
251
|
+
optimized = teleprompter.compile(
|
|
252
|
+
MyModule.new,
|
|
253
|
+
trainset: training_examples
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Custom Optimization Metrics
|
|
258
|
+
|
|
259
|
+
Define custom metrics for your specific use case:
|
|
260
|
+
|
|
261
|
+
```ruby
|
|
262
|
+
def custom_metric(example, prediction)
|
|
263
|
+
score = 0.0
|
|
264
|
+
|
|
265
|
+
# Category accuracy (60% weight)
|
|
266
|
+
score += 0.6 if example[:expected_output][:category] == prediction[:category]
|
|
267
|
+
|
|
268
|
+
# Priority accuracy (40% weight)
|
|
269
|
+
score += 0.4 if example[:expected_output][:priority] == prediction[:priority]
|
|
270
|
+
|
|
271
|
+
score
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
# Use in optimization
|
|
275
|
+
optimizer = DSPy::MIPROv2.new(
|
|
276
|
+
metric: method(:custom_metric),
|
|
277
|
+
num_candidates: 10
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### A/B Testing Different Approaches
|
|
282
|
+
|
|
283
|
+
Compare different module implementations:
|
|
284
|
+
|
|
285
|
+
```ruby
|
|
286
|
+
# Approach A: ChainOfThought
|
|
287
|
+
class ApproachA < DSPy::Module
|
|
288
|
+
def initialize
|
|
289
|
+
super
|
|
290
|
+
@predictor = DSPy::ChainOfThought.new(EmailClassificationSignature)
|
|
291
|
+
end
|
|
292
|
+
|
|
293
|
+
def forward(input)
|
|
294
|
+
@predictor.forward(input)
|
|
295
|
+
end
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
# Approach B: ReAct with tools
|
|
299
|
+
class ApproachB < DSPy::Module
|
|
300
|
+
def initialize
|
|
301
|
+
super
|
|
302
|
+
@predictor = DSPy::ReAct.new(
|
|
303
|
+
EmailClassificationSignature,
|
|
304
|
+
tools: [KnowledgeBaseTool.new]
|
|
305
|
+
)
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
def forward(input)
|
|
309
|
+
@predictor.forward(input)
|
|
310
|
+
end
|
|
311
|
+
end
|
|
312
|
+
|
|
313
|
+
# Evaluate both approaches
|
|
314
|
+
def evaluate_approach(approach_class, test_set)
|
|
315
|
+
approach = approach_class.new
|
|
316
|
+
scores = test_set.map do |example|
|
|
317
|
+
prediction = approach.forward(example[:input])
|
|
318
|
+
accuracy_metric(example, prediction)
|
|
319
|
+
end
|
|
320
|
+
scores.sum / scores.size
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
approach_a_score = evaluate_approach(ApproachA, test_examples)
|
|
324
|
+
approach_b_score = evaluate_approach(ApproachB, test_examples)
|
|
325
|
+
|
|
326
|
+
puts "Approach A accuracy: #{approach_a_score}"
|
|
327
|
+
puts "Approach B accuracy: #{approach_b_score}"
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
## Observability
|
|
331
|
+
|
|
332
|
+
Track your LLM application's performance, token usage, and behavior in production.
|
|
333
|
+
|
|
334
|
+
### OpenTelemetry Integration
|
|
335
|
+
|
|
336
|
+
DSPy.rb automatically integrates with OpenTelemetry when configured:
|
|
337
|
+
|
|
338
|
+
```ruby
|
|
339
|
+
require 'opentelemetry/sdk'
|
|
340
|
+
require 'dspy'
|
|
341
|
+
|
|
342
|
+
# Configure OpenTelemetry
|
|
343
|
+
OpenTelemetry::SDK.configure do |c|
|
|
344
|
+
c.service_name = 'my-dspy-app'
|
|
345
|
+
c.use_all # Use all available instrumentation
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
# DSPy automatically creates traces for predictions
|
|
349
|
+
predictor = DSPy::Predict.new(MySignature)
|
|
350
|
+
result = predictor.forward(input: 'data')
|
|
351
|
+
# Traces are automatically sent to your OpenTelemetry collector
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
### Langfuse Integration
|
|
355
|
+
|
|
356
|
+
Track detailed LLM execution traces with Langfuse:
|
|
357
|
+
|
|
358
|
+
```ruby
|
|
359
|
+
require 'dspy/langfuse'
|
|
360
|
+
|
|
361
|
+
# Configure Langfuse
|
|
362
|
+
DSPy.configure do |c|
|
|
363
|
+
c.lm = DSPy::LM.new('openai/gpt-4o-mini', api_key: ENV['OPENAI_API_KEY'])
|
|
364
|
+
c.langfuse = {
|
|
365
|
+
public_key: ENV['LANGFUSE_PUBLIC_KEY'],
|
|
366
|
+
secret_key: ENV['LANGFUSE_SECRET_KEY'],
|
|
367
|
+
host: ENV['LANGFUSE_HOST'] || 'https://cloud.langfuse.com'
|
|
368
|
+
}
|
|
369
|
+
end
|
|
370
|
+
|
|
371
|
+
# All predictions are automatically traced
|
|
372
|
+
predictor = DSPy::Predict.new(MySignature)
|
|
373
|
+
result = predictor.forward(input: 'data')
|
|
374
|
+
# View detailed traces in Langfuse dashboard
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
### Manual Token Tracking
|
|
378
|
+
|
|
379
|
+
Track token usage without external services:
|
|
380
|
+
|
|
381
|
+
```ruby
|
|
382
|
+
class TokenTracker
|
|
383
|
+
def initialize
|
|
384
|
+
@total_tokens = 0
|
|
385
|
+
@request_count = 0
|
|
386
|
+
end
|
|
387
|
+
|
|
388
|
+
def track_prediction(predictor, input)
|
|
389
|
+
start_time = Time.now
|
|
390
|
+
result = predictor.forward(input)
|
|
391
|
+
duration = Time.now - start_time
|
|
392
|
+
|
|
393
|
+
# Get token usage from response metadata
|
|
394
|
+
tokens = result.respond_to?(:metadata) ? result.metadata&.dig(:usage, :total_tokens).to_i : 0
|
|
395
|
+
@total_tokens += tokens
|
|
396
|
+
@request_count += 1
|
|
397
|
+
|
|
398
|
+
puts "Request ##{@request_count}: #{tokens} tokens in #{duration}s"
|
|
399
|
+
puts "Total tokens used: #{@total_tokens}"
|
|
400
|
+
|
|
401
|
+
result
|
|
402
|
+
end
|
|
403
|
+
end
|
|
404
|
+
|
|
405
|
+
# Usage
|
|
406
|
+
tracker = TokenTracker.new
|
|
407
|
+
predictor = DSPy::Predict.new(MySignature)
|
|
408
|
+
|
|
409
|
+
result = tracker.track_prediction(predictor, { input: 'data' })
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
### Custom Logging
|
|
413
|
+
|
|
414
|
+
Add detailed logging to your modules:
|
|
415
|
+
|
|
416
|
+
```ruby
|
|
417
|
+
class EmailClassifier < DSPy::Module
|
|
418
|
+
def initialize
|
|
419
|
+
super
|
|
420
|
+
@predictor = DSPy::ChainOfThought.new(EmailClassificationSignature)
|
|
421
|
+
@logger = Logger.new(STDOUT)
|
|
422
|
+
end
|
|
423
|
+
|
|
424
|
+
def forward(input)
|
|
425
|
+
@logger.info "Classifying email: #{input[:email_subject]}"
|
|
426
|
+
|
|
427
|
+
start_time = Time.now
|
|
428
|
+
result = @predictor.forward(input)
|
|
429
|
+
duration = Time.now - start_time
|
|
430
|
+
|
|
431
|
+
@logger.info "Classification: #{result[:category]} (#{duration}s)"
|
|
432
|
+
|
|
433
|
+
if result[:reasoning]
|
|
434
|
+
@logger.debug "Reasoning: #{result[:reasoning]}"
|
|
435
|
+
end
|
|
436
|
+
|
|
437
|
+
result
|
|
438
|
+
rescue => e
|
|
439
|
+
@logger.error "Classification failed: #{e.message}"
|
|
440
|
+
raise
|
|
441
|
+
end
|
|
442
|
+
end
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
### Performance Monitoring
|
|
446
|
+
|
|
447
|
+
Monitor latency and performance metrics:
|
|
448
|
+
|
|
449
|
+
```ruby
|
|
450
|
+
class PerformanceMonitor
|
|
451
|
+
def initialize
|
|
452
|
+
@metrics = {
|
|
453
|
+
total_requests: 0,
|
|
454
|
+
total_duration: 0.0,
|
|
455
|
+
errors: 0,
|
|
456
|
+
success_count: 0
|
|
457
|
+
}
|
|
458
|
+
end
|
|
459
|
+
|
|
460
|
+
def monitor_request
|
|
461
|
+
start_time = Time.now
|
|
462
|
+
@metrics[:total_requests] += 1
|
|
463
|
+
|
|
464
|
+
begin
|
|
465
|
+
result = yield
|
|
466
|
+
@metrics[:success_count] += 1
|
|
467
|
+
result
|
|
468
|
+
rescue => e
|
|
469
|
+
@metrics[:errors] += 1
|
|
470
|
+
raise
|
|
471
|
+
ensure
|
|
472
|
+
duration = Time.now - start_time
|
|
473
|
+
@metrics[:total_duration] += duration
|
|
474
|
+
|
|
475
|
+
if @metrics[:total_requests] % 10 == 0
|
|
476
|
+
print_stats
|
|
477
|
+
end
|
|
478
|
+
end
|
|
479
|
+
end
|
|
480
|
+
|
|
481
|
+
def print_stats
|
|
482
|
+
avg_duration = @metrics[:total_duration] / @metrics[:total_requests]
|
|
483
|
+
success_rate = @metrics[:success_count].to_f / @metrics[:total_requests]
|
|
484
|
+
|
|
485
|
+
puts "\n=== Performance Stats ==="
|
|
486
|
+
puts "Total requests: #{@metrics[:total_requests]}"
|
|
487
|
+
puts "Average duration: #{avg_duration.round(3)}s"
|
|
488
|
+
puts "Success rate: #{(success_rate * 100).round(2)}%"
|
|
489
|
+
puts "Errors: #{@metrics[:errors]}"
|
|
490
|
+
puts "========================\n"
|
|
491
|
+
end
|
|
492
|
+
end
|
|
493
|
+
|
|
494
|
+
# Usage
|
|
495
|
+
monitor = PerformanceMonitor.new
|
|
496
|
+
predictor = DSPy::Predict.new(MySignature)
|
|
497
|
+
|
|
498
|
+
result = monitor.monitor_request do
|
|
499
|
+
predictor.forward(input: 'data')
|
|
500
|
+
end
|
|
501
|
+
```
|
|
502
|
+
|
|
503
|
+
### Error Rate Tracking
|
|
504
|
+
|
|
505
|
+
Monitor and alert on error rates:
|
|
506
|
+
|
|
507
|
+
```ruby
|
|
508
|
+
class ErrorRateMonitor
|
|
509
|
+
def initialize(alert_threshold: 0.1)
|
|
510
|
+
@alert_threshold = alert_threshold
|
|
511
|
+
@recent_results = []
|
|
512
|
+
@window_size = 100
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
def track_result(success:)
|
|
516
|
+
@recent_results << success
|
|
517
|
+
@recent_results.shift if @recent_results.size > @window_size
|
|
518
|
+
|
|
519
|
+
error_rate = calculate_error_rate
|
|
520
|
+
alert_if_needed(error_rate)
|
|
521
|
+
|
|
522
|
+
error_rate
|
|
523
|
+
end
|
|
524
|
+
|
|
525
|
+
private
|
|
526
|
+
|
|
527
|
+
def calculate_error_rate
|
|
528
|
+
failures = @recent_results.count(false)
|
|
529
|
+
failures.to_f / @recent_results.size
|
|
530
|
+
end
|
|
531
|
+
|
|
532
|
+
def alert_if_needed(error_rate)
|
|
533
|
+
if error_rate > @alert_threshold
|
|
534
|
+
puts "⚠️ ALERT: Error rate #{(error_rate * 100).round(2)}% exceeds threshold!"
|
|
535
|
+
# Send notification, page oncall, etc.
|
|
536
|
+
end
|
|
537
|
+
end
|
|
538
|
+
end
|
|
539
|
+
```
|
|
540
|
+
|
|
541
|
+
## Best Practices
|
|
542
|
+
|
|
543
|
+
### 1. Start with Tests
|
|
544
|
+
|
|
545
|
+
Write tests before optimizing:
|
|
546
|
+
|
|
547
|
+
```ruby
|
|
548
|
+
# Define test cases first
|
|
549
|
+
test_cases = [
|
|
550
|
+
{ input: {...}, expected: {...} },
|
|
551
|
+
# More test cases...
|
|
552
|
+
]
|
|
553
|
+
|
|
554
|
+
# Ensure baseline functionality
|
|
555
|
+
test_cases.each do |tc|
|
|
556
|
+
result = my_module.forward(tc[:input])
|
|
557
|
+
assert result[:category] == tc[:expected][:category]
|
|
558
|
+
end
|
|
559
|
+
|
|
560
|
+
# Then optimize
|
|
561
|
+
optimized = optimizer.compile(my_module, trainset: test_cases)
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
### 2. Use Meaningful Metrics
|
|
565
|
+
|
|
566
|
+
Define metrics that align with business goals:
|
|
567
|
+
|
|
568
|
+
```ruby
|
|
569
|
+
def business_aligned_metric(example, prediction)
|
|
570
|
+
# High-priority errors are more costly
|
|
571
|
+
if example[:expected_output][:priority] == 'High'
|
|
572
|
+
return prediction[:priority] == 'High' ? 1.0 : 0.0
|
|
573
|
+
else
|
|
574
|
+
return prediction[:category] == example[:expected_output][:category] ? 0.8 : 0.0
|
|
575
|
+
end
|
|
576
|
+
end
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
### 3. Monitor in Production
|
|
580
|
+
|
|
581
|
+
Always track production performance:
|
|
582
|
+
|
|
583
|
+
```ruby
|
|
584
|
+
class ProductionModule < DSPy::Module
|
|
585
|
+
def initialize
|
|
586
|
+
super
|
|
587
|
+
@predictor = DSPy::ChainOfThought.new(MySignature)
|
|
588
|
+
@monitor = PerformanceMonitor.new
|
|
589
|
+
@error_tracker = ErrorRateMonitor.new
|
|
590
|
+
end
|
|
591
|
+
|
|
592
|
+
def forward(input)
|
|
593
|
+
@monitor.monitor_request do
|
|
594
|
+
result = @predictor.forward(input)
|
|
595
|
+
@error_tracker.track_result(success: true)
|
|
596
|
+
result
|
|
597
|
+
rescue => e
|
|
598
|
+
@error_tracker.track_result(success: false)
|
|
599
|
+
raise
|
|
600
|
+
end
|
|
601
|
+
end
|
|
602
|
+
end
|
|
603
|
+
```
|
|
604
|
+
|
|
605
|
+
### 4. Version Your Modules
|
|
606
|
+
|
|
607
|
+
Track which version of your module is deployed:
|
|
608
|
+
|
|
609
|
+
```ruby
|
|
610
|
+
class EmailClassifierV2 < DSPy::Module
|
|
611
|
+
VERSION = '2.1.0'
|
|
612
|
+
|
|
613
|
+
def initialize
|
|
614
|
+
super
|
|
615
|
+
@predictor = DSPy::ChainOfThought.new(EmailClassificationSignature)
|
|
616
|
+
end
|
|
617
|
+
|
|
618
|
+
def forward(input)
|
|
619
|
+
result = @predictor.forward(input)
|
|
620
|
+
result.merge(model_version: VERSION)
|
|
621
|
+
end
|
|
622
|
+
end
|
|
623
|
+
```
|