dspy-evals 0.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +45 -0
- data/README.md +249 -0
- data/lib/dspy/evals/version.rb +7 -0
- data/lib/dspy/evals.rb +820 -0
- metadata +88 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: f39ede0bd93df0362c4cf8205ed8c5340cd52100cb0ba83a650f39583e496d76
+  data.tar.gz: 96eafcbb25a32b13d4c5b18e1685e7867f71e97fb59ae2aaa20a7aa4940d0db7
+SHA512:
+  metadata.gz: 8f2f94c8cc7f3660a4083a04cf6e4720d473baab59bc3dad06671068c8003731bd256859438ba13f575815c09c098fca4e24bd6b322ac9b07ad0d6c196e3ec1e
+  data.tar.gz: 863806288464a5859e8b9ee04b8ac19def6820ce31a88dfe5fc7b32605a638cb509f67d25b5dfd96641a09cac44627f4d42531489118d0c41c4ca597a6895a57
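The two archives named above (`metadata.gz`, `data.tar.gz`) are the members of the packaged `.gem` file, and the SHA256/SHA512 values are the standard RubyGems integrity entries for them. As a quick cross-check against a locally fetched copy, a minimal sketch (assuming the gem was downloaded as `dspy-evals-0.29.1.gem`, e.g. via `gem fetch dspy-evals -v 0.29.1`) might look like:

```ruby
# Sketch: recompute SHA256 digests of the gem's internal archives so they can
# be compared by eye against the checksums.yaml values published above.
require 'digest'
require 'rubygems/package'

gem_path = 'dspy-evals-0.29.1.gem' # assumed local path of the fetched gem

File.open(gem_path, 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)

    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end
```

The printed digests should match the SHA256 lines above.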
data/LICENSE
ADDED
@@ -0,0 +1,45 @@
+MIT License
+
+Copyright (c) 2025 Vicente Services SL
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+This project is a Ruby port of the original Python [DSPy library](https://github.com/stanfordnlp/dspy), which is licensed under the MIT License:
+
+MIT License
+
+Copyright (c) 2023 Stanford Future Data Systems
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,249 @@
+# DSPy.rb
+
+[RubyGems](https://rubygems.org/gems/dspy) · [Build status](https://github.com/vicentereig/dspy.rb/actions/workflows/ruby.yml) · [Documentation](https://vicentereig.github.io/dspy.rb/)
+
+> [!NOTE]
+> The core Prompt Engineering Framework is production-ready with
+> comprehensive documentation. I am focusing now on educational content on systematic Prompt Optimization and Context Engineering.
+> Your feedback is invaluable. If you encounter issues, please open an [issue](https://github.com/vicentereig/dspy.rb/issues). If you have suggestions, open a [new thread](https://github.com/vicentereig/dspy.rb/discussions).
+>
+> If you want to contribute, feel free to reach out to me to coordinate efforts: hey at vicente.services
+>
+> And, yes, this is 100% a legit project. :)
+
+**Build reliable LLM applications in idiomatic Ruby using composable, type-safe modules.**
+
+DSPy.rb is the Ruby framework for programming with large language models: it brings structured LLM programming, programmatic Prompt Engineering, and Context Engineering to Ruby developers.
+Instead of wrestling with prompt strings and parsing responses, you define typed signatures in idiomatic Ruby to compose and decompose AI Workflows and AI Agents.
+
+**Prompts are just Functions.** Traditional prompting is like writing code with string concatenation: it works until it doesn't. DSPy.rb brings you
+the programming approach pioneered by [dspy.ai](https://dspy.ai/): instead of crafting fragile prompts, you define modular
+signatures and let the framework handle the messy details.
+
+DSPy.rb is an idiomatic Ruby surgical port of Stanford's [DSPy framework](https://github.com/stanfordnlp/dspy). While implementing
+the core concepts of signatures, predictors, and the main optimization algorithms from the original Python library, DSPy.rb embraces Ruby
+conventions and adds Ruby-specific innovations like a Sorbet-based type system, ReAct loops, and production-ready integrations like non-blocking OpenTelemetry instrumentation.
+
+**What do you get?** Ruby LLM applications that actually scale and don't break when you sneeze.
+
+Check the [examples](examples/) and take them for a spin!
+
+## Your First DSPy Program
+
+### Installation
+
+Add to your Gemfile:
+
+```ruby
+gem 'dspy'
+```
+
+and run:
+
+```bash
+bundle install
+```
+
+### Your First Reliable Predictor
+
+```ruby
+# Configure DSPy globally to use your favorite LLM - you can override this at the instance level.
+DSPy.configure do |c|
+  c.lm = DSPy::LM.new('openai/gpt-4o-mini',
+                      api_key: ENV['OPENAI_API_KEY'],
+                      structured_outputs: true) # Enable OpenAI's native JSON mode
+end
+
+# Define a signature for sentiment classification - instead of writing a full prompt!
+class Classify < DSPy::Signature
+  description "Classify sentiment of a given sentence." # sets the goal of the underlying prompt
+
+  class Sentiment < T::Enum
+    enums do
+      Positive = new('positive')
+      Negative = new('negative')
+      Neutral = new('neutral')
+    end
+  end
+
+  # Structured Inputs: ensures you send only valid prompt inputs to your model
+  input do
+    const :sentence, String, description: 'The sentence to analyze'
+  end
+
+  # Structured Outputs: your predictor will validate the output of the model too.
+  output do
+    const :sentiment, Sentiment, description: 'The sentiment of the sentence'
+    const :confidence, Float, description: 'A number between 0.0 and 1.0'
+  end
+end
+
+# Wire it to the simplest prompting technique - a Predict.
+classify = DSPy::Predict.new(Classify)
+# It may raise an error if you mess up the inputs or your LLM messes up the outputs.
+result = classify.call(sentence: "This book was super fun to read!")
+
+puts result.sentiment # => #<Sentiment::Positive>
+puts result.confidence # => 0.85
+```
+
+### Access to 200+ Models Across 5 Providers
+
+DSPy.rb provides unified access to major LLM providers with provider-specific optimizations:
+
+```ruby
+# OpenAI (GPT-4, GPT-4o, GPT-4o-mini, GPT-5, etc.)
+DSPy.configure do |c|
+  c.lm = DSPy::LM.new('openai/gpt-4o-mini',
+                      api_key: ENV['OPENAI_API_KEY'],
+                      structured_outputs: true) # Native JSON mode
+end
+
+# Google Gemini (Gemini 1.5 Pro, Flash, Gemini 2.0, etc.)
+DSPy.configure do |c|
+  c.lm = DSPy::LM.new('gemini/gemini-2.5-flash',
+                      api_key: ENV['GEMINI_API_KEY'],
+                      structured_outputs: true) # Native structured outputs
+end
+
+# Anthropic Claude (Claude 3.5, Claude 4, etc.)
+DSPy.configure do |c|
+  c.lm = DSPy::LM.new('anthropic/claude-sonnet-4-5-20250929',
+                      api_key: ENV['ANTHROPIC_API_KEY'],
+                      structured_outputs: true) # Tool-based extraction (default)
+end
+
+# Ollama - Run any local model (Llama, Mistral, Gemma, etc.)
+DSPy.configure do |c|
+  c.lm = DSPy::LM.new('ollama/llama3.2') # Free, runs locally, no API key needed
+end
+
+# OpenRouter - Access to 200+ models from multiple providers
+DSPy.configure do |c|
+  c.lm = DSPy::LM.new('openrouter/deepseek/deepseek-chat-v3.1:free',
+                      api_key: ENV['OPENROUTER_API_KEY'])
+end
+```
+
+## What You Get
+
+**Developer Experience:**
+- LLM provider support using official Ruby clients:
+  - [OpenAI Ruby](https://github.com/openai/openai-ruby) with vision model support
+  - [Anthropic Ruby SDK](https://github.com/anthropics/anthropic-sdk-ruby) with multimodal capabilities
+  - [Google Gemini API](https://ai.google.dev/) with native structured outputs
+  - [Ollama](https://ollama.com/) via OpenAI compatibility layer for local models
+- **Multimodal Support** - Complete image analysis with DSPy::Image, type-safe bounding boxes, vision-capable models
+- Runtime type checking with [Sorbet](https://sorbet.org/) including T::Enum and union types
+- Type-safe tool definitions for ReAct agents
+- Comprehensive instrumentation and observability
+
+**Core Building Blocks:**
+- **Signatures** - Define input/output schemas using Sorbet types with T::Enum and union type support
+- **Predict** - LLM completion with structured data extraction and multimodal support
+- **Chain of Thought** - Step-by-step reasoning for complex problems with automatic prompt optimization
+- **ReAct** - Tool-using agents with type-safe tool definitions and error recovery
+- **Module Composition** - Combine multiple LLM calls into production-ready workflows
+
+**Optimization & Evaluation:**
+- **Prompt Objects** - Manipulate prompts as first-class objects instead of strings
+- **Typed Examples** - Type-safe training data with automatic validation
+- **Evaluation Framework** - Advanced metrics beyond simple accuracy with error-resilient pipelines
+- **MIPROv2 Optimization** - Advanced Bayesian optimization with Gaussian Processes, multiple optimization strategies, auto-config presets, and storage persistence
+
+**Production Features:**
+- **Reliable JSON Extraction** - Native structured outputs for OpenAI and Gemini, Anthropic tool-based extraction, and automatic strategy selection with fallback
+- **Type-Safe Configuration** - Strategy enums with automatic provider optimization (Strict/Compatible modes)
+- **Smart Retry Logic** - Progressive fallback with exponential backoff for handling transient failures
+- **Zero-Config Langfuse Integration** - Set env vars and get automatic OpenTelemetry traces in Langfuse
+- **Performance Caching** - Schema and capability caching for faster repeated operations
+- **File-based Storage** - Optimization result persistence with versioning
+- **Structured Logging** - JSON and key=value formats with span tracking
+
+## Recent Achievements
+
+DSPy.rb has rapidly evolved from experimental to production-ready:
+
+### Foundation
+- ✅ **JSON Parsing Reliability** - Native OpenAI structured outputs with adaptive retry logic and schema-aware fallbacks
+- ✅ **Type-Safe Strategy Configuration** - Provider-optimized strategy selection and enum-backed optimizer presets
+- ✅ **Core Module System** - Predict, ChainOfThought, ReAct, CodeAct with type safety
+- ✅ **Production Observability** - OpenTelemetry, New Relic, and Langfuse integration
+- ✅ **Advanced Optimization** - MIPROv2 with Bayesian optimization, Gaussian Processes, and multi-mode search
+
+### Recent Advances
+- ✅ **MIPROv2 ADE Integrity (v0.29.1)** - Stratified train/val/test splits, honest precision accounting, and enum-driven `--auto` presets with integration coverage
+- ✅ **Instruction Deduplication (v0.29.1)** - Candidate generation now filters repeated programs so optimization logs highlight unique strategies
+- ✅ **GEPA Teleprompter (v0.29.0)** - Genetic-Pareto reflective prompt evolution with merge proposer scheduling, reflective mutation, and ADE demo parity
+- ✅ **Optimizer Utilities Parity (v0.29.0)** - Bootstrap strategies, dataset summaries, and Layer 3 utilities unlock multi-predictor programs on Ruby
+- ✅ **Observability Hardening (v0.29.0)** - OTLP exporter runs on a single-thread executor, preventing frozen SSL contexts without blocking spans
+- ✅ **Documentation Refresh (v0.29.x)** - New GEPA guide plus ADE optimization docs covering presets, stratified splits, and error-handling defaults
+
+**Current Focus Areas:**
+
+### Production Readiness
+- 🚧 **Production Patterns** - Real-world usage validation and performance optimization
+- 🚧 **Ruby Ecosystem Integration** - Rails integration, Sidekiq compatibility, deployment patterns
+
+### Community & Adoption
+- 🚧 **Community Examples** - Real-world applications and case studies
+- 🚧 **Contributor Experience** - Making it easier to contribute and extend
+- 🚧 **Performance Benchmarks** - Comparative analysis vs other frameworks
+
+**v1.0 Philosophy:**
+v1.0 will be released after extensive production battle-testing, not after checking off features.
+The API is already stable - v1.0 represents confidence in production reliability backed by real-world validation.
+
+## Documentation
+
+📖 **[Complete Documentation Website](https://vicentereig.github.io/dspy.rb/)**
+
+### LLM-Friendly Documentation
+
+For LLMs and AI assistants working with DSPy.rb:
+- **[llms.txt](https://vicentereig.github.io/dspy.rb/llms.txt)** - Concise reference optimized for LLMs
+- **[llms-full.txt](https://vicentereig.github.io/dspy.rb/llms-full.txt)** - Comprehensive API documentation
+
+### Getting Started
+- **[Installation & Setup](docs/src/getting-started/installation.md)** - Detailed installation and configuration
+- **[Quick Start Guide](docs/src/getting-started/quick-start.md)** - Your first DSPy programs
+- **[Core Concepts](docs/src/getting-started/core-concepts.md)** - Understanding signatures, predictors, and modules
+
+### Prompt Engineering
+- **[Signatures & Types](docs/src/core-concepts/signatures.md)** - Define typed interfaces for LLM operations
+- **[Predictors](docs/src/core-concepts/predictors.md)** - Predict, ChainOfThought, ReAct, and more
+- **[Modules & Pipelines](docs/src/core-concepts/modules.md)** - Compose complex multi-stage workflows
+- **[Multimodal Support](docs/src/core-concepts/multimodal.md)** - Image analysis with vision-capable models
+- **[Examples & Validation](docs/src/core-concepts/examples.md)** - Type-safe training data
+- **[Rich Types](docs/src/advanced/complex-types.md)** - Sorbet type integration with automatic coercion for structs, enums, and arrays
+- **[Composable Pipelines](docs/src/advanced/pipelines.md)** - Manual module composition patterns
+
+### Prompt Optimization
+- **[Evaluation Framework](docs/src/optimization/evaluation.md)** - Advanced metrics beyond simple accuracy
+- **[Prompt Optimization](docs/src/optimization/prompt-optimization.md)** - Manipulate prompts as objects
+- **[MIPROv2 Optimizer](docs/src/optimization/miprov2.md)** - Advanced Bayesian optimization with Gaussian Processes
+- **[GEPA Optimizer](docs/src/optimization/gepa.md)** *(beta)* - Reflective mutation with optional reflection LMs
+
+### Context Engineering
+- **[Tools](docs/src/core-concepts/toolsets.md)** - Tool-wielding agents
+- **[Agentic Memory](docs/src/core-concepts/memory.md)** - Memory Tools & Agentic Loops
+- **[RAG Patterns](docs/src/advanced/rag.md)** - Manual RAG implementation with external services
+
+### Production Features
+- **[Observability](docs/src/production/observability.md)** - Zero-config Langfuse integration with a dedicated export worker that never blocks your LLMs
+- **[Storage System](docs/src/production/storage.md)** - Persistence and optimization result storage
+- **[Custom Metrics](docs/src/advanced/custom-metrics.md)** - Proc-based evaluation logic
+
+## License
+
+This project is licensed under the MIT License.
data/lib/dspy/evals.rb
ADDED
@@ -0,0 +1,820 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'polars'
+require 'concurrent'
+require 'sorbet-runtime'
+require_relative 'example'
+require_relative 'callbacks'
+
+module DSPy
+  # Core evaluation framework for DSPy programs
+  # Supports single evaluations, batch evaluations, and optimization workflows
+  class Evals
+    extend T::Sig
+
+    # Result of evaluating a single example
+    class EvaluationResult
+      extend T::Sig
+
+      sig { returns(T.untyped) }
+      attr_reader :example
+
+      sig { returns(T.untyped) }
+      attr_reader :prediction
+
+      sig { returns(T.untyped) }
+      attr_reader :trace
+
+      sig { returns(T::Hash[Symbol, T.untyped]) }
+      attr_reader :metrics
+
+      sig { returns(T::Boolean) }
+      attr_reader :passed
+
+      sig do
+        params(
+          example: T.untyped,
+          prediction: T.untyped,
+          trace: T.untyped,
+          metrics: T::Hash[Symbol, T.untyped],
+          passed: T::Boolean
+        ).void
+      end
+      def initialize(example:, prediction:, trace:, metrics:, passed:)
+        @example = example
+        @prediction = prediction
+        @trace = trace
+        @metrics = metrics
+        @passed = passed
+      end
+
+      sig { returns(T::Hash[Symbol, T.untyped]) }
+      def to_h
+        {
+          example: @example,
+          prediction: @prediction.respond_to?(:to_h) ? @prediction.to_h : @prediction,
+          trace: @trace,
+          metrics: @metrics,
+          passed: @passed
+        }
+      end
+    end
+
+    # Batch evaluation results with aggregated metrics
+    class BatchEvaluationResult
+      extend T::Sig
+
+      sig { returns(T::Array[EvaluationResult]) }
+      attr_reader :results
+
+      sig { returns(T::Hash[Symbol, T.untyped]) }
+      attr_reader :aggregated_metrics
+
+      sig { returns(Integer) }
+      attr_reader :total_examples
+
+      sig { returns(Integer) }
+      attr_reader :passed_examples
+
+      sig { returns(Float) }
+      attr_reader :pass_rate
+
+      sig { returns(Float) }
+      attr_reader :score
+
+      sig do
+        params(
+          results: T::Array[EvaluationResult],
+          aggregated_metrics: T::Hash[Symbol, T.untyped]
+        ).void
+      end
+      def initialize(results:, aggregated_metrics:)
+        @results = results.freeze
+        @aggregated_metrics = aggregated_metrics.freeze
+        @total_examples = results.length
+        @passed_examples = results.count(&:passed)
+        @pass_rate = @total_examples > 0 ? @passed_examples.to_f / @total_examples : 0.0
+        score_avg = aggregated_metrics[:score_avg] || @pass_rate
+        @score = (score_avg * 100).round(2)
+      end
+
+      sig { returns(T::Hash[Symbol, T.untyped]) }
+      def to_h
+        {
+          total_examples: @total_examples,
+          passed_examples: @passed_examples,
+          pass_rate: @pass_rate,
+          score: @score,
+          aggregated_metrics: @aggregated_metrics,
+          results: @results.map(&:to_h)
+        }
+      end
+
+      sig { returns(Polars::DataFrame) }
+      def to_polars
+        rows = @results.each_with_index.map do |result, index|
+          {
+            "index" => index,
+            "passed" => result.passed,
+            "score" => result.metrics[:score],
+            "example" => serialize_for_polars(result.example),
+            "prediction" => serialize_for_polars(result.prediction),
+            "metrics" => serialize_for_polars(result.metrics),
+            "trace" => serialize_for_polars(result.trace)
+          }
+        end
+
+        Polars::DataFrame.new(rows)
+      end
+
+      private
+
+      def serialize_for_polars(value)
+        case value
+        when NilClass, TrueClass, FalseClass, Numeric, String
+          value
+        when Hash
+          JSON.generate(value)
+        when Array
+          JSON.generate(value)
+        else
+          if value.respond_to?(:to_h)
+            JSON.generate(value.to_h)
+          else
+            value.to_s
+          end
+        end
+      end
+    end
+
+    sig { returns(T.untyped) }
+    attr_reader :program
+
+    sig { returns(T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T::Boolean))) }
+    attr_reader :metric
+
+    sig { returns(T.nilable(Integer)) }
+    attr_reader :num_threads
+
+    sig { returns(T.nilable(Integer)) }
+    attr_reader :max_errors
+
+    sig { returns(T::Boolean) }
+    attr_reader :provide_traceback
+
+    sig { returns(Float) }
+    attr_reader :failure_score
+
+    sig { returns(T.nilable(EvaluationResult)) }
+    attr_reader :last_example_result
+
+    sig { returns(T.nilable(BatchEvaluationResult)) }
+    attr_reader :last_batch_result
+
+    include DSPy::Callbacks
+
+    create_before_callback :call, wrap: false
+    create_after_callback :call, wrap: false
+    create_before_callback :evaluate, wrap: false
+    create_after_callback :evaluate, wrap: false
+
+    class << self
+      def before_example(callback = nil, &block)
+        before(callback, target: :call, &block)
+      end
+
+      def after_example(callback = nil, &block)
+        after(callback, target: :call, &block)
+      end
+
+      def before_batch(callback = nil, &block)
+        before(callback, target: :evaluate, &block)
+      end
+
+      def after_batch(callback = nil, &block)
+        after(callback, target: :evaluate, &block)
+      end
+
+      def reset_callbacks!
+        @callbacks = {}
+      end
+    end
+
+    sig do
+      params(
+        program: T.untyped,
+        metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T::Boolean)),
+        num_threads: T.nilable(Integer),
+        max_errors: T.nilable(Integer),
+        failure_score: T.nilable(Numeric),
+        provide_traceback: T::Boolean
+      ).void
+    end
+    def initialize(program, metric: nil, num_threads: 1, max_errors: 5, failure_score: 0.0, provide_traceback: true)
+      @program = program
+      @metric = metric
+      @num_threads = num_threads || 1
+      @max_errors = max_errors || 5
+      @provide_traceback = provide_traceback
+      @failure_score = failure_score ? failure_score.to_f : 0.0
+      @last_example_result = nil
+      @last_batch_result = nil
+    end
+
+    # Evaluate program on a single example
+    sig { params(example: T.untyped, trace: T.nilable(T.untyped)).returns(EvaluationResult) }
+    def call(example, trace: nil)
+      run_callbacks(:before, :call, example: example)
+
+      DSPy::Context.with_span(
+        operation: 'evaluation.example',
+        'dspy.module' => 'Evaluator',
+        'evaluation.program' => @program.class.name,
+        'evaluation.has_metric' => !@metric.nil?
+      ) do
+        begin
+          perform_call(example, trace: trace)
+        rescue => e
+          build_error_result(example, e, trace: trace)
+        end
+      end.then do |result|
+        @last_example_result = result
+        emit_example_observation(example, result)
+        run_callbacks(:after, :call, example: example, result: result)
+        result
+      end
+    end
+
+    # Evaluate program on multiple examples
+    sig do
+      params(
+        devset: T::Array[T.untyped],
+        display_progress: T::Boolean,
+        display_table: T::Boolean,
+        return_outputs: T::Boolean
+      ).returns(BatchEvaluationResult)
+    end
+    def evaluate(devset, display_progress: true, display_table: false, return_outputs: true)
+      run_callbacks(:before, :evaluate, devset: devset)
+
+      DSPy::Context.with_span(
+        operation: 'evaluation.batch',
+        'dspy.module' => 'Evaluator',
+        'evaluation.program' => @program.class.name,
+        'evaluation.num_examples' => devset.length,
+        'evaluation.has_metric' => !@metric.nil?,
+        'evaluation.num_threads' => @num_threads
+      ) do
+        if display_progress
+          puts "Evaluating #{devset.length} examples..."
+        end
+
+        results = if parallel_execution?
+          evaluate_in_parallel(devset, display_progress: display_progress)
+        else
+          evaluate_sequential(devset, display_progress: display_progress)
+        end
+
+        # Aggregate metrics
+        aggregated_metrics = aggregate_metrics(results)
+
+        batch_result = BatchEvaluationResult.new(
+          results: results,
+          aggregated_metrics: aggregated_metrics
+        )
+
+        if display_table
+          display_results_table(batch_result)
+        end
+
+        # Emit batch completion event
+        DSPy.log('evaluation.batch_complete', **{
+          'evaluation.program_class' => @program.class.name,
+          'evaluation.total_examples' => batch_result.total_examples,
+          'evaluation.passed_examples' => batch_result.passed_examples,
+          'evaluation.pass_rate' => batch_result.pass_rate,
+          'evaluation.aggregated_metrics' => aggregated_metrics
+        })
+
+        if display_progress
+          puts "Evaluation complete: #{batch_result.passed_examples}/#{batch_result.total_examples} passed (#{(batch_result.pass_rate * 100).round(1)}%)"
+        end
+
+        batch_result
+      end.then do |batch_result|
+        @last_batch_result = batch_result
+        emit_batch_observation(devset, batch_result)
+        run_callbacks(:after, :evaluate, devset: devset, result: batch_result)
+        batch_result
+      end
+    end
+
+    private
+
+    def parallel_execution?
+      (@num_threads || 1) > 1
+    end
+
+    def evaluate_sequential(devset, display_progress:)
+      results = []
+      errors = 0
+      passed_count = 0
+
+      devset.each_with_index do |example, index|
+        break if errors >= @max_errors
+
+        result = safe_call(example)
+        results << result
+
+        if result.passed
+          passed_count += 1
+        else
+          errors += 1
+        end
+
+        if display_progress && (index + 1) % 10 == 0
+          log_progress(index + 1, devset.length, passed_count)
+        end
+      end
+
+      results
+    end
+
+    def evaluate_in_parallel(devset, display_progress:)
+      total = devset.length
+      results = Array.new(total)
+      errors = 0
+      processed = 0
+      passed_count = 0
+
+      executor = Concurrent::ThreadPoolExecutor.new(
+        min_threads: @num_threads,
+        max_threads: @num_threads,
+        max_queue: [total, 1].max,
+        idletime: 60
+      )
+
+      enumerator = devset.each_with_index
+
+      loop do
+        break if errors >= @max_errors
+
+        batch = []
+        @num_threads.times do
+          begin
+            example = enumerator.next
+            batch << { example: example[0], index: example[1] }
+          rescue StopIteration
+            break
+          end
+        end
+
+        break if batch.empty?
+
+        futures = batch.map do |item|
+          Concurrent::Promises.future_on(executor) do
+            [:ok, item[:index], safe_call(item[:example])]
+          rescue => e
+            [:error, item[:index], e]
+          end
+        end
+
+        futures.each do |future|
+          status, index, payload = future.value!
+          example = batch.find { |entry| entry[:index] == index }[:example]
+
+          result = if status == :ok
+            payload
+          else
+            errors += 1
+            puts "Error processing example #{index}: #{payload.message}" if display_progress
+            build_error_result(example, payload)
+          end
+
+          results[index] = result
+          processed += 1
+          if result.passed
+            passed_count += 1
+          else
+            errors += 1 unless status == :error
+          end
+
+          if display_progress && (processed % 10).zero?
+            log_progress(processed, total, passed_count)
+          end
+        end
+      end
+
+      executor.shutdown
+      executor.wait_for_termination
+
+      results.compact
+    end
+
+    def safe_call(example)
+      call(example)
+    rescue => e
+      build_error_result(example, e)
+    end
+
+    def perform_call(example, trace:)
+      # Extract input from example - support both hash and object formats
+      input_values = extract_input_values(example)
+
+      # Run prediction
+      prediction = @program.call(**input_values)
+
+      # Calculate metrics if provided
+      metrics = {}
+      passed = true
+
+      if @metric
+        begin
+          metric_result = @metric.call(example, prediction)
+          if metric_result.is_a?(Hash)
+            metrics = symbolize_keys(metric_result)
+            passed_flag = metrics.key?(:passed) ? metrics[:passed] : metrics['passed']
+            passed = passed_flag.nil? ? true : !!passed_flag
+          else
+            passed = !!metric_result
+            metrics[:passed] = passed
+          end
+        rescue => e
+          passed = false
+          metrics[:error] = e.message
+          metrics[:passed] = false
+          metrics[:score] = @failure_score
+        end
+      end
+
+      metrics[:passed] = passed unless metrics.key?(:passed)
+      metrics[:score] = normalize_score(metrics[:score], passed) if metrics.key?(:score)
+      metrics[:score] ||= passed ? 1.0 : 0.0
+
+      EvaluationResult.new(
+        example: example,
+        prediction: prediction,
+        trace: trace,
+        metrics: metrics,
+        passed: passed
+      )
+    end
+
+    def build_error_result(example, error, trace: nil)
+      metrics = {
+        error: error.message,
+        passed: false,
+        score: @failure_score
+      }
+      metrics[:traceback] = error.backtrace&.first(10) || [] if @provide_traceback
+
+      EvaluationResult.new(
+        example: example,
+        prediction: nil,
+        trace: trace,
+        metrics: metrics,
+        passed: false
+      )
+    end
+
+    def log_progress(processed, total, passed_count)
+      puts "Processed #{processed}/#{total} examples (#{passed_count} passed)"
+    end
+
+    # Extract input values from example in various formats
+    sig { params(example: T.untyped).returns(T::Hash[Symbol, T.untyped]) }
+    def extract_input_values(example)
+      case example
+      when DSPy::Example
+        # Preferred format: DSPy::Example object with type safety
+        example.input_values
+      when Hash
+        # Check if it has an :input key (structured format)
+        if example.key?(:input)
+          input_data = example[:input]
+          input_data.is_a?(Hash) ? input_data.transform_keys(&:to_sym) : input_data
+        elsif example.key?('input')
+          input_data = example['input']
+          input_data.is_a?(Hash) ? input_data.transform_keys(&:to_sym) : input_data
+        else
+          # Legacy format - assume the whole hash is input
+          if example.keys.first.is_a?(String)
+            example.transform_keys(&:to_sym)
+          else
+            example
+          end
+        end
+      when ->(ex) { ex.respond_to?(:input_values) }
+        # Object with input_values method (Example-like)
+        example.input_values
+      when ->(ex) { ex.respond_to?(:input) }
+        # Object with input method
+        input_data = example.input
+        input_data.is_a?(Hash) ? input_data.transform_keys(&:to_sym) : input_data
+      when ->(ex) { ex.respond_to?(:to_h) }
+        # Object that can be converted to hash
+        hash = example.to_h
+        if hash.key?(:input)
+          input_data = hash[:input]
+          input_data.is_a?(Hash) ? input_data.transform_keys(&:to_sym) : input_data
+        elsif hash.key?('input')
+          input_data = hash['input']
+          input_data.is_a?(Hash) ? input_data.transform_keys(&:to_sym) : input_data
+        else
+          hash.is_a?(Hash) ? hash.transform_keys(&:to_sym) : hash
+        end
+      else
+        # Try to extract by introspection
+        if example.respond_to?(:instance_variables)
+          vars = {}
+          example.instance_variables.each do |var|
+            key = var.to_s.delete('@').to_sym
+            vars[key] = example.instance_variable_get(var)
+          end
+          vars
+        else
+          raise ArgumentError, "Cannot extract input values from example: #{example.class}"
+        end
+      end
+    end
+
+    # Extract expected values for metric comparison (used internally)
+    sig { params(example: T.untyped).returns(T.nilable(T::Hash[Symbol, T.untyped])) }
+    def extract_expected_values(example)
+      case example
+      when DSPy::Example
+        example.expected_values
+      when Hash
+        if example.key?(:expected)
+          expected_data = example[:expected]
+          expected_data.is_a?(Hash) ? expected_data.transform_keys(&:to_sym) : expected_data
+        elsif example.key?('expected')
+          expected_data = example['expected']
+          expected_data.is_a?(Hash) ? expected_data.transform_keys(&:to_sym) : expected_data
+        else
+          # Legacy format - no separate expected values
+          nil
+        end
+      when ->(ex) { ex.respond_to?(:expected_values) }
+        example.expected_values
+      when ->(ex) { ex.respond_to?(:expected) }
+        expected_data = example.expected
+        expected_data.is_a?(Hash) ? expected_data.transform_keys(&:to_sym) : expected_data
+      else
+        nil
+      end
+    end
+
+    # Aggregate metrics across all results
+    sig { params(results: T::Array[EvaluationResult]).returns(T::Hash[Symbol, T.untyped]) }
+    def aggregate_metrics(results)
+      return {} if results.empty?
+
+      total = results.length
+      passed = results.count(&:passed)
+
+      aggregated = {
+        total_examples: total,
+        passed_examples: passed,
+        failed_examples: results.count { |r| !r.passed }
+      }
+
+      score_values = results.filter_map do |result|
+        score = result.metrics[:score]
+        score if score.is_a?(Numeric)
+      end
+
+      if score_values.any?
+        aggregated[:score_sum] = score_values.sum
+        aggregated[:score_avg] = score_values.sum.to_f / score_values.length
+        aggregated[:score_min] = score_values.min
+        aggregated[:score_max] = score_values.max
+      else
+        aggregated[:score_avg] = passed.positive? && total.positive? ? passed.to_f / total : 0.0
+      end
+
+      # Aggregate other numeric metrics
+      numeric_metrics = {}
+      results.each do |result|
+        result.metrics.each do |key, value|
+          next if [:error, :traceback, :passed, :score].include?(key)
+          next unless value.is_a?(Numeric)
+
+          numeric_metrics[key] ||= []
+          numeric_metrics[key] << value
+        end
+      end
+
+      numeric_metrics.each do |key, values|
+        aggregated[:"#{key}_avg"] = values.sum.to_f / values.length
+        aggregated[:"#{key}_min"] = values.min
+        aggregated[:"#{key}_max"] = values.max
+      end
+
+      aggregated[:pass_rate] = total.positive? ? passed.to_f / total : 0.0
+
+      aggregated
+    end
+
+    # Display results in a table format
+    sig { params(batch_result: BatchEvaluationResult).void }
+    def display_results_table(batch_result)
+      puts "\nEvaluation Results:"
+      puts "=" * 50
+      puts "Total Examples: #{batch_result.total_examples}"
+      puts "Passed: #{batch_result.passed_examples}"
+      puts "Failed: #{batch_result.total_examples - batch_result.passed_examples}"
+      puts "Pass Rate: #{(batch_result.pass_rate * 100).round(1)}%"
+
+      if batch_result.aggregated_metrics.any?
+        puts "\nAggregated Metrics:"
+        batch_result.aggregated_metrics.each do |key, value|
+          next if [:total_examples, :passed_examples, :failed_examples, :pass_rate].include?(key)
+          puts " #{key}: #{value.is_a?(Float) ? value.round(3) : value}"
+        end
+      end
+
+      puts "=" * 50
+    end
+
+    def emit_example_observation(example, result)
+      DSPy.event('evals.example.complete', {
+        program: @program.class.name,
+        example_id: extract_example_id(example),
+        passed: result.passed,
+        score: result.metrics[:score],
+        error: result.metrics[:error]
+      })
+    rescue => e
+      DSPy.log('evals.example.observation_error', error: e.message)
+    end
+
+    def emit_batch_observation(devset, batch_result)
+      DSPy.event('evals.batch.complete', {
+        program: @program.class.name,
+        dataset_size: devset.length,
+        total_examples: batch_result.total_examples,
+        passed_examples: batch_result.passed_examples,
+        pass_rate: batch_result.pass_rate,
+        score: batch_result.score
+      })
+    rescue => e
+      DSPy.log('evals.batch.observation_error', error: e.message)
+    end
+
+    def extract_example_id(example)
+      if example.respond_to?(:id)
+        example.id
+      elsif example.is_a?(Hash)
+        example[:id] || example['id']
+      else
+        nil
+      end
+    rescue
+      nil
+    end
+
+    def symbolize_keys(hash)
+      hash.each_with_object({}) do |(key, value), memo|
+        memo[key.respond_to?(:to_sym) ? key.to_sym : key] = value
+      end
+    end
+
+    def normalize_score(value, passed)
+      case value
+      when Numeric
+        value.to_f
+      when TrueClass, FalseClass
+        value ? 1.0 : 0.0
+      else
+        passed ? 1.0 : 0.0
+      end
+    end
+
+  end
+
+  # Common metric functions for evaluation
+  module Metrics
+    extend T::Sig
+
+    # Exact match metric - checks if prediction exactly matches expected output
+    sig do
+      params(
+        field: Symbol,
+        case_sensitive: T::Boolean
+      ).returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T::Boolean))
+    end
+    def self.exact_match(field: :answer, case_sensitive: true)
+      proc do |example, prediction|
+        expected = extract_field(example, field)
+        actual = extract_field(prediction, field)
+
+        next false if expected.nil? || actual.nil?
+
+        if case_sensitive
+          expected.to_s == actual.to_s
+        else
+          expected.to_s.downcase == actual.to_s.downcase
+        end
+      end
+    end
+
+    # Contains metric - checks if prediction contains expected substring
+    sig do
+      params(
+        field: Symbol,
+        case_sensitive: T::Boolean
+      ).returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T::Boolean))
+    end
+    def self.contains(field: :answer, case_sensitive: false)
+      proc do |example, prediction|
+        expected = extract_field(example, field)
+        actual = extract_field(prediction, field)
+
+        next false if expected.nil? || actual.nil?
+
+        if case_sensitive
+          actual.to_s.include?(expected.to_s)
+        else
+          actual.to_s.downcase.include?(expected.to_s.downcase)
+        end
+      end
+    end
+
+    # Numeric difference metric - checks if prediction is within tolerance of expected value
+    sig do
+      params(
+        field: Symbol,
+        tolerance: Float
+      ).returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T::Hash[Symbol, T.untyped]))
+    end
+    def self.numeric_difference(field: :answer, tolerance: 0.01)
+      proc do |example, prediction|
+        expected = extract_field(example, field)
+        actual = extract_field(prediction, field)
+
+        next { passed: false, error: "Missing values" } if expected.nil? || actual.nil?
+
+        begin
+          expected_num = Float(expected)
+          actual_num = Float(actual)
+          difference = (expected_num - actual_num).abs
+          passed = difference <= tolerance
+
+          {
+            passed: passed,
+            difference: difference,
+            expected: expected_num,
+            actual: actual_num,
+            tolerance: tolerance
+          }
+        rescue ArgumentError
+          { passed: false, error: "Non-numeric values" }
+        end
+      end
+    end
+
+    # Composite metric - combines multiple metrics with AND logic
+    def self.composite_and(*metrics)
+      proc do |example, prediction|
+        results = {}
+        all_passed = true
+
+        metrics.each_with_index do |metric, index|
+          result = metric.call(example, prediction)
+
+          if result.is_a?(Hash)
+            results[:"metric_#{index}"] = result
+            all_passed &&= result[:passed] || result['passed'] || false
+          else
+            passed = !!result
+            results[:"metric_#{index}"] = { passed: passed }
+            all_passed &&= passed
+          end
+        end
+
+        results[:passed] = all_passed
+        results
+      end
+    end
+
+    private
+
+    # Extract field value from example or prediction
+    sig { params(obj: T.untyped, field: Symbol).returns(T.untyped) }
+    def self.extract_field(obj, field)
+      case obj
+      when Hash
+        obj[field] || obj[field.to_s]
+      when ->(o) { o.respond_to?(field) }
+        obj.send(field)
+      when ->(o) { o.respond_to?(:to_h) }
+        hash = obj.to_h
+        hash[field] || hash[field.to_s]
+      else
+        nil
+      end
+    end
+  end
+end
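Reading the file as a whole: `DSPy::Evals` wraps a DSPy program, runs it example-by-example (sequentially, or on a `Concurrent::ThreadPoolExecutor` when `num_threads > 1`), applies an optional metric callable, and aggregates pass rates and scores into a `BatchEvaluationResult` that can be exported to Polars. A minimal usage sketch follows; the `QA` signature, the lambda metric, and the hash-format devset are illustrative assumptions, not code shipped in this gem.

```ruby
require 'dspy'
require 'dspy/evals'

# Hypothetical QA signature; any program responding to #call(**inputs) works here.
class QA < DSPy::Signature
  description "Answer the question concisely."

  input  { const :question, String }
  output { const :answer, String }
end

program = DSPy::Predict.new(QA)

# A metric is any callable taking (example, prediction); returning a Hash lets
# you attach a :score that aggregate_metrics will average across the devset.
metric = lambda do |example, prediction|
  expected = example.dig(:expected, :answer).to_s
  passed   = prediction.answer.to_s.strip.casecmp?(expected)
  { passed: passed, score: passed ? 1.0 : 0.0 }
end

evaluator = DSPy::Evals.new(
  program,
  metric: metric,
  num_threads: 4,    # > 1 selects the thread-pool path in evaluate_in_parallel
  max_errors: 5,
  failure_score: 0.0
)

# Hash examples in the structured format handled by extract_input_values.
devset = [
  { input: { question: "What is 2 + 2?" }, expected: { answer: "4" } },
  { input: { question: "What is the capital of France?" }, expected: { answer: "Paris" } }
]

batch = evaluator.evaluate(devset, display_progress: true)
puts batch.pass_rate   # fraction of passing examples
puts batch.score       # score_avg * 100, rounded to 2 decimals
df = batch.to_polars   # Polars::DataFrame for further analysis
```

Returning a Hash from the metric, rather than a bare boolean, is what lets `aggregate_metrics` compute `score_avg`, `score_min`, and `score_max` across the devset.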
metadata
ADDED
@@ -0,0 +1,88 @@
+--- !ruby/object:Gem::Specification
+name: dspy-evals
+version: !ruby/object:Gem::Version
+  version: 0.29.1
+platform: ruby
+authors:
+- Vicente Reig Rincón de Arellano
+bindir: bin
+cert_chain: []
+date: 2025-10-23 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: dspy
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.29.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.29.1
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: polars-df
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.15'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.15'
+description: Provides the DSPy::Evals runtime, concurrency, callbacks, and export
+  helpers for benchmarking Ruby DSPy programs.
+email:
+- hey@vicente.services
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSE
+- README.md
+- lib/dspy/evals.rb
+- lib/dspy/evals/version.rb
+homepage: https://github.com/vicentereig/dspy.rb
+licenses:
+- MIT
+metadata:
+  github_repo: git@github.com:vicentereig/dspy.rb
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.3.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.5
+specification_version: 4
+summary: Evaluation utilities for DSPy.rb programs.
+test_files: []
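Per the gemspec above, dspy-evals 0.29.1 pins `dspy` at exactly 0.29.1 and declares `concurrent-ruby` (~> 1.3) and `polars-df` (~> 0.15) as runtime dependencies, targeting Ruby >= 3.3.0. A minimal Gemfile sketch for pulling it in (the explicit version pin is an assumption for reproducibility, not a requirement):

```ruby
# Gemfile sketch: the dspy (= 0.29.1), concurrent-ruby, and polars-df runtime
# dependencies listed in the gemspec are resolved automatically by Bundler.
source 'https://rubygems.org'

gem 'dspy-evals', '0.29.1'
```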