dspy 0.33.0 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -4
- data/lib/dspy/evals.rb +45 -2
- data/lib/dspy/scores/data_type.rb +30 -0
- data/lib/dspy/scores/evaluators.rb +279 -0
- data/lib/dspy/scores/score_event.rb +56 -0
- data/lib/dspy/scores.rb +135 -0
- data/lib/dspy/version.rb +1 -1
- data/lib/dspy.rb +1 -0
- metadata +5 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3bf98e1e8f5f939799d7e14717d8859b10830144a9c23f1d4818e6fa021fb46a
|
|
4
|
+
data.tar.gz: 154e27f97ed2c3ae5b8a04f2d3941a93e79a0e88bae8459fd73a85a9d03ed186
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 298305a05b5a38806d67989c01ed2a476f291d13ec6c8a228c8d465fa925d56b178b6c0865d4e9282b4e6ab29aa9655e9b7fc64a228478524d2ce94d3015758f
|
|
7
|
+
data.tar.gz: 207ff7188ff0bcb16bfd6b45893672085363683464889a4d3e3ba666d2989f2daa0260660da00f7e29ba0df360ca85a430536f38fae319a2e2106405e99e0af9
|
data/README.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
[](https://rubygems.org/gems/dspy)
|
|
4
4
|
[](https://rubygems.org/gems/dspy)
|
|
5
5
|
[](https://github.com/vicentereig/dspy.rb/actions/workflows/ruby.yml)
|
|
6
|
-
[](https://oss.vicente.services/dspy.rb/)
|
|
7
7
|
[](https://discord.gg/zWBhrMqn)
|
|
8
8
|
|
|
9
9
|
> [!NOTE]
|
|
@@ -248,13 +248,24 @@ DSPy.rb has gone from experimental to production-ready in three fast releases.
|
|
|
248
248
|
|
|
249
249
|
## Documentation
|
|
250
250
|
|
|
251
|
-
📖 **[Complete Documentation Website](https://
|
|
251
|
+
📖 **[Complete Documentation Website](https://oss.vicente.services/dspy.rb/)**
|
|
252
252
|
|
|
253
253
|
### LLM-Friendly Documentation
|
|
254
254
|
|
|
255
255
|
For LLMs and AI assistants working with DSPy.rb:
|
|
256
|
-
- **[llms.txt](https://
|
|
257
|
-
- **[llms-full.txt](https://
|
|
256
|
+
- **[llms.txt](https://oss.vicente.services/dspy.rb/llms.txt)** - Concise reference optimized for LLMs
|
|
257
|
+
- **[llms-full.txt](https://oss.vicente.services/dspy.rb/llms-full.txt)** - Comprehensive API documentation
|
|
258
|
+
|
|
259
|
+
### Claude Skill
|
|
260
|
+
|
|
261
|
+
A [Claude Skill](https://github.com/vicentereig/dspy-rb-skill) is available to help you build DSPy.rb applications with Claude Code or claude.ai.
|
|
262
|
+
|
|
263
|
+
**Claude Code:**
|
|
264
|
+
```bash
|
|
265
|
+
git clone https://github.com/vicentereig/dspy-rb-skill ~/.claude/skills/dspy-rb
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
**Claude.ai (Pro/Max):** Download the [skill as a ZIP](https://github.com/vicentereig/dspy-rb-skill/archive/refs/heads/main.zip) and upload via Settings > Skills.
|
|
258
269
|
|
|
259
270
|
### Getting Started
|
|
260
271
|
- **[Installation & Setup](docs/src/getting-started/installation.md)** - Detailed installation and configuration
|
data/lib/dspy/evals.rb
CHANGED
|
@@ -191,6 +191,12 @@ module DSPy
|
|
|
191
191
|
sig { returns(T.nilable(BatchEvaluationResult)) }
|
|
192
192
|
attr_reader :last_batch_result
|
|
193
193
|
|
|
194
|
+
sig { returns(T::Boolean) }
|
|
195
|
+
attr_reader :export_scores
|
|
196
|
+
|
|
197
|
+
sig { returns(String) }
|
|
198
|
+
attr_reader :score_name
|
|
199
|
+
|
|
194
200
|
include DSPy::Callbacks
|
|
195
201
|
|
|
196
202
|
create_before_callback :call, wrap: false
|
|
@@ -227,16 +233,20 @@ module DSPy
|
|
|
227
233
|
num_threads: T.nilable(Integer),
|
|
228
234
|
max_errors: T.nilable(Integer),
|
|
229
235
|
failure_score: T.nilable(Numeric),
|
|
230
|
-
provide_traceback: T::Boolean
|
|
236
|
+
provide_traceback: T::Boolean,
|
|
237
|
+
export_scores: T::Boolean,
|
|
238
|
+
score_name: String
|
|
231
239
|
).void
|
|
232
240
|
end
|
|
233
|
-
def initialize(program, metric: nil, num_threads: 1, max_errors: 5, failure_score: 0.0, provide_traceback: true)
|
|
241
|
+
def initialize(program, metric: nil, num_threads: 1, max_errors: 5, failure_score: 0.0, provide_traceback: true, export_scores: false, score_name: 'evaluation')
|
|
234
242
|
@program = program
|
|
235
243
|
@metric = metric
|
|
236
244
|
@num_threads = num_threads || 1
|
|
237
245
|
@max_errors = max_errors || 5
|
|
238
246
|
@provide_traceback = provide_traceback
|
|
239
247
|
@failure_score = failure_score ? failure_score.to_f : 0.0
|
|
248
|
+
@export_scores = export_scores
|
|
249
|
+
@score_name = score_name
|
|
240
250
|
@last_example_result = nil
|
|
241
251
|
@last_batch_result = nil
|
|
242
252
|
end
|
|
@@ -665,6 +675,11 @@ module DSPy
|
|
|
665
675
|
score: result.metrics[:score],
|
|
666
676
|
error: result.metrics[:error]
|
|
667
677
|
})
|
|
678
|
+
|
|
679
|
+
# Export score to Langfuse if enabled
|
|
680
|
+
if @export_scores
|
|
681
|
+
export_example_score(example, result)
|
|
682
|
+
end
|
|
668
683
|
rescue => e
|
|
669
684
|
DSPy.log('evals.example.observation_error', error: e.message)
|
|
670
685
|
end
|
|
@@ -678,10 +693,38 @@ module DSPy
|
|
|
678
693
|
pass_rate: batch_result.pass_rate,
|
|
679
694
|
score: batch_result.score
|
|
680
695
|
})
|
|
696
|
+
|
|
697
|
+
# Export batch score to Langfuse if enabled
|
|
698
|
+
if @export_scores
|
|
699
|
+
export_batch_score(batch_result)
|
|
700
|
+
end
|
|
681
701
|
rescue => e
|
|
682
702
|
DSPy.log('evals.batch.observation_error', error: e.message)
|
|
683
703
|
end
|
|
684
704
|
|
|
705
|
+
def export_example_score(example, result)
|
|
706
|
+
score_value = result.metrics[:score] || (result.passed ? 1.0 : 0.0)
|
|
707
|
+
example_id = extract_example_id(example)
|
|
708
|
+
|
|
709
|
+
DSPy.score(
|
|
710
|
+
@score_name,
|
|
711
|
+
score_value,
|
|
712
|
+
comment: "Example: #{example_id || 'unknown'}, passed: #{result.passed}"
|
|
713
|
+
)
|
|
714
|
+
rescue => e
|
|
715
|
+
DSPy.log('evals.score_export_error', error: e.message)
|
|
716
|
+
end
|
|
717
|
+
|
|
718
|
+
def export_batch_score(batch_result)
|
|
719
|
+
DSPy.score(
|
|
720
|
+
"#{@score_name}_batch",
|
|
721
|
+
batch_result.pass_rate,
|
|
722
|
+
comment: "Batch: #{batch_result.passed_examples}/#{batch_result.total_examples} passed"
|
|
723
|
+
)
|
|
724
|
+
rescue => e
|
|
725
|
+
DSPy.log('evals.batch_score_export_error', error: e.message)
|
|
726
|
+
end
|
|
727
|
+
|
|
685
728
|
def extract_example_id(example)
|
|
686
729
|
if example.respond_to?(:id)
|
|
687
730
|
example.id
|
|
data/lib/dspy/scores/data_type.rb
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'sorbet-runtime'
|
|
4
|
+
|
|
5
|
+
module DSPy
|
|
6
|
+
module Scores
|
|
7
|
+
# Langfuse score data types
|
|
8
|
+
# Maps to: NUMERIC, BOOLEAN, CATEGORICAL
|
|
9
|
+
class DataType < T::Enum
|
|
10
|
+
extend T::Sig
|
|
11
|
+
|
|
12
|
+
enums do
|
|
13
|
+
Numeric = new('NUMERIC')
|
|
14
|
+
Boolean = new('BOOLEAN')
|
|
15
|
+
Categorical = new('CATEGORICAL')
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
sig { params(value: String).returns(DataType) }
|
|
19
|
+
def self.deserialize(value)
|
|
20
|
+
case value
|
|
21
|
+
when 'NUMERIC' then Numeric
|
|
22
|
+
when 'BOOLEAN' then Boolean
|
|
23
|
+
when 'CATEGORICAL' then Categorical
|
|
24
|
+
else
|
|
25
|
+
raise ArgumentError, "Unknown DataType: #{value}"
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
data/lib/dspy/scores/evaluators.rb
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'sorbet-runtime'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module DSPy
|
|
7
|
+
module Scores
|
|
8
|
+
# Built-in evaluators for common evaluation patterns
|
|
9
|
+
# Each evaluator returns a ScoreEvent that can be exported to Langfuse
|
|
10
|
+
module Evaluators
|
|
11
|
+
extend T::Sig
|
|
12
|
+
|
|
13
|
+
# Exact string match evaluator
|
|
14
|
+
# Returns 1.0 if output exactly matches expected, 0.0 otherwise
|
|
15
|
+
sig do
|
|
16
|
+
params(
|
|
17
|
+
output: String,
|
|
18
|
+
expected: String,
|
|
19
|
+
name: String,
|
|
20
|
+
ignore_case: T::Boolean,
|
|
21
|
+
comment: T.nilable(String),
|
|
22
|
+
trace_id: T.nilable(String),
|
|
23
|
+
observation_id: T.nilable(String),
|
|
24
|
+
emit: T::Boolean
|
|
25
|
+
).returns(ScoreEvent)
|
|
26
|
+
end
|
|
27
|
+
def self.exact_match(
|
|
28
|
+
output:,
|
|
29
|
+
expected:,
|
|
30
|
+
name: 'exact_match',
|
|
31
|
+
ignore_case: false,
|
|
32
|
+
comment: nil,
|
|
33
|
+
trace_id: nil,
|
|
34
|
+
observation_id: nil,
|
|
35
|
+
emit: true
|
|
36
|
+
)
|
|
37
|
+
match = if ignore_case
|
|
38
|
+
output.downcase == expected.downcase
|
|
39
|
+
else
|
|
40
|
+
output == expected
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
DSPy::Scores.create(
|
|
44
|
+
name: name,
|
|
45
|
+
value: match ? 1.0 : 0.0,
|
|
46
|
+
data_type: DataType::Numeric,
|
|
47
|
+
comment: comment || (match ? 'Exact match' : 'No match'),
|
|
48
|
+
trace_id: trace_id,
|
|
49
|
+
observation_id: observation_id,
|
|
50
|
+
emit: emit
|
|
51
|
+
)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Substring containment evaluator
|
|
55
|
+
# Returns 1.0 if output contains expected, 0.0 otherwise
|
|
56
|
+
sig do
|
|
57
|
+
params(
|
|
58
|
+
output: String,
|
|
59
|
+
expected: String,
|
|
60
|
+
name: String,
|
|
61
|
+
ignore_case: T::Boolean,
|
|
62
|
+
comment: T.nilable(String),
|
|
63
|
+
trace_id: T.nilable(String),
|
|
64
|
+
observation_id: T.nilable(String),
|
|
65
|
+
emit: T::Boolean
|
|
66
|
+
).returns(ScoreEvent)
|
|
67
|
+
end
|
|
68
|
+
def self.contains(
|
|
69
|
+
output:,
|
|
70
|
+
expected:,
|
|
71
|
+
name: 'contains',
|
|
72
|
+
ignore_case: false,
|
|
73
|
+
comment: nil,
|
|
74
|
+
trace_id: nil,
|
|
75
|
+
observation_id: nil,
|
|
76
|
+
emit: true
|
|
77
|
+
)
|
|
78
|
+
match = if ignore_case
|
|
79
|
+
output.downcase.include?(expected.downcase)
|
|
80
|
+
else
|
|
81
|
+
output.include?(expected)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
DSPy::Scores.create(
|
|
85
|
+
name: name,
|
|
86
|
+
value: match ? 1.0 : 0.0,
|
|
87
|
+
data_type: DataType::Numeric,
|
|
88
|
+
comment: comment || (match ? 'Contains expected' : 'Does not contain expected'),
|
|
89
|
+
trace_id: trace_id,
|
|
90
|
+
observation_id: observation_id,
|
|
91
|
+
emit: emit
|
|
92
|
+
)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# Regular expression match evaluator
|
|
96
|
+
# Returns 1.0 if output matches pattern, 0.0 otherwise
|
|
97
|
+
sig do
|
|
98
|
+
params(
|
|
99
|
+
output: String,
|
|
100
|
+
pattern: T.any(Regexp, String),
|
|
101
|
+
name: String,
|
|
102
|
+
comment: T.nilable(String),
|
|
103
|
+
trace_id: T.nilable(String),
|
|
104
|
+
observation_id: T.nilable(String),
|
|
105
|
+
emit: T::Boolean
|
|
106
|
+
).returns(ScoreEvent)
|
|
107
|
+
end
|
|
108
|
+
def self.regex_match(
|
|
109
|
+
output:,
|
|
110
|
+
pattern:,
|
|
111
|
+
name: 'regex_match',
|
|
112
|
+
comment: nil,
|
|
113
|
+
trace_id: nil,
|
|
114
|
+
observation_id: nil,
|
|
115
|
+
emit: true
|
|
116
|
+
)
|
|
117
|
+
regex = pattern.is_a?(Regexp) ? pattern : Regexp.new(pattern)
|
|
118
|
+
match = regex.match?(output)
|
|
119
|
+
|
|
120
|
+
DSPy::Scores.create(
|
|
121
|
+
name: name,
|
|
122
|
+
value: match ? 1.0 : 0.0,
|
|
123
|
+
data_type: DataType::Numeric,
|
|
124
|
+
comment: comment || (match ? 'Regex matched' : 'Regex did not match'),
|
|
125
|
+
trace_id: trace_id,
|
|
126
|
+
observation_id: observation_id,
|
|
127
|
+
emit: emit
|
|
128
|
+
)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# Length check evaluator
|
|
132
|
+
# Returns 1.0 if output length is within range, 0.0 otherwise
|
|
133
|
+
sig do
|
|
134
|
+
params(
|
|
135
|
+
output: String,
|
|
136
|
+
min_length: T.nilable(Integer),
|
|
137
|
+
max_length: T.nilable(Integer),
|
|
138
|
+
name: String,
|
|
139
|
+
comment: T.nilable(String),
|
|
140
|
+
trace_id: T.nilable(String),
|
|
141
|
+
observation_id: T.nilable(String),
|
|
142
|
+
emit: T::Boolean
|
|
143
|
+
).returns(ScoreEvent)
|
|
144
|
+
end
|
|
145
|
+
def self.length_check(
|
|
146
|
+
output:,
|
|
147
|
+
min_length: nil,
|
|
148
|
+
max_length: nil,
|
|
149
|
+
name: 'length_check',
|
|
150
|
+
comment: nil,
|
|
151
|
+
trace_id: nil,
|
|
152
|
+
observation_id: nil,
|
|
153
|
+
emit: true
|
|
154
|
+
)
|
|
155
|
+
length = output.length
|
|
156
|
+
valid = true
|
|
157
|
+
valid = false if min_length && length < min_length
|
|
158
|
+
valid = false if max_length && length > max_length
|
|
159
|
+
|
|
160
|
+
DSPy::Scores.create(
|
|
161
|
+
name: name,
|
|
162
|
+
value: valid ? 1.0 : 0.0,
|
|
163
|
+
data_type: DataType::Numeric,
|
|
164
|
+
comment: comment || "Length: #{length} (min: #{min_length || 'none'}, max: #{max_length || 'none'})",
|
|
165
|
+
trace_id: trace_id,
|
|
166
|
+
observation_id: observation_id,
|
|
167
|
+
emit: emit
|
|
168
|
+
)
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# Levenshtein similarity evaluator
|
|
172
|
+
# Returns normalized similarity score between 0.0 and 1.0
|
|
173
|
+
sig do
|
|
174
|
+
params(
|
|
175
|
+
output: String,
|
|
176
|
+
expected: String,
|
|
177
|
+
name: String,
|
|
178
|
+
comment: T.nilable(String),
|
|
179
|
+
trace_id: T.nilable(String),
|
|
180
|
+
observation_id: T.nilable(String),
|
|
181
|
+
emit: T::Boolean
|
|
182
|
+
).returns(ScoreEvent)
|
|
183
|
+
end
|
|
184
|
+
def self.similarity(
|
|
185
|
+
output:,
|
|
186
|
+
expected:,
|
|
187
|
+
name: 'similarity',
|
|
188
|
+
comment: nil,
|
|
189
|
+
trace_id: nil,
|
|
190
|
+
observation_id: nil,
|
|
191
|
+
emit: true
|
|
192
|
+
)
|
|
193
|
+
distance = levenshtein_distance(output, expected)
|
|
194
|
+
max_length = [output.length, expected.length].max
|
|
195
|
+
score = max_length.zero? ? 1.0 : 1.0 - (distance.to_f / max_length)
|
|
196
|
+
|
|
197
|
+
DSPy::Scores.create(
|
|
198
|
+
name: name,
|
|
199
|
+
value: score.round(4),
|
|
200
|
+
data_type: DataType::Numeric,
|
|
201
|
+
comment: comment || "Levenshtein distance: #{distance}",
|
|
202
|
+
trace_id: trace_id,
|
|
203
|
+
observation_id: observation_id,
|
|
204
|
+
emit: emit
|
|
205
|
+
)
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
# JSON validity evaluator
|
|
209
|
+
# Returns 1.0 if output is valid JSON, 0.0 otherwise
|
|
210
|
+
sig do
|
|
211
|
+
params(
|
|
212
|
+
output: String,
|
|
213
|
+
name: String,
|
|
214
|
+
comment: T.nilable(String),
|
|
215
|
+
trace_id: T.nilable(String),
|
|
216
|
+
observation_id: T.nilable(String),
|
|
217
|
+
emit: T::Boolean
|
|
218
|
+
).returns(ScoreEvent)
|
|
219
|
+
end
|
|
220
|
+
def self.json_valid(
|
|
221
|
+
output:,
|
|
222
|
+
name: 'json_valid',
|
|
223
|
+
comment: nil,
|
|
224
|
+
trace_id: nil,
|
|
225
|
+
observation_id: nil,
|
|
226
|
+
emit: true
|
|
227
|
+
)
|
|
228
|
+
valid = begin
|
|
229
|
+
JSON.parse(output)
|
|
230
|
+
true
|
|
231
|
+
rescue JSON::ParserError
|
|
232
|
+
false
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
DSPy::Scores.create(
|
|
236
|
+
name: name,
|
|
237
|
+
value: valid ? 1.0 : 0.0,
|
|
238
|
+
data_type: DataType::Numeric,
|
|
239
|
+
comment: comment || (valid ? 'Valid JSON' : 'Invalid JSON'),
|
|
240
|
+
trace_id: trace_id,
|
|
241
|
+
observation_id: observation_id,
|
|
242
|
+
emit: emit
|
|
243
|
+
)
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
# Levenshtein distance implementation
|
|
247
|
+
sig { params(str1: String, str2: String).returns(Integer) }
|
|
248
|
+
def self.levenshtein_distance(str1, str2)
|
|
249
|
+
m = str1.length
|
|
250
|
+
n = str2.length
|
|
251
|
+
|
|
252
|
+
return n if m.zero?
|
|
253
|
+
return m if n.zero?
|
|
254
|
+
|
|
255
|
+
# Create distance matrix
|
|
256
|
+
d = Array.new(m + 1) { Array.new(n + 1, 0) }
|
|
257
|
+
|
|
258
|
+
# Initialize first column
|
|
259
|
+
(0..m).each { |i| d[i][0] = i }
|
|
260
|
+
# Initialize first row
|
|
261
|
+
(0..n).each { |j| d[0][j] = j }
|
|
262
|
+
|
|
263
|
+
# Fill in the rest of the matrix
|
|
264
|
+
(1..m).each do |i|
|
|
265
|
+
(1..n).each do |j|
|
|
266
|
+
cost = str1[i - 1] == str2[j - 1] ? 0 : 1
|
|
267
|
+
d[i][j] = [
|
|
268
|
+
d[i - 1][j] + 1, # deletion
|
|
269
|
+
d[i][j - 1] + 1, # insertion
|
|
270
|
+
d[i - 1][j - 1] + cost # substitution
|
|
271
|
+
].min
|
|
272
|
+
end
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
d[m][n]
|
|
276
|
+
end
|
|
277
|
+
end
|
|
278
|
+
end
|
|
279
|
+
end
|
|
data/lib/dspy/scores/score_event.rb
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'sorbet-runtime'
|
|
4
|
+
require 'securerandom'
|
|
5
|
+
require_relative 'data_type'
|
|
6
|
+
|
|
7
|
+
module DSPy
|
|
8
|
+
module Scores
|
|
9
|
+
# Represents a score to be sent to Langfuse
|
|
10
|
+
# Immutable struct with all score attributes
|
|
11
|
+
class ScoreEvent < T::Struct
|
|
12
|
+
extend T::Sig
|
|
13
|
+
|
|
14
|
+
# Unique identifier for the score (idempotency key)
|
|
15
|
+
prop :id, String, factory: -> { SecureRandom.uuid }
|
|
16
|
+
|
|
17
|
+
# Score name/identifier (required)
|
|
18
|
+
prop :name, String
|
|
19
|
+
|
|
20
|
+
# Score value - numeric, boolean (0/1), or categorical (string)
|
|
21
|
+
prop :value, T.any(Numeric, String)
|
|
22
|
+
|
|
23
|
+
# Data type for the score
|
|
24
|
+
prop :data_type, DataType, default: DataType::Numeric
|
|
25
|
+
|
|
26
|
+
# Optional human-readable comment
|
|
27
|
+
prop :comment, T.nilable(String), default: nil
|
|
28
|
+
|
|
29
|
+
# Trace ID to link the score to (required for Langfuse)
|
|
30
|
+
prop :trace_id, T.nilable(String), default: nil
|
|
31
|
+
|
|
32
|
+
# Observation/span ID to link the score to (optional)
|
|
33
|
+
prop :observation_id, T.nilable(String), default: nil
|
|
34
|
+
|
|
35
|
+
# Timestamp when the score was created
|
|
36
|
+
prop :timestamp, Time, factory: -> { Time.now }
|
|
37
|
+
|
|
38
|
+
# Serialize to Langfuse API payload format
|
|
39
|
+
sig { returns(T::Hash[Symbol, T.untyped]) }
|
|
40
|
+
def to_langfuse_payload
|
|
41
|
+
payload = {
|
|
42
|
+
id: id,
|
|
43
|
+
name: name,
|
|
44
|
+
value: value,
|
|
45
|
+
dataType: data_type.serialize
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
payload[:comment] = comment if comment
|
|
49
|
+
payload[:traceId] = trace_id if trace_id
|
|
50
|
+
payload[:observationId] = observation_id if observation_id
|
|
51
|
+
|
|
52
|
+
payload
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
data/lib/dspy/scores.rb
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'scores/data_type'
|
|
4
|
+
require_relative 'scores/score_event'
|
|
5
|
+
require_relative 'scores/evaluators'
|
|
6
|
+
|
|
7
|
+
module DSPy
|
|
8
|
+
# Score reporting for Langfuse integration
|
|
9
|
+
# Provides a simple API for creating and exporting evaluation scores
|
|
10
|
+
module Scores
|
|
11
|
+
extend T::Sig
|
|
12
|
+
|
|
13
|
+
class << self
|
|
14
|
+
extend T::Sig
|
|
15
|
+
|
|
16
|
+
# Create a score event from the current context
|
|
17
|
+
#
|
|
18
|
+
# @param name [String] Score identifier (e.g., "accuracy", "relevance")
|
|
19
|
+
# @param value [Numeric, String] Score value
|
|
20
|
+
# @param data_type [DataType] Type of score (default: Numeric)
|
|
21
|
+
# @param comment [String, nil] Optional human-readable comment
|
|
22
|
+
# @param span [Object, nil] Optional span to attach score to
|
|
23
|
+
# @param emit [Boolean] Whether to emit score.create event (default: true)
|
|
24
|
+
# @return [ScoreEvent] The created score event
|
|
25
|
+
sig do
|
|
26
|
+
params(
|
|
27
|
+
name: String,
|
|
28
|
+
value: T.any(Numeric, String),
|
|
29
|
+
data_type: DataType,
|
|
30
|
+
comment: T.nilable(String),
|
|
31
|
+
span: T.untyped,
|
|
32
|
+
trace_id: T.nilable(String),
|
|
33
|
+
observation_id: T.nilable(String),
|
|
34
|
+
emit: T::Boolean
|
|
35
|
+
).returns(ScoreEvent)
|
|
36
|
+
end
|
|
37
|
+
def create(
|
|
38
|
+
name:,
|
|
39
|
+
value:,
|
|
40
|
+
data_type: DataType::Numeric,
|
|
41
|
+
comment: nil,
|
|
42
|
+
span: nil,
|
|
43
|
+
trace_id: nil,
|
|
44
|
+
observation_id: nil,
|
|
45
|
+
emit: true
|
|
46
|
+
)
|
|
47
|
+
# Extract trace_id from context if not provided
|
|
48
|
+
resolved_trace_id = trace_id || extract_trace_id_from_context
|
|
49
|
+
resolved_observation_id = observation_id || extract_observation_id_from_span(span)
|
|
50
|
+
|
|
51
|
+
event = ScoreEvent.new(
|
|
52
|
+
name: name,
|
|
53
|
+
value: value,
|
|
54
|
+
data_type: data_type,
|
|
55
|
+
comment: comment,
|
|
56
|
+
trace_id: resolved_trace_id,
|
|
57
|
+
observation_id: resolved_observation_id
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# Emit score.create event for listeners and exporters
|
|
61
|
+
emit_score_event(event) if emit
|
|
62
|
+
|
|
63
|
+
event
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
sig { returns(T.nilable(String)) }
|
|
69
|
+
def extract_trace_id_from_context
|
|
70
|
+
return nil unless defined?(DSPy::Context)
|
|
71
|
+
|
|
72
|
+
DSPy::Context.current[:trace_id]
|
|
73
|
+
rescue StandardError
|
|
74
|
+
nil
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
sig { params(span: T.untyped).returns(T.nilable(String)) }
|
|
78
|
+
def extract_observation_id_from_span(span)
|
|
79
|
+
return nil unless span
|
|
80
|
+
|
|
81
|
+
if span.respond_to?(:context) && span.context.respond_to?(:span_id)
|
|
82
|
+
span.context.span_id
|
|
83
|
+
elsif span.respond_to?(:span_id)
|
|
84
|
+
span.span_id
|
|
85
|
+
end
|
|
86
|
+
rescue StandardError
|
|
87
|
+
nil
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
sig { params(event: ScoreEvent).void }
|
|
91
|
+
def emit_score_event(event)
|
|
92
|
+
return unless defined?(DSPy) && DSPy.respond_to?(:events)
|
|
93
|
+
|
|
94
|
+
DSPy.events.notify('score.create', {
|
|
95
|
+
score_id: event.id,
|
|
96
|
+
score_name: event.name,
|
|
97
|
+
score_value: event.value,
|
|
98
|
+
score_data_type: event.data_type.serialize,
|
|
99
|
+
score_comment: event.comment,
|
|
100
|
+
trace_id: event.trace_id,
|
|
101
|
+
observation_id: event.observation_id,
|
|
102
|
+
timestamp: event.timestamp.iso8601
|
|
103
|
+
})
|
|
104
|
+
rescue StandardError => e
|
|
105
|
+
DSPy.log('score.emit_error', error: e.message) if DSPy.respond_to?(:log)
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Top-level convenience method for creating scores
|
|
111
|
+
#
|
|
112
|
+
# @example Basic usage
|
|
113
|
+
# DSPy.score('accuracy', 0.95)
|
|
114
|
+
#
|
|
115
|
+
# @example With comment
|
|
116
|
+
# DSPy.score('accuracy', 0.95, comment: 'Exact match')
|
|
117
|
+
#
|
|
118
|
+
# @example Boolean score
|
|
119
|
+
# DSPy.score('is_valid', 1, data_type: DSPy::Scores::DataType::Boolean)
|
|
120
|
+
#
|
|
121
|
+
# @example Categorical score
|
|
122
|
+
# DSPy.score('sentiment', 'positive', data_type: DSPy::Scores::DataType::Categorical)
|
|
123
|
+
#
|
|
124
|
+
def self.score(name, value, data_type: Scores::DataType::Numeric, comment: nil, span: nil, trace_id: nil, observation_id: nil)
|
|
125
|
+
Scores.create(
|
|
126
|
+
name: name,
|
|
127
|
+
value: value,
|
|
128
|
+
data_type: data_type,
|
|
129
|
+
comment: comment,
|
|
130
|
+
span: span,
|
|
131
|
+
trace_id: trace_id,
|
|
132
|
+
observation_id: observation_id
|
|
133
|
+
)
|
|
134
|
+
end
|
|
135
|
+
end
|
data/lib/dspy/version.rb
CHANGED
data/lib/dspy.rb
CHANGED
|
@@ -223,6 +223,7 @@ require_relative 'dspy/events/subscriber_mixin'
|
|
|
223
223
|
require_relative 'dspy/chain_of_thought'
|
|
224
224
|
require_relative 'dspy/re_act'
|
|
225
225
|
require_relative 'dspy/evals'
|
|
226
|
+
require_relative 'dspy/scores'
|
|
226
227
|
require_relative 'dspy/teleprompt/teleprompter'
|
|
227
228
|
require_relative 'dspy/teleprompt/utils'
|
|
228
229
|
require_relative 'dspy/teleprompt/data_handler'
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: dspy
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.33.0
|
|
4
|
+
version: 0.34.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vicente Reig Rincón de Arellano
|
|
@@ -219,6 +219,10 @@ files:
|
|
|
219
219
|
- lib/dspy/schema/sorbet_toon_adapter.rb
|
|
220
220
|
- lib/dspy/schema/version.rb
|
|
221
221
|
- lib/dspy/schema_adapters.rb
|
|
222
|
+
- lib/dspy/scores.rb
|
|
223
|
+
- lib/dspy/scores/data_type.rb
|
|
224
|
+
- lib/dspy/scores/evaluators.rb
|
|
225
|
+
- lib/dspy/scores/score_event.rb
|
|
222
226
|
- lib/dspy/signature.rb
|
|
223
227
|
- lib/dspy/storage/program_storage.rb
|
|
224
228
|
- lib/dspy/storage/storage_manager.rb
|