langfuse-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +47 -0
- data/.github/workflows/release.yml +51 -0
- data/.gitignore +81 -0
- data/CHANGELOG.md +49 -0
- data/FINAL_SUMMARY.md +191 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +102 -0
- data/LICENSE +21 -0
- data/PROJECT_SUMMARY.md +263 -0
- data/PUBLISH_GUIDE.md +300 -0
- data/README.md +436 -0
- data/RELEASE_CHECKLIST.md +130 -0
- data/Rakefile +23 -0
- data/examples/basic_tracing.rb +196 -0
- data/examples/prompt_management.rb +283 -0
- data/langfuse-ruby.gemspec +51 -0
- data/lib/langfuse/client.rb +278 -0
- data/lib/langfuse/errors.rb +9 -0
- data/lib/langfuse/evaluation.rb +267 -0
- data/lib/langfuse/generation.rb +190 -0
- data/lib/langfuse/prompt.rb +209 -0
- data/lib/langfuse/span.rb +166 -0
- data/lib/langfuse/trace.rb +155 -0
- data/lib/langfuse/utils.rb +41 -0
- data/lib/langfuse/version.rb +3 -0
- data/lib/langfuse.rb +40 -0
- data/scripts/release.sh +120 -0
- data/scripts/verify_release.rb +139 -0
- data/test_basic.rb +183 -0
- data/test_offline.rb +329 -0
- metadata +232 -0
@@ -0,0 +1,278 @@
|
|
1
|
+
require 'base64'
require 'erb'
require 'json'

require 'concurrent'
require 'faraday'
require 'faraday/net_http'
|
6
|
+
|
7
|
+
module Langfuse
|
8
|
+
# HTTP client for the Langfuse public API.
#
# Besides thin wrappers around the HTTP verbs, the client keeps an in-memory
# queue of ingestion events. The queue is sent to `/api/public/ingestion`
# either by the background thread started in #initialize or by an explicit
# call to #flush / #shutdown.
class Client
  # Seconds the background thread sleeps between automatic flushes.
  FLUSH_INTERVAL_SECONDS = 5

  attr_reader :public_key, :secret_key, :host, :debug, :timeout, :retries

  # Each credential/host setting is resolved in this order: explicit argument,
  # environment variable, then `Langfuse.configuration`.
  #
  # @param public_key [String, nil] falls back to ENV['LANGFUSE_PUBLIC_KEY']
  # @param secret_key [String, nil] falls back to ENV['LANGFUSE_SECRET_KEY']
  # @param host [String, nil] falls back to ENV['LANGFUSE_HOST']
  # @param debug [Boolean] print progress messages to stdout
  # @param timeout [Numeric] request timeout handed to Faraday
  # @param retries [Integer] how many times a failed connection is retried
  # @raise [AuthenticationError] when no public or secret key can be resolved
  def initialize(public_key: nil, secret_key: nil, host: nil, debug: false, timeout: 30, retries: 3)
    @public_key = public_key || ENV['LANGFUSE_PUBLIC_KEY'] || Langfuse.configuration.public_key
    @secret_key = secret_key || ENV['LANGFUSE_SECRET_KEY'] || Langfuse.configuration.secret_key
    @host = host || ENV['LANGFUSE_HOST'] || Langfuse.configuration.host
    @debug = debug || Langfuse.configuration.debug
    @timeout = timeout || Langfuse.configuration.timeout
    @retries = retries || Langfuse.configuration.retries

    raise AuthenticationError, 'Public key is required' unless @public_key
    raise AuthenticationError, 'Secret key is required' unless @secret_key

    @connection = build_connection
    @event_queue = Concurrent::Array.new
    @flush_thread = start_flush_thread
  end

  # Builds a Trace bound to this client.
  #
  # A missing `id` is replaced by `Utils.generate_id` and a missing
  # `timestamp` by `Utils.current_timestamp`; everything else (including
  # extra keyword arguments) is passed to `Trace.new` untouched.
  #
  # @return [Trace]
  def trace(id: nil, name: nil, user_id: nil, session_id: nil, version: nil, release: nil,
            input: nil, output: nil, metadata: nil, tags: nil, timestamp: nil, **kwargs)
    Trace.new(
      client: self,
      id: id || Utils.generate_id,
      name: name,
      user_id: user_id,
      session_id: session_id,
      version: version,
      release: release,
      input: input,
      output: output,
      metadata: metadata,
      tags: tags,
      timestamp: timestamp || Utils.current_timestamp,
      **kwargs
    )
  end

  # Builds a Span bound to this client and to the given trace.
  #
  # `start_time` defaults to `Utils.current_timestamp`; all other arguments
  # are forwarded to `Span.new` as given.
  #
  # @param trace_id [String] id of the trace the span belongs to
  # @return [Span]
  def span(trace_id:, name: nil, start_time: nil, end_time: nil, input: nil, output: nil,
           metadata: nil, level: nil, status_message: nil, parent_observation_id: nil,
           version: nil, **kwargs)
    Span.new(
      client: self,
      trace_id: trace_id,
      name: name,
      start_time: start_time || Utils.current_timestamp,
      end_time: end_time,
      input: input,
      output: output,
      metadata: metadata,
      level: level,
      status_message: status_message,
      parent_observation_id: parent_observation_id,
      version: version,
      **kwargs
    )
  end

  # Builds a Generation bound to this client and to the given trace.
  #
  # `start_time` defaults to `Utils.current_timestamp`; all other arguments
  # are forwarded to `Generation.new` as given.
  #
  # @param trace_id [String] id of the trace the generation belongs to
  # @return [Generation]
  def generation(trace_id:, name: nil, start_time: nil, end_time: nil, completion_start_time: nil,
                 model: nil, model_parameters: nil, input: nil, output: nil, usage: nil,
                 metadata: nil, level: nil, status_message: nil, parent_observation_id: nil,
                 version: nil, **kwargs)
    Generation.new(
      client: self,
      trace_id: trace_id,
      name: name,
      start_time: start_time || Utils.current_timestamp,
      end_time: end_time,
      completion_start_time: completion_start_time,
      model: model,
      model_parameters: model_parameters,
      input: input,
      output: output,
      usage: usage,
      metadata: metadata,
      level: level,
      status_message: status_message,
      parent_observation_id: parent_observation_id,
      version: version,
      **kwargs
    )
  end

  # Fetches a prompt by name, serving it from an in-memory cache when a copy
  # younger than `cache_ttl_seconds` exists for the same name/version/label.
  #
  # The name is percent-encoded before being placed in the URL path so that
  # names containing `/`, spaces, `?` or `#` address the intended prompt.
  #
  # @param name [String] prompt name
  # @param version [Integer, nil] sent as the `version` query parameter when given
  # @param label [String, nil] sent as the `label` query parameter when given
  # @param cache_ttl_seconds [Numeric] maximum age of a cached entry
  # @return [Prompt]
  def get_prompt(name, version: nil, label: nil, cache_ttl_seconds: 60)
    @prompt_cache ||= {}
    cache_key = "prompt:#{name}:#{version}:#{label}"

    cached = @prompt_cache[cache_key]
    return cached[:prompt] if cached && Time.now - cached[:cached_at] < cache_ttl_seconds

    params = {}
    params[:version] = version if version
    params[:label] = label if label

    response = get("/api/public/prompts/#{ERB::Util.url_encode(name.to_s)}", params)
    prompt = Prompt.new(response.body)

    @prompt_cache[cache_key] = { prompt: prompt, cached_at: Time.now }
    prompt
  end

  # Creates a prompt by POSTing to `/api/public/prompts`.
  #
  # Extra keyword arguments are merged into the request body.
  #
  # @return [Prompt] built from the response body
  def create_prompt(name:, prompt:, labels: [], config: {}, **kwargs)
    data = {
      name: name,
      prompt: prompt,
      labels: labels,
      config: config,
      **kwargs
    }

    response = post('/api/public/prompts', data)
    Prompt.new(response.body)
  end

  # Queues a `score-create` event; nothing is sent until the next flush.
  #
  # `trace_id` and `observation_id` are only added to the event body when
  # they are given.
  #
  # @return [nil]
  def score(trace_id: nil, observation_id: nil, name:, value:, data_type: nil, comment: nil, **kwargs)
    data = {
      name: name,
      value: value,
      data_type: data_type,
      comment: comment,
      **kwargs
    }

    data[:trace_id] = trace_id if trace_id
    data[:observation_id] = observation_id if observation_id

    enqueue_event('score-create', data)
  end

  # HTTP verb helpers. GET and DELETE send `params` as the query string;
  # POST, PUT and PATCH send `data` as the request body. All of them return
  # the response on a 2xx status and raise otherwise (see #handle_response).

  def get(path, params = {})
    request(:get, path, params: params)
  end

  def post(path, data = {})
    request(:post, path, json: data)
  end

  def put(path, data = {})
    request(:put, path, json: data)
  end

  def delete(path, params = {})
    request(:delete, path, params: params)
  end

  def patch(path, data = {})
    request(:patch, path, json: data)
  end

  # Wraps `body` in an event envelope (generated id, type, current timestamp,
  # string-keyed body) and appends it to the pending queue.
  #
  # @param type [String] event type, e.g. 'score-create'
  # @param body [Hash] event payload
  # @return [nil]
  def enqueue_event(type, body)
    event = {
      id: Utils.generate_id,
      type: type,
      timestamp: Utils.current_timestamp,
      body: Utils.deep_stringify_keys(body)
    }

    @event_queue << event
    puts "Enqueued event: #{type}" if @debug
  end

  # Sends every queued event in a single batch to `/api/public/ingestion`.
  #
  # If the request fails, the events taken from the queue are appended back
  # to it and the error is re-raised, so a later flush can retry them.
  #
  # @return [nil]
  # @raise [Langfuse::Error] whatever #post raised
  def flush
    return if @event_queue.empty?

    events = @event_queue.shift(@event_queue.length)
    return if events.empty?

    batch_data = {
      batch: events,
      metadata: {
        batch_size: events.length,
        sdk_name: 'langfuse-ruby',
        sdk_version: Langfuse::VERSION
      }
    }

    begin
      post('/api/public/ingestion', batch_data)
      puts "Flushed #{events.length} events" if @debug
    rescue StandardError => e
      puts "Failed to flush events: #{e.message}" if @debug
      # Put the batch back so the events are not lost.
      @event_queue.concat(events)
      raise
    end

    nil
  end

  # Stops the background flush thread, then flushes whatever is still queued.
  def shutdown
    @flush_thread&.kill
    flush unless @event_queue.empty?
  end

  private

  # Faraday connection using HTTP basic auth (public key / secret key),
  # JSON request encoding and JSON response parsing.
  def build_connection
    Faraday.new(url: @host) do |conn|
      conn.request :authorization, :basic, @public_key, @secret_key
      conn.request :json
      conn.response :json, content_type: /\bjson$/
      conn.adapter Faraday.default_adapter
      conn.options.timeout = @timeout
    end
  end

  # Performs one HTTP request and classifies the outcome.
  #
  # Connection failures are retried up to `@retries` times, sleeping
  # 2, 4, 8, ... seconds between attempts. Timeouts are not retried.
  #
  # The response status is checked after the transport-level rescue clauses
  # on purpose: checking it inside the `begin` would let the generic rescue
  # re-wrap AuthenticationError / RateLimitError / ValidationError as APIError.
  #
  # @raise [TimeoutError] the request timed out
  # @raise [NetworkError] the connection failed and retries are exhausted
  # @raise [APIError] any other failure while performing the request
  # @raise [Langfuse::Error] status-specific errors from #handle_response
  def request(method, path, params: {}, json: nil)
    attempts = 0

    begin
      response = @connection.public_send(method) do |req|
        req.url path
        req.params = params if params.any?
        req.body = json if json
      end
    rescue Faraday::TimeoutError => e
      raise TimeoutError, "Request timed out: #{e.message}"
    rescue Faraday::ConnectionFailed => e
      if attempts < @retries
        attempts += 1
        sleep(2**attempts)
        retry
      end
      raise NetworkError, "Connection failed: #{e.message}"
    rescue StandardError => e
      raise APIError, "Request failed: #{e.message}"
    end

    handle_response(response)
  end

  # Returns the response for 2xx statuses and raises a specific error for
  # everything else. The 401 and 429 branches must stay ahead of the generic
  # 4xx branch because `case` takes the first match.
  def handle_response(response)
    case response.status
    when 200..299
      response
    when 401
      raise AuthenticationError, "Authentication failed: #{response.body}"
    when 429
      raise RateLimitError, "Rate limit exceeded: #{response.body}"
    when 400..499
      raise ValidationError, "Client error: #{response.body}"
    when 500..599
      raise APIError, "Server error: #{response.body}"
    else
      raise APIError, "Unexpected response: #{response.status} #{response.body}"
    end
  end

  # Starts the thread that flushes the queue every FLUSH_INTERVAL_SECONDS.
  # Flush errors are swallowed (and printed in debug mode) so the loop keeps
  # running; #flush has already re-queued the failed events by then.
  def start_flush_thread
    Thread.new do
      loop do
        sleep(FLUSH_INTERVAL_SECONDS)
        begin
          flush unless @event_queue.empty?
        rescue StandardError => e
          puts "Error in flush thread: #{e.message}" if @debug
        end
      end
    end
  end
end
|
278
|
+
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module Langfuse
  # Root of the gem's exception hierarchy; rescue this to catch any of the
  # errors defined below.
  class Error < StandardError
  end

  # Credentials are missing (Client#initialize) or were rejected (HTTP 401).
  class AuthenticationError < Error
  end

  # The API answered with a server error or an unexpected status, or the
  # request failed for a reason not covered by a more specific class.
  class APIError < Error
  end

  # The connection to the API could not be established.
  class NetworkError < Error
  end

  # The API rejected the request with a 4xx status other than 401 and 429.
  class ValidationError < Error
  end

  # The API answered with HTTP 429.
  class RateLimitError < Error
  end

  # The HTTP request timed out. This is Langfuse::TimeoutError, a subclass of
  # Langfuse::Error, not Ruby's Timeout::Error.
  class TimeoutError < Error
  end
end
|
@@ -0,0 +1,267 @@
|
|
1
|
+
module Langfuse
|
2
|
+
# Read-only view of an evaluation record.
#
# Hash input is normalised with `Utils.deep_symbolize_keys`; any other input
# is used as-is and must answer `[]` with symbol keys.
class Evaluation
  # Attributes copied from the input, in the order #to_dict emits them.
  FIELDS = %i[id name value data_type comment trace_id observation_id created_at].freeze
  private_constant :FIELDS

  attr_reader(*FIELDS)

  # @param data [Hash, #[]] record attributes
  def initialize(data)
    @data = data.is_a?(Hash) ? Utils.deep_symbolize_keys(data) : data

    FIELDS.each { |field| instance_variable_set(:"@#{field}", @data[field]) }
  end

  # @return [Hash{Symbol => Object}] the attributes that are not nil
  def to_dict
    FIELDS.each_with_object({}) do |field, dict|
      value = instance_variable_get(:"@#{field}")
      dict[field] = value unless value.nil?
    end
  end
end
|
31
|
+
|
32
|
+
# Read-only view of a score record.
#
# Hash input is normalised with `Utils.deep_symbolize_keys`; any other input
# is used as-is and must answer `[]` with symbol keys.
class Score
  # Attributes copied from the input, in the order #to_dict emits them.
  FIELDS = %i[id name value data_type comment trace_id observation_id created_at].freeze
  private_constant :FIELDS

  attr_reader(*FIELDS)

  # @param data [Hash, #[]] record attributes
  def initialize(data)
    @data = data.is_a?(Hash) ? Utils.deep_symbolize_keys(data) : data

    FIELDS.each { |field| instance_variable_set(:"@#{field}", @data[field]) }
  end

  # @return [Hash{Symbol => Object}] the attributes that are not nil
  def to_dict
    FIELDS.each_with_object({}) do |field, dict|
      value = instance_variable_get(:"@#{field}")
      dict[field] = value unless value.nil?
    end
  end
end
|
61
|
+
|
62
|
+
# Evaluators that turn an (input, output, expected, context) tuple into a
# score hash of the form `{ name:, value:, data_type:, comment: }`.
module Evaluators
  # Common base: stores the evaluator name/description and provides the
  # score-hash builder used by every subclass.
  class BaseEvaluator
    # @param name [String] name written into every score this evaluator emits
    # @param description [String, nil] free-form description
    def initialize(name:, description: nil)
      @name = name
      @description = description
    end

    # @param input [Object] what was given to the system under evaluation
    # @param output [Object] what the system produced
    # @param expected [Object, nil] reference value, when the evaluator needs one
    # @param context [Object, nil] additional context
    # @return [Hash] score hash
    # @raise [NotImplementedError] always, in the base class
    def evaluate(input, output, expected: nil, context: nil)
      raise NotImplementedError, 'Subclasses must implement evaluate method'
    end

    protected

    # Builds the score hash for this evaluator.
    def create_score(value:, data_type: 'NUMERIC', comment: nil)
      {
        name: @name,
        value: value,
        data_type: data_type,
        comment: comment
      }
    end

    # Score returned by evaluators that cannot work without `expected`.
    def missing_expected_score
      create_score(value: 0, comment: 'No expected value provided')
    end
  end

  # Scores 1 when output and expected are equal after `to_s.strip`, else 0.
  class ExactMatchEvaluator < BaseEvaluator
    def initialize(name: 'exact_match', description: 'Exact match evaluator')
      super(name: name, description: description)
    end

    def evaluate(input, output, expected: nil, context: nil)
      # Only nil means "not provided"; `false` is a legitimate expected value.
      return missing_expected_score if expected.nil?

      matched = output.to_s.strip == expected.to_s.strip
      create_score(
        value: matched ? 1 : 0,
        comment: matched ? 'Exact match' : 'No match'
      )
    end
  end

  # Scores 1 when the expected text occurs inside the output, else 0.
  # The comparison ignores case unless `case_sensitive: true` is given.
  class ContainsEvaluator < BaseEvaluator
    def initialize(name: 'contains', description: 'Contains evaluator', case_sensitive: false)
      super(name: name, description: description)
      @case_sensitive = case_sensitive
    end

    def evaluate(input, output, expected: nil, context: nil)
      # Only nil means "not provided"; `false` is a legitimate expected value.
      return missing_expected_score if expected.nil?

      haystack, needle = [output, expected].map do |value|
        @case_sensitive ? value.to_s : value.to_s.downcase
      end

      found = haystack.include?(needle)
      create_score(
        value: found ? 1 : 0,
        comment: found ? 'Contains expected text' : 'Does not contain expected text'
      )
    end
  end

  # Scores the length of `output.to_s`.
  #
  # With a minimum and/or maximum configured the score is 1 (inside the
  # bounds, inclusive) or 0; with neither, the score is the length itself.
  class LengthEvaluator < BaseEvaluator
    def initialize(name: 'length', description: 'Length evaluator', min_length: nil, max_length: nil)
      super(name: name, description: description)
      @min_length = min_length
      @max_length = max_length
    end

    def evaluate(input, output, expected: nil, context: nil)
      length = output.to_s.length

      if @min_length && @max_length
        score = length.between?(@min_length, @max_length) ? 1 : 0
        comment = if score == 1
                    "Length #{length} within range"
                  else
                    "Length #{length} outside range #{@min_length}-#{@max_length}"
                  end
      elsif @min_length
        score = length >= @min_length ? 1 : 0
        comment = score == 1 ? "Length #{length} meets minimum" : "Length #{length} below minimum #{@min_length}"
      elsif @max_length
        score = length <= @max_length ? 1 : 0
        comment = score == 1 ? "Length #{length} within maximum" : "Length #{length} exceeds maximum #{@max_length}"
      else
        score = length
        comment = "Length: #{length}"
      end

      create_score(value: score, data_type: 'NUMERIC', comment: comment)
    end
  end

  # Scores 1 when the pattern matches anywhere in `output.to_s`, else 0.
  class RegexEvaluator < BaseEvaluator
    # @param pattern [Regexp, String] a String is compiled with `Regexp.new`
    def initialize(name: 'regex', description: 'Regex evaluator', pattern:)
      super(name: name, description: description)
      @pattern = pattern.is_a?(Regexp) ? pattern : Regexp.new(pattern)
    end

    def evaluate(input, output, expected: nil, context: nil)
      matched = @pattern.match?(output.to_s)

      create_score(
        value: matched ? 1 : 0,
        comment: matched ? 'Regex pattern matched' : 'Regex pattern not matched'
      )
    end
  end

  # Scores the character-level similarity of output and expected as
  # `1 - levenshtein_distance / longer_length`, a Float between 0.0 and 1.0.
  class SimilarityEvaluator < BaseEvaluator
    def initialize(name: 'similarity', description: 'Similarity evaluator')
      super(name: name, description: description)
    end

    def evaluate(input, output, expected: nil, context: nil)
      # Only nil means "not provided"; `false` is a legitimate expected value.
      return missing_expected_score if expected.nil?

      similarity = calculate_similarity(output.to_s, expected.to_s)

      create_score(
        value: similarity,
        data_type: 'NUMERIC',
        comment: "Similarity: #{(similarity * 100).round(2)}%"
      )
    end

    private

    def calculate_similarity(str1, str2)
      return 1.0 if str1 == str2
      return 0.0 if str1.empty? || str2.empty?

      longest = [str1.length, str2.length].max
      1.0 - (levenshtein_distance(str1, str2).to_f / longest)
    end

    # Levenshtein edit distance, keeping only the previous and the current
    # row of the dynamic-programming table instead of the whole matrix.
    def levenshtein_distance(str1, str2)
      source = str1.chars
      target = str2.chars
      previous_row = (0..target.length).to_a

      source.each_with_index do |source_char, i|
        current_row = [i + 1]
        target.each_with_index do |target_char, j|
          cost = source_char == target_char ? 0 : 1
          current_row << [
            previous_row[j + 1] + 1, # deletion
            current_row[j] + 1,      # insertion
            previous_row[j] + cost   # substitution
          ].min
        end
        previous_row = current_row
      end

      previous_row.last
    end
  end

  # Skeleton for an LLM-backed evaluator.
  #
  # NOTE: this is a placeholder. #evaluate renders the prompt but does not
  # call any model; the score it returns is a random number.
  class LLMEvaluator < BaseEvaluator
    # Placeholders recognised in a prompt template.
    PLACEHOLDER_PATTERN = /\{(input|output|expected|context)\}/.freeze
    private_constant :PLACEHOLDER_PATTERN

    # @param client [Object] stored for a future LLM call; unused for now
    # @param model [String] stored for a future LLM call; unused for now
    # @param prompt_template [String, nil] template containing `{input}`,
    #   `{output}`, `{expected}` and `{context}`; a default is used when nil
    def initialize(name: 'llm_evaluator', description: 'LLM-based evaluator', client:, model: 'gpt-3.5-turbo',
                   prompt_template: nil)
      super(name: name, description: description)
      @client = client
      @model = model
      @prompt_template = prompt_template || default_prompt_template
    end

    def evaluate(input, output, expected: nil, context: nil)
      # TODO: send the rendered prompt to @client / @model and parse the reply.
      _prompt = build_prompt(input, output, expected, context)

      # Simulated LLM response.
      score = rand(0.0..1.0).round(2)

      create_score(
        value: score,
        data_type: 'NUMERIC',
        comment: "LLM evaluation score: #{score}"
      )
    end

    private

    # Fills the template in a single pass. The block form of gsub inserts
    # values verbatim: backslash sequences in user text are not treated as
    # back-references, and inserted text is never re-scanned for placeholders.
    def build_prompt(input, output, expected, context)
      values = {
        'input' => input.to_s,
        'output' => output.to_s,
        'expected' => expected.to_s,
        'context' => context.to_s
      }

      @prompt_template.gsub(PLACEHOLDER_PATTERN) { values[Regexp.last_match(1)] }
    end

    def default_prompt_template
      <<~PROMPT
        Please evaluate the following response:

        Input: {input}
        Output: {output}
        Expected: {expected}
        Context: {context}

        Rate the quality of the output on a scale from 0 to 1, where:
        - 0 = Poor quality, incorrect or irrelevant
        - 1 = Excellent quality, accurate and relevant

        Provide only the numeric score.
      PROMPT
    end
  end
end
|
267
|
+
end
|