langsmithrb_rails 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +3 -0
- data/.rspec_status +82 -0
- data/CHANGELOG.md +25 -0
- data/Gemfile +20 -0
- data/Gemfile.lock +321 -0
- data/LICENSE +21 -0
- data/README.md +268 -0
- data/Rakefile +10 -0
- data/langsmithrb_rails-0.1.0.gem +0 -0
- data/langsmithrb_rails.gemspec +45 -0
- data/lib/generators/langsmithrb_rails/buffer/buffer_generator.rb +94 -0
- data/lib/generators/langsmithrb_rails/buffer/templates/create_langsmith_run_buffers.rb +29 -0
- data/lib/generators/langsmithrb_rails/buffer/templates/flush_buffer_job.rb +40 -0
- data/lib/generators/langsmithrb_rails/buffer/templates/langsmith.rake +71 -0
- data/lib/generators/langsmithrb_rails/buffer/templates/langsmith_run_buffer.rb +70 -0
- data/lib/generators/langsmithrb_rails/buffer/templates/migration.rb +28 -0
- data/lib/generators/langsmithrb_rails/ci/ci_generator.rb +37 -0
- data/lib/generators/langsmithrb_rails/ci/templates/langsmith-evals.yml +85 -0
- data/lib/generators/langsmithrb_rails/ci/templates/langsmith_export_summary.rb +81 -0
- data/lib/generators/langsmithrb_rails/demo/demo_generator.rb +81 -0
- data/lib/generators/langsmithrb_rails/demo/templates/chat_controller.js +88 -0
- data/lib/generators/langsmithrb_rails/demo/templates/chat_controller.rb +58 -0
- data/lib/generators/langsmithrb_rails/demo/templates/chat_message.rb +24 -0
- data/lib/generators/langsmithrb_rails/demo/templates/create_chat_messages.rb +19 -0
- data/lib/generators/langsmithrb_rails/demo/templates/index.html.erb +180 -0
- data/lib/generators/langsmithrb_rails/demo/templates/llm_service.rb +165 -0
- data/lib/generators/langsmithrb_rails/evals/evals_generator.rb +52 -0
- data/lib/generators/langsmithrb_rails/evals/templates/checks/correctness.rb +71 -0
- data/lib/generators/langsmithrb_rails/evals/templates/checks/llm_graded.rb +137 -0
- data/lib/generators/langsmithrb_rails/evals/templates/datasets/sample.yml +60 -0
- data/lib/generators/langsmithrb_rails/evals/templates/langsmith_evals.rake +255 -0
- data/lib/generators/langsmithrb_rails/evals/templates/targets/http.rb +120 -0
- data/lib/generators/langsmithrb_rails/evals/templates/targets/ruby.rb +136 -0
- data/lib/generators/langsmithrb_rails/install/install_generator.rb +35 -0
- data/lib/generators/langsmithrb_rails/install/templates/config.yml +45 -0
- data/lib/generators/langsmithrb_rails/install/templates/initializer.rb +34 -0
- data/lib/generators/langsmithrb_rails/privacy/privacy_generator.rb +39 -0
- data/lib/generators/langsmithrb_rails/privacy/templates/custom_redactor.rb +132 -0
- data/lib/generators/langsmithrb_rails/privacy/templates/privacy.yml +88 -0
- data/lib/generators/langsmithrb_rails/privacy/templates/privacy_initializer.rb +41 -0
- data/lib/generators/langsmithrb_rails/tracing/templates/langsmith_traced.rb +146 -0
- data/lib/generators/langsmithrb_rails/tracing/templates/langsmith_traced_job.rb +151 -0
- data/lib/generators/langsmithrb_rails/tracing/templates/request_tracing.rb +117 -0
- data/lib/generators/langsmithrb_rails/tracing/tracing_generator.rb +78 -0
- data/lib/langsmithrb_rails/client.rb +77 -0
- data/lib/langsmithrb_rails/config.rb +72 -0
- data/lib/langsmithrb_rails/generators/langsmithrb_rails/langsmith_generator.rb +61 -0
- data/lib/langsmithrb_rails/generators/langsmithrb_rails/templates/langsmith_initializer.rb +22 -0
- data/lib/langsmithrb_rails/langsmith.rb +35 -0
- data/lib/langsmithrb_rails/railtie.rb +33 -0
- data/lib/langsmithrb_rails/redactor.rb +76 -0
- data/lib/langsmithrb_rails/version.rb +5 -0
- data/lib/langsmithrb_rails.rb +31 -0
- metadata +59 -6
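
For context, a hedged sketch of how these generated files would typically reach an application (generator names are inferred from the directory layout above and standard Rails generator conventions; exact invocations may differ):

  # Gemfile
  gem "langsmithrb_rails", "~> 0.1"

  # After bundling, the generators under lib/generators/langsmithrb_rails/ are
  # expected to be invoked roughly as:
  #   bin/rails generate langsmithrb_rails:install   # initializer.rb + config.yml
  #   bin/rails generate langsmithrb_rails:evals     # datasets, checks, targets, rake task
  #   bin/rails generate langsmithrb_rails:ci        # CI workflow for evaluations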
data/lib/generators/langsmithrb_rails/demo/templates/llm_service.rb
@@ -0,0 +1,165 @@
+# frozen_string_literal: true
+
+# Service for interacting with LLM providers
+class LlmService
+  include LangsmithrbRails::TracedService
+
+  # Initialize the service
+  # @param provider [String] LLM provider to use (openai, anthropic, or mock)
+  def initialize(provider = ENV.fetch("LLM_PROVIDER", "<%= options[:provider] %>"))
+    @provider = provider.to_s.downcase
+  end
+
+  # Generate a response to a prompt
+  # @param prompt [String] User prompt
+  # @param context [Array<Hash>] Previous messages for context
+  # @param options [Hash] Additional options
+  # @return [String] Generated response
+  def generate(prompt, context = [], options = {})
+    # Create trace with LangSmith
+    langsmith_trace("llm_generate", inputs: { prompt: prompt, context: context }) do |run|
+      response = case @provider
+                 when "openai"
+                   generate_with_openai(prompt, context, options)
+                 when "anthropic"
+                   generate_with_anthropic(prompt, context, options)
+                 else
+                   generate_with_mock(prompt, context, options)
+                 end
+
+      # Record the output
+      run.outputs = { response: response }
+
+      response
+    end
+  end
+
+  private
+
+  # Generate with OpenAI
+  # @param prompt [String] User prompt
+  # @param context [Array<Hash>] Previous messages for context
+  # @param options [Hash] Additional options
+  # @return [String] Generated response
+  def generate_with_openai(prompt, context = [], options = {})
+    require "openai"
+
+    client = OpenAI::Client.new(access_token: ENV.fetch("OPENAI_API_KEY"))
+
+    # Format messages for OpenAI
+    messages = format_openai_messages(prompt, context)
+
+    # Call OpenAI API with LangSmith tracing
+    langsmith_trace("openai_chat_completion",
+                    inputs: { messages: messages },
+                    run_type: "llm") do |run|
+      response = client.chat(
+        parameters: {
+          model: options[:model] || ENV.fetch("OPENAI_MODEL", "gpt-3.5-turbo"),
+          messages: messages,
+          temperature: options[:temperature] || 0.7,
+          max_tokens: options[:max_tokens] || 1000
+        }
+      )
+
+      # Record the API response
+      run.outputs = { response: response }
+
+      # Extract and return the text
+      response.dig("choices", 0, "message", "content")
+    end
+  rescue => e
+    Rails.logger.error("OpenAI API error: #{e.message}")
+    "Sorry, I encountered an error: #{e.message}"
+  end
+
+  # Generate with Anthropic
+  # @param prompt [String] User prompt
+  # @param context [Array<Hash>] Previous messages for context
+  # @param options [Hash] Additional options
+  # @return [String] Generated response
+  def generate_with_anthropic(prompt, context = [], options = {})
+    require "anthropic"
+
+    client = Anthropic::Client.new(api_key: ENV.fetch("ANTHROPIC_API_KEY"))
+
+    # Format messages for Anthropic
+    messages = format_anthropic_messages(prompt, context)
+
+    # Call Anthropic API with LangSmith tracing
+    langsmith_trace("anthropic_completion",
+                    inputs: { messages: messages },
+                    run_type: "llm") do |run|
+      response = client.messages.create(
+        model: options[:model] || ENV.fetch("ANTHROPIC_MODEL", "claude-2"),
+        max_tokens: options[:max_tokens] || 1000,
+        temperature: options[:temperature] || 0.7,
+        messages: messages
+      )
+
+      # Record the API response
+      run.outputs = { response: response }
+
+      # Extract and return the text
+      response.content.first.text
+    end
+  rescue => e
+    Rails.logger.error("Anthropic API error: #{e.message}")
+    "Sorry, I encountered an error: #{e.message}"
+  end
+
+  # Generate with mock (for testing)
+  # @param prompt [String] User prompt
+  # @param context [Array<Hash>] Previous messages for context
+  # @param options [Hash] Additional options
+  # @return [String] Generated response
+  def generate_with_mock(prompt, context = [], options = {})
+    # Simulate a delay
+    sleep(0.5)
+
+    # Generate a mock response
+    "This is a mock response to: \"#{prompt}\"\n\nIn a real application, this would be generated by an LLM API."
+  end
+
+  # Format messages for OpenAI
+  # @param prompt [String] User prompt
+  # @param context [Array<Hash>] Previous messages for context
+  # @return [Array<Hash>] Formatted messages
+  def format_openai_messages(prompt, context)
+    messages = []
+
+    # Add system message if provided
+    system_message = ENV["OPENAI_SYSTEM_MESSAGE"]
+    messages << { role: "system", content: system_message } if system_message.present?
+
+    # Add context messages
+    context.each do |message|
+      role = message[:role] || (message[:is_user] ? "user" : "assistant")
+      messages << { role: role, content: message[:content] }
+    end
+
+    # Add the current prompt
+    messages << { role: "user", content: prompt }
+
+    messages
+  end
+
+  # Format messages for Anthropic
+  # @param prompt [String] User prompt
+  # @param context [Array<Hash>] Previous messages for context
+  # @return [Array<Hash>] Formatted messages
+  def format_anthropic_messages(prompt, context)
+    messages = []
+
+    # Add context messages
+    context.each do |message|
+      role = message[:is_user] ? "user" : "assistant"
+      messages << { role: role, content: message[:content] }
+    end
+
+    # Add the current prompt
+    messages << { role: "user", content: prompt }
+
+    messages
+  end
+end
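
For orientation, a minimal, hedged usage sketch of the generated service (class name, argument shapes, and the :is_user / :content context keys are taken from the template above; assumes the LangSmith initializer from the install generator is in place, since generate wraps the call in langsmith_trace):

  # Hypothetical console session in an app that ran the demo generator
  service = LlmService.new("mock")   # the mock provider needs no API key
  context = [
    { is_user: true,  content: "Hi there" },
    { is_user: false, content: "Hello! How can I help?" }
  ]
  service.generate("What can you do?", context)
  # => "This is a mock response to: \"What can you do?\" ..."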
data/lib/generators/langsmithrb_rails/evals/evals_generator.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+module LangsmithrbRails
+  module Generators
+    # Generator for adding LangSmith evaluations to Rails applications
+    class EvalsGenerator < Rails::Generators::Base
+      source_root File.expand_path("templates", __dir__)
+
+      desc "Adds LangSmith evaluations to your Rails application"
+
+      def create_directories
+        empty_directory "config/langsmith/evals/datasets"
+        empty_directory "config/langsmith/evals/checks"
+        empty_directory "config/langsmith/evals/targets"
+      end
+
+      def create_sample_dataset
+        template "datasets/sample.yml", "config/langsmith/evals/datasets/sample.yml"
+      end
+
+      def create_sample_check
+        template "checks/correctness.rb", "config/langsmith/evals/checks/correctness.rb"
+        template "checks/llm_graded.rb", "config/langsmith/evals/checks/llm_graded.rb"
+      end
+
+      def create_sample_target
+        template "targets/http.rb", "config/langsmith/evals/targets/http.rb"
+        template "targets/ruby.rb", "config/langsmith/evals/targets/ruby.rb"
+      end
+
+      def create_rake_task
+        template "langsmith_evals.rake", "lib/tasks/langsmith_evals.rake"
+      end
+
+      def display_post_install_message
+        say "\n"
+        say "LangSmith evaluations have been added to your Rails application! 🎉", :green
+        say "\n"
+        say "Usage:", :yellow
+        say "  1. Customize the sample dataset in config/langsmith/evals/datasets/", :yellow
+        say "  2. Run an evaluation:", :yellow
+        say "     bin/rails langsmith:eval[sample,http,my_experiment]", :yellow
+        say "  3. Compare experiments:", :yellow
+        say "     bin/rails langsmith:compare[exp_a,exp_b]", :yellow
+        say "\n"
+        say "To add CI integration for evaluations, run:", :yellow
+        say "  bin/rails g langsmithrb_rails:ci", :yellow
+        say "\n"
+      end
+    end
+  end
+end
data/lib/generators/langsmithrb_rails/evals/templates/checks/correctness.rb
@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+
+module LangsmithrbRails
+  module Evals
+    module Checks
+      # Simple correctness check for evaluating LLM responses
+      class Correctness
+        # Check if the response is correct
+        # @param input [Hash] Input data
+        # @param response [Hash] Response data
+        # @param expected [Hash] Expected output data
+        # @return [Hash] Evaluation result
+        def self.evaluate(input, response, expected)
+          result = {
+            score: 0.0,
+            reasoning: "",
+            passed: false
+          }
+
+          # Extract the answer from the response
+          answer = extract_answer(response)
+
+          # Check for exact match
+          if expected["answer"] && answer == expected["answer"]
+            result[:score] = 1.0
+            result[:reasoning] = "Exact match with expected answer"
+            result[:passed] = true
+            return result
+          end
+
+          # Check for partial matches using contains
+          if expected["answer_contains"] && expected["answer_contains"].is_a?(Array)
+            matches = expected["answer_contains"].select { |phrase| answer.include?(phrase) }
+            match_ratio = matches.size.to_f / expected["answer_contains"].size
+
+            result[:score] = match_ratio
+            result[:reasoning] = "Matched #{matches.size}/#{expected["answer_contains"].size} expected phrases"
+            result[:passed] = match_ratio >= 0.5
+            return result
+          end
+
+          # Check for code snippets
+          if expected["code_contains"] && expected["code_contains"].is_a?(Array)
+            matches = expected["code_contains"].select { |phrase| answer.include?(phrase) }
+            match_ratio = matches.size.to_f / expected["code_contains"].size
+
+            result[:score] = match_ratio
+            result[:reasoning] = "Code snippet matched #{matches.size}/#{expected["code_contains"].size} expected elements"
+            result[:passed] = match_ratio >= 0.5
+            return result
+          end
+
+          # No match found
+          result[:reasoning] = "No matching criteria found"
+          result
+        end
+
+        # Extract the answer from the response
+        # @param response [Hash] Response data
+        # @return [String] Extracted answer
+        def self.extract_answer(response)
+          return response["answer"] if response["answer"]
+          return response["text"] if response["text"]
+          return response["content"] if response["content"]
+          return response["output"] if response["output"]
+          return response.to_s
+        end
+      end
+    end
+  end
+end
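
To make the check's contract concrete, a small hedged example of calling the class above directly (string keys mirror the sample dataset the evals generator installs; assumes the check file has been loaded):

  input    = { "question" => "What is the capital of France?" }
  response = { "answer" => "Paris" }   # hypothetical model output
  expected = { "answer" => "Paris" }
  LangsmithrbRails::Evals::Checks::Correctness.evaluate(input, response, expected)
  # => { score: 1.0, reasoning: "Exact match with expected answer", passed: true }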
data/lib/generators/langsmithrb_rails/evals/templates/checks/llm_graded.rb
@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+
+module LangsmithrbRails
+  module Evals
+    module Checks
+      # LLM-based grading for evaluating responses
+      class LlmGraded
+        # Check if the response is correct using an LLM
+        # @param input [Hash] Input data
+        # @param response [Hash] Response data
+        # @param expected [Hash] Expected output data
+        # @return [Hash] Evaluation result
+        def self.evaluate(input, response, expected)
+          result = {
+            score: 0.0,
+            reasoning: "",
+            passed: false
+          }
+
+          # Extract the answer from the response
+          answer = extract_answer(response)
+          expected_answer = extract_answer(expected)
+
+          # Create the prompt for the LLM
+          prompt = create_grading_prompt(input, answer, expected_answer)
+
+          # Call the LLM for grading
+          llm_response = call_llm(prompt)
+
+          # Parse the LLM response
+          parsed_result = parse_llm_response(llm_response)
+
+          # Update the result with the parsed data
+          result[:score] = parsed_result[:score]
+          result[:reasoning] = parsed_result[:reasoning]
+          result[:passed] = parsed_result[:score] >= 0.7
+
+          result
+        end
+
+        # Extract the answer from the response
+        # @param response [Hash] Response data
+        # @return [String] Extracted answer
+        def self.extract_answer(response)
+          return response["answer"] if response["answer"]
+          return response["text"] if response["text"]
+          return response["content"] if response["content"]
+          return response["output"] if response["output"]
+          return response.to_s
+        end
+
+        # Create a prompt for the LLM to grade the response
+        # @param input [Hash] Input data
+        # @param answer [String] Actual answer
+        # @param expected_answer [String] Expected answer
+        # @return [String] Prompt for the LLM
+        def self.create_grading_prompt(input, answer, expected_answer)
+          <<~PROMPT
+            You are an expert evaluator. Your task is to grade the quality and correctness of a response.
+
+            Question: #{input["question"]}
+
+            Expected Answer: #{expected_answer}
+
+            Actual Response: #{answer}
+
+            Please evaluate the response based on:
+            1. Correctness: Is the information accurate?
+            2. Completeness: Does it fully address the question?
+            3. Clarity: Is it well-explained and easy to understand?
+
+            Provide your evaluation in the following format:
+
+            Score: [a number between 0.0 and 1.0]
+            Reasoning: [your detailed explanation]
+          PROMPT
+        end
+
+        # Call the LLM for grading
+        # @param prompt [String] Prompt for the LLM
+        # @return [String] LLM response
+        def self.call_llm(prompt)
+          # Check if OpenAI is configured
+          if defined?(OpenAI) && ENV["OPENAI_API_KEY"].present?
+            client = OpenAI::Client.new(access_token: ENV["OPENAI_API_KEY"])
+            response = client.chat(
+              parameters: {
+                model: ENV.fetch("LANGSMITH_EVAL_MODEL", "gpt-3.5-turbo"),
+                messages: [{ role: "user", content: prompt }],
+                temperature: 0.0
+              }
+            )
+            return response.dig("choices", 0, "message", "content")
+          end
+
+          # Check if Anthropic is configured
+          if defined?(Anthropic) && ENV["ANTHROPIC_API_KEY"].present?
+            client = Anthropic::Client.new(api_key: ENV["ANTHROPIC_API_KEY"])
+            response = client.messages.create(
+              model: ENV.fetch("LANGSMITH_EVAL_MODEL", "claude-2"),
+              max_tokens: 1024,
+              messages: [{ role: "user", content: prompt }]
+            )
+            return response.content.first.text
+          end
+
+          # Fall back to a simple evaluation
+          "Score: 0.5\nReasoning: Unable to perform LLM-based evaluation. Please configure an LLM provider."
+        end
+
+        # Parse the LLM response
+        # @param response [String] LLM response
+        # @return [Hash] Parsed result
+        def self.parse_llm_response(response)
+          result = {
+            score: 0.5,
+            reasoning: "Unable to parse LLM response"
+          }
+
+          # Extract score
+          if response =~ /Score:\s*([\d\.]+)/i
+            result[:score] = $1.to_f
+            # Ensure score is between 0 and 1
+            result[:score] = [0.0, [1.0, result[:score]].min].max
+          end
+
+          # Extract reasoning
+          if response =~ /Reasoning:\s*(.+)/im
+            result[:reasoning] = $1.strip
+          end
+
+          result
+        end
+      end
+    end
+  end
+end
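
As a quick illustration of the parsing step above, a hedged example of feeding parse_llm_response a grader reply in the requested "Score / Reasoning" format (the reply text is invented):

  reply = "Score: 0.9\nReasoning: The response is accurate and complete."
  LangsmithrbRails::Evals::Checks::LlmGraded.parse_llm_response(reply)
  # => { score: 0.9, reasoning: "The response is accurate and complete." }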
data/lib/generators/langsmithrb_rails/evals/templates/datasets/sample.yml
@@ -0,0 +1,60 @@
+# Sample evaluation dataset
+# This file defines a set of inputs and expected outputs for LLM evaluation
+
+name: sample_dataset
+description: A sample dataset for evaluating LLM responses
+version: 1.0.0
+
+# Each item in the dataset represents a test case
+items:
+  - id: question_1
+    input:
+      question: "What is the capital of France?"
+    expected_output:
+      answer: "Paris"
+    metadata:
+      category: geography
+      difficulty: easy
+
+  - id: question_2
+    input:
+      question: "Who wrote 'Pride and Prejudice'?"
+    expected_output:
+      answer: "Jane Austen"
+    metadata:
+      category: literature
+      difficulty: easy
+
+  - id: question_3
+    input:
+      question: "What is the formula for calculating the area of a circle?"
+    expected_output:
+      answer: "A = πr²"
+    metadata:
+      category: mathematics
+      difficulty: medium
+
+  - id: question_4
+    input:
+      question: "Explain how a transformer neural network works."
+    expected_output:
+      answer_contains:
+        - "attention mechanism"
+        - "self-attention"
+        - "encoder"
+        - "decoder"
+    metadata:
+      category: machine_learning
+      difficulty: hard
+
+  - id: question_5
+    input:
+      question: "Write a function to check if a string is a palindrome."
+    expected_output:
+      code_contains:
+        - "function"
+        - "return"
+        - "reverse"
+    metadata:
+      category: programming
+      difficulty: medium
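
Tying the dataset to the checks above, a hedged sketch of running one item through the Correctness check by hand (the bundled langsmith_evals.rake task does this wiring for real; the response string is invented):

  require "yaml"

  dataset = YAML.load_file("config/langsmith/evals/datasets/sample.yml")
  item = dataset["items"][3]   # question_4, the answer_contains example
  response = { "answer" => "Transformers rely on self-attention in an encoder-decoder architecture." }
  LangsmithrbRails::Evals::Checks::Correctness.evaluate(item["input"], response, item["expected_output"])
  # => { score: 0.75, reasoning: "Matched 3/4 expected phrases", passed: true }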