langsmithrb_rails 0.1.0 → 0.3.0

This diff shows the contents of publicly available package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/.rspec +3 -0
  3. data/.rspec_status +161 -0
  4. data/CHANGELOG.md +38 -0
  5. data/Gemfile +20 -0
  6. data/Gemfile.lock +321 -0
  7. data/LICENSE +21 -0
  8. data/README.md +421 -0
  9. data/Rakefile +10 -0
  10. data/langsmithrb_rails-0.1.0.gem +0 -0
  11. data/langsmithrb_rails-0.1.1.gem +0 -0
  12. data/langsmithrb_rails.gemspec +45 -0
  13. data/lib/generators/langsmithrb_rails/buffer/buffer_generator.rb +94 -0
  14. data/lib/generators/langsmithrb_rails/buffer/templates/create_langsmith_run_buffers.rb +29 -0
  15. data/lib/generators/langsmithrb_rails/buffer/templates/flush_buffer_job.rb +40 -0
  16. data/lib/generators/langsmithrb_rails/buffer/templates/langsmith.rake +71 -0
  17. data/lib/generators/langsmithrb_rails/buffer/templates/langsmith_run_buffer.rb +70 -0
  18. data/lib/generators/langsmithrb_rails/buffer/templates/migration.rb +28 -0
  19. data/lib/generators/langsmithrb_rails/ci/ci_generator.rb +37 -0
  20. data/lib/generators/langsmithrb_rails/ci/templates/langsmith-evals.yml +85 -0
  21. data/lib/generators/langsmithrb_rails/ci/templates/langsmith_export_summary.rb +81 -0
  22. data/lib/generators/langsmithrb_rails/demo/demo_generator.rb +81 -0
  23. data/lib/generators/langsmithrb_rails/demo/templates/chat_controller.js +88 -0
  24. data/lib/generators/langsmithrb_rails/demo/templates/chat_controller.rb +58 -0
  25. data/lib/generators/langsmithrb_rails/demo/templates/chat_message.rb +24 -0
  26. data/lib/generators/langsmithrb_rails/demo/templates/create_chat_messages.rb +19 -0
  27. data/lib/generators/langsmithrb_rails/demo/templates/index.html.erb +180 -0
  28. data/lib/generators/langsmithrb_rails/demo/templates/llm_service.rb +165 -0
  29. data/lib/generators/langsmithrb_rails/evals/evals_generator.rb +52 -0
  30. data/lib/generators/langsmithrb_rails/evals/templates/checks/correctness.rb +71 -0
  31. data/lib/generators/langsmithrb_rails/evals/templates/checks/llm_graded.rb +137 -0
  32. data/lib/generators/langsmithrb_rails/evals/templates/datasets/sample.yml +60 -0
  33. data/lib/generators/langsmithrb_rails/evals/templates/langsmith_evals.rake +255 -0
  34. data/lib/generators/langsmithrb_rails/evals/templates/targets/http.rb +120 -0
  35. data/lib/generators/langsmithrb_rails/evals/templates/targets/ruby.rb +136 -0
  36. data/lib/generators/langsmithrb_rails/install/install_generator.rb +35 -0
  37. data/lib/generators/langsmithrb_rails/install/templates/config.yml +45 -0
  38. data/lib/generators/langsmithrb_rails/install/templates/initializer.rb +34 -0
  39. data/lib/generators/langsmithrb_rails/privacy/privacy_generator.rb +39 -0
  40. data/lib/generators/langsmithrb_rails/privacy/templates/custom_redactor.rb +132 -0
  41. data/lib/generators/langsmithrb_rails/privacy/templates/privacy.yml +88 -0
  42. data/lib/generators/langsmithrb_rails/privacy/templates/privacy_initializer.rb +41 -0
  43. data/lib/generators/langsmithrb_rails/tracing/templates/langsmith_traced.rb +146 -0
  44. data/lib/generators/langsmithrb_rails/tracing/templates/langsmith_traced_job.rb +151 -0
  45. data/lib/generators/langsmithrb_rails/tracing/templates/request_tracing.rb +117 -0
  46. data/lib/generators/langsmithrb_rails/tracing/tracing_generator.rb +78 -0
  47. data/lib/langsmithrb_rails/client.rb +292 -0
  48. data/lib/langsmithrb_rails/config.rb +169 -0
  49. data/lib/langsmithrb_rails/evaluation/evaluator.rb +178 -0
  50. data/lib/langsmithrb_rails/evaluation/llm_evaluator.rb +154 -0
  51. data/lib/langsmithrb_rails/evaluation/string_evaluator.rb +158 -0
  52. data/lib/langsmithrb_rails/evaluation.rb +76 -0
  53. data/lib/langsmithrb_rails/generators/langsmithrb_rails/langsmith_generator.rb +61 -0
  54. data/lib/langsmithrb_rails/generators/langsmithrb_rails/templates/langsmith_initializer.rb +22 -0
  55. data/lib/langsmithrb_rails/langsmith.rb +35 -0
  56. data/lib/langsmithrb_rails/otel/exporter.rb +120 -0
  57. data/lib/langsmithrb_rails/otel.rb +135 -0
  58. data/lib/langsmithrb_rails/railtie.rb +33 -0
  59. data/lib/langsmithrb_rails/redactor.rb +76 -0
  60. data/lib/langsmithrb_rails/run_trees.rb +157 -0
  61. data/lib/langsmithrb_rails/version.rb +5 -0
  62. data/lib/langsmithrb_rails/wrappers/anthropic.rb +146 -0
  63. data/lib/langsmithrb_rails/wrappers/base.rb +81 -0
  64. data/lib/langsmithrb_rails/wrappers/llm.rb +151 -0
  65. data/lib/langsmithrb_rails/wrappers/openai.rb +193 -0
  66. data/lib/langsmithrb_rails/wrappers.rb +41 -0
  67. data/lib/langsmithrb_rails.rb +151 -0
  68. data/pkg/langsmithrb_rails-0.3.0.gem +0 -0
  69. metadata +74 -7
data/lib/generators/langsmithrb_rails/demo/templates/index.html.erb
@@ -0,0 +1,180 @@
+ <%# Chat demo index page %>
+ <div class="chat-container" data-controller="chat">
+   <div class="chat-header">
+     <h1>LangSmith Rails Demo</h1>
+     <p class="subtitle">A simple chat interface with LangSmith tracing</p>
+   </div>
+
+   <div class="chat-messages" id="chat-messages">
+     <% if @messages.empty? %>
+       <div class="welcome-message">
+         <h2>Welcome to the LangSmith Rails Demo!</h2>
+         <p>This is a simple chat interface that demonstrates LangSmith tracing in a Rails application.</p>
+         <p>Every message you send will be traced with LangSmith, allowing you to monitor and debug your LLM interactions.</p>
+         <p>Try asking a question below to get started!</p>
+       </div>
+     <% else %>
+       <% @messages.each do |message| %>
+         <%= render partial: "message", locals: { message: message } %>
+       <% end %>
+     <% end %>
+   </div>
+
+   <div class="chat-form">
+     <%= form_with(model: @message, url: chat_index_path, data: { action: "submit->chat#submit" }) do |form| %>
+       <div class="input-group">
+         <%= form.text_area :content, placeholder: "Type your message...", class: "chat-input", data: { chat_target: "input" } %>
+         <button type="submit" class="send-button" data-chat-target="submitButton">
+           Send
+         </button>
+       </div>
+     <% end %>
+   </div>
+
+   <div class="chat-footer">
+     <p>
+       Powered by LangSmith Rails
+       <% if ENV["LANGSMITH_PROJECT"].present? %>
+         | Project: <%= ENV["LANGSMITH_PROJECT"] %>
+       <% end %>
+     </p>
+   </div>
+ </div>
+
+ <%# Partial for rendering a message %>
+ <% content_for :partials do %>
+   <script type="text/html" id="message-template">
+     <div class="message <%= message.is_user? ? 'user-message' : 'assistant-message' %>">
+       <div class="message-content">
+         <%= message.content %>
+       </div>
+       <div class="message-meta">
+         <span class="message-time"><%= message.created_at.strftime("%H:%M") %></span>
+         <span class="message-role"><%= message.is_user? ? "You" : "Assistant" %></span>
+       </div>
+     </div>
+   </script>
+ <% end %>
+
+ <style>
+   .chat-container {
+     max-width: 800px;
+     margin: 0 auto;
+     padding: 20px;
+     font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Open Sans", "Helvetica Neue", sans-serif;
+   }
+
+   .chat-header {
+     text-align: center;
+     margin-bottom: 20px;
+     padding-bottom: 10px;
+     border-bottom: 1px solid #eaeaea;
+   }
+
+   .chat-header h1 {
+     margin-bottom: 5px;
+     color: #333;
+   }
+
+   .subtitle {
+     color: #666;
+     margin-top: 0;
+   }
+
+   .chat-messages {
+     height: 500px;
+     overflow-y: auto;
+     padding: 10px;
+     background-color: #f9f9f9;
+     border-radius: 8px;
+     margin-bottom: 20px;
+   }
+
+   .welcome-message {
+     text-align: center;
+     padding: 40px 20px;
+     color: #666;
+   }
+
+   .welcome-message h2 {
+     color: #333;
+     margin-bottom: 15px;
+   }
+
+   .message {
+     margin-bottom: 15px;
+     padding: 10px 15px;
+     border-radius: 8px;
+     max-width: 80%;
+     position: relative;
+   }
+
+   .user-message {
+     background-color: #e1f5fe;
+     margin-left: auto;
+     border-bottom-right-radius: 0;
+   }
+
+   .assistant-message {
+     background-color: #f0f0f0;
+     margin-right: auto;
+     border-bottom-left-radius: 0;
+   }
+
+   .message-content {
+     white-space: pre-wrap;
+   }
+
+   .message-meta {
+     font-size: 0.8em;
+     color: #999;
+     margin-top: 5px;
+     text-align: right;
+   }
+
+   .chat-form {
+     margin-bottom: 20px;
+   }
+
+   .input-group {
+     display: flex;
+     gap: 10px;
+   }
+
+   .chat-input {
+     flex-grow: 1;
+     padding: 10px;
+     border: 1px solid #ddd;
+     border-radius: 4px;
+     resize: none;
+     height: 60px;
+     font-family: inherit;
+   }
+
+   .send-button {
+     padding: 10px 20px;
+     background-color: #2196f3;
+     color: white;
+     border: none;
+     border-radius: 4px;
+     cursor: pointer;
+     transition: background-color 0.2s;
+   }
+
+   .send-button:hover {
+     background-color: #0b7dda;
+   }
+
+   .send-button:disabled {
+     background-color: #cccccc;
+     cursor: not-allowed;
+   }
+
+   .chat-footer {
+     text-align: center;
+     color: #999;
+     font-size: 0.8em;
+     padding-top: 10px;
+     border-top: 1px solid #eaeaea;
+   }
+ </style>
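
The template above only assumes a message object that responds to content, created_at, and is_user?. The demo's real model ships in chat_message.rb (+24, not shown in this diff); a hypothetical stand-in with the same interface, handy for rendering the view in isolation, might look like:

    # Hypothetical stand-in for the demo's message interface; the real
    # ChatMessage model lives in chat_message.rb and is not shown in this diff.
    MessageStub = Struct.new(:content, :role, :created_at) do
      def is_user?
        role == "user"
      end
    end

    msg = MessageStub.new("Hello!", "user", Time.now)
    msg.is_user? # => true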
data/lib/generators/langsmithrb_rails/demo/templates/llm_service.rb
@@ -0,0 +1,165 @@
+ # frozen_string_literal: true
+
+ # Service for interacting with LLM providers
+ class LlmService
+   include LangsmithrbRails::TracedService
+
+   # Initialize the service
+   # @param provider [String] LLM provider to use (openai, anthropic, or mock)
+   def initialize(provider = ENV.fetch("LLM_PROVIDER", "<%= options[:provider] %>"))
+     @provider = provider.to_s.downcase
+   end
+
+   # Generate a response to a prompt
+   # @param prompt [String] User prompt
+   # @param context [Array<Hash>] Previous messages for context
+   # @param options [Hash] Additional options
+   # @return [String] Generated response
+   def generate(prompt, context = [], options = {})
+     # Create trace with LangSmith
+     langsmith_trace("llm_generate", inputs: { prompt: prompt, context: context }) do |run|
+       response = case @provider
+                  when "openai"
+                    generate_with_openai(prompt, context, options)
+                  when "anthropic"
+                    generate_with_anthropic(prompt, context, options)
+                  else
+                    generate_with_mock(prompt, context, options)
+                  end
+
+       # Record the output
+       run.outputs = { response: response }
+
+       response
+     end
+   end
+
+   private
+
+   # Generate with OpenAI
+   # @param prompt [String] User prompt
+   # @param context [Array<Hash>] Previous messages for context
+   # @param options [Hash] Additional options
+   # @return [String] Generated response
+   def generate_with_openai(prompt, context = [], options = {})
+     require "openai"
+
+     client = OpenAI::Client.new(access_token: ENV.fetch("OPENAI_API_KEY"))
+
+     # Format messages for OpenAI
+     messages = format_openai_messages(prompt, context)
+
+     # Call OpenAI API with LangSmith tracing
+     langsmith_trace("openai_chat_completion",
+                     inputs: { messages: messages },
+                     run_type: "llm") do |run|
+       response = client.chat(
+         parameters: {
+           model: options[:model] || ENV.fetch("OPENAI_MODEL", "gpt-3.5-turbo"),
+           messages: messages,
+           temperature: options[:temperature] || 0.7,
+           max_tokens: options[:max_tokens] || 1000
+         }
+       )
+
+       # Record the API response
+       run.outputs = { response: response }
+
+       # Extract and return the text
+       response.dig("choices", 0, "message", "content")
+     end
+   rescue => e
+     Rails.logger.error("OpenAI API error: #{e.message}")
+     "Sorry, I encountered an error: #{e.message}"
+   end
+
+   # Generate with Anthropic
+   # @param prompt [String] User prompt
+   # @param context [Array<Hash>] Previous messages for context
+   # @param options [Hash] Additional options
+   # @return [String] Generated response
+   def generate_with_anthropic(prompt, context = [], options = {})
+     require "anthropic"
+
+     client = Anthropic::Client.new(api_key: ENV.fetch("ANTHROPIC_API_KEY"))
+
+     # Format messages for Anthropic
+     messages = format_anthropic_messages(prompt, context)
+
+     # Call Anthropic API with LangSmith tracing
+     langsmith_trace("anthropic_completion",
+                     inputs: { messages: messages },
+                     run_type: "llm") do |run|
+       response = client.messages.create(
+         model: options[:model] || ENV.fetch("ANTHROPIC_MODEL", "claude-2"),
+         max_tokens: options[:max_tokens] || 1000,
+         temperature: options[:temperature] || 0.7,
+         messages: messages
+       )
+
+       # Record the API response
+       run.outputs = { response: response }
+
+       # Extract and return the text
+       response.content.first.text
+     end
+   rescue => e
+     Rails.logger.error("Anthropic API error: #{e.message}")
+     "Sorry, I encountered an error: #{e.message}"
+   end
+
+   # Generate with mock (for testing)
+   # @param prompt [String] User prompt
+   # @param context [Array<Hash>] Previous messages for context
+   # @param options [Hash] Additional options
+   # @return [String] Generated response
+   def generate_with_mock(prompt, context = [], options = {})
+     # Simulate a delay
+     sleep(0.5)
+
+     # Generate a mock response
+     "This is a mock response to: \"#{prompt}\"\n\nIn a real application, this would be generated by an LLM API."
+   end
+
+   # Format messages for OpenAI
+   # @param prompt [String] User prompt
+   # @param context [Array<Hash>] Previous messages for context
+   # @return [Array<Hash>] Formatted messages
+   def format_openai_messages(prompt, context)
+     messages = []
+
+     # Add system message if provided
+     system_message = ENV["OPENAI_SYSTEM_MESSAGE"]
+     messages << { role: "system", content: system_message } if system_message.present?
+
+     # Add context messages
+     context.each do |message|
+       role = message[:role] || (message[:is_user] ? "user" : "assistant")
+       messages << { role: role, content: message[:content] }
+     end
+
+     # Add the current prompt
+     messages << { role: "user", content: prompt }
+
+     messages
+   end
+
+   # Format messages for Anthropic
+   # @param prompt [String] User prompt
+   # @param context [Array<Hash>] Previous messages for context
+   # @return [Array<Hash>] Formatted messages
+   def format_anthropic_messages(prompt, context)
+     messages = []
+
+     # Add context messages
+     context.each do |message|
+       role = message[:is_user] ? "user" : "assistant"
+       messages << { role: role, content: message[:content] }
+     end
+
+     # Add the current prompt
+     messages << { role: "user", content: prompt }
+
+     messages
+   end
+ end
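
A minimal sketch of how the generated LlmService might be driven, assuming the demo's ChatMessage model and inferring the context shape from format_openai_messages above (the call-site names here are illustrative, not part of the template):

    # Hedged usage sketch: LlmService is the template above; ChatMessage and
    # the query are assumptions based on the demo generator's other files.
    service = LlmService.new("openai")

    # Context entries use the { is_user:, content: } shape the formatters read.
    context = ChatMessage.order(:created_at).map do |m|
      { is_user: m.is_user?, content: m.content }
    end

    reply = service.generate("Summarize our chat so far.", context, { temperature: 0.2 })
    # Each call should produce an "llm_generate" parent run with a nested
    # provider-specific "llm" run in LangSmith.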
data/lib/generators/langsmithrb_rails/evals/evals_generator.rb
@@ -0,0 +1,52 @@
+ # frozen_string_literal: true
+
+ module LangsmithrbRails
+   module Generators
+     # Generator for adding LangSmith evaluations to Rails applications
+     class EvalsGenerator < Rails::Generators::Base
+       source_root File.expand_path("templates", __dir__)
+
+       desc "Adds LangSmith evaluations to your Rails application"
+
+       def create_directories
+         empty_directory "config/langsmith/evals/datasets"
+         empty_directory "config/langsmith/evals/checks"
+         empty_directory "config/langsmith/evals/targets"
+       end
+
+       def create_sample_dataset
+         template "datasets/sample.yml", "config/langsmith/evals/datasets/sample.yml"
+       end
+
+       def create_sample_check
+         template "checks/correctness.rb", "config/langsmith/evals/checks/correctness.rb"
+         template "checks/llm_graded.rb", "config/langsmith/evals/checks/llm_graded.rb"
+       end
+
+       def create_sample_target
+         template "targets/http.rb", "config/langsmith/evals/targets/http.rb"
+         template "targets/ruby.rb", "config/langsmith/evals/targets/ruby.rb"
+       end
+
+       def create_rake_task
+         template "langsmith_evals.rake", "lib/tasks/langsmith_evals.rake"
+       end
+
+       def display_post_install_message
+         say "\n"
+         say "LangSmith evaluations have been added to your Rails application! 🎉", :green
+         say "\n"
+         say "Usage:", :yellow
+         say "  1. Customize the sample dataset in config/langsmith/evals/datasets/", :yellow
+         say "  2. Run an evaluation:", :yellow
+         say "     bin/rails langsmith:eval[sample,http,my_experiment]", :yellow
+         say "  3. Compare experiments:", :yellow
+         say "     bin/rails langsmith:compare[exp_a,exp_b]", :yellow
+         say "\n"
+         say "To add CI integration for evaluations, run:", :yellow
+         say "  bin/rails g langsmithrb_rails:ci", :yellow
+         say "\n"
+       end
+     end
+   end
+ end
data/lib/generators/langsmithrb_rails/evals/templates/checks/correctness.rb
@@ -0,0 +1,71 @@
+ # frozen_string_literal: true
+
+ module LangsmithrbRails
+   module Evals
+     module Checks
+       # Simple correctness check for evaluating LLM responses
+       class Correctness
+         # Check if the response is correct
+         # @param input [Hash] Input data
+         # @param response [Hash] Response data
+         # @param expected [Hash] Expected output data
+         # @return [Hash] Evaluation result
+         def self.evaluate(input, response, expected)
+           result = {
+             score: 0.0,
+             reasoning: "",
+             passed: false
+           }
+
+           # Extract the answer from the response
+           answer = extract_answer(response)
+
+           # Check for exact match
+           if expected["answer"] && answer == expected["answer"]
+             result[:score] = 1.0
+             result[:reasoning] = "Exact match with expected answer"
+             result[:passed] = true
+             return result
+           end
+
+           # Check for partial matches using contains
+           if expected["answer_contains"] && expected["answer_contains"].is_a?(Array)
+             matches = expected["answer_contains"].select { |phrase| answer.include?(phrase) }
+             match_ratio = matches.size.to_f / expected["answer_contains"].size
+
+             result[:score] = match_ratio
+             result[:reasoning] = "Matched #{matches.size}/#{expected["answer_contains"].size} expected phrases"
+             result[:passed] = match_ratio >= 0.5
+             return result
+           end
+
+           # Check for code snippets
+           if expected["code_contains"] && expected["code_contains"].is_a?(Array)
+             matches = expected["code_contains"].select { |phrase| answer.include?(phrase) }
+             match_ratio = matches.size.to_f / expected["code_contains"].size
+
+             result[:score] = match_ratio
+             result[:reasoning] = "Code snippet matched #{matches.size}/#{expected["code_contains"].size} expected elements"
+             result[:passed] = match_ratio >= 0.5
+             return result
+           end
+
+           # No match found
+           result[:reasoning] = "No matching criteria found"
+           result
+         end
+
+         # Extract the answer from the response
+         # @param response [Hash] Response data
+         # @return [String] Extracted answer
+         def self.extract_answer(response)
+           return response["answer"] if response["answer"]
+           return response["text"] if response["text"]
+           return response["content"] if response["content"]
+           return response["output"] if response["output"]
+           return response.to_s
+         end
+       end
+     end
+   end
+ end
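
To make the check's contract concrete, here is how Correctness.evaluate scores a partial match such as question_4 from the sample dataset below (the response hash is invented for illustration):

    expected = {
      "answer_contains" => ["attention mechanism", "self-attention", "encoder", "decoder"]
    }
    response = {
      "answer" => "Transformers rely on a self-attention mechanism inside each encoder layer."
    }

    result = LangsmithrbRails::Evals::Checks::Correctness.evaluate({}, response, expected)
    # Three of the four phrases ("attention mechanism", "self-attention",
    # "encoder") appear in the answer, so score is 0.75 and passed is true
    # (the pass threshold is a match ratio >= 0.5).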
data/lib/generators/langsmithrb_rails/evals/templates/checks/llm_graded.rb
@@ -0,0 +1,137 @@
+ # frozen_string_literal: true
+
+ module LangsmithrbRails
+   module Evals
+     module Checks
+       # LLM-based grading for evaluating responses
+       class LlmGraded
+         # Check if the response is correct using an LLM
+         # @param input [Hash] Input data
+         # @param response [Hash] Response data
+         # @param expected [Hash] Expected output data
+         # @return [Hash] Evaluation result
+         def self.evaluate(input, response, expected)
+           result = {
+             score: 0.0,
+             reasoning: "",
+             passed: false
+           }
+
+           # Extract the answer from the response
+           answer = extract_answer(response)
+           expected_answer = extract_answer(expected)
+
+           # Create the prompt for the LLM
+           prompt = create_grading_prompt(input, answer, expected_answer)
+
+           # Call the LLM for grading
+           llm_response = call_llm(prompt)
+
+           # Parse the LLM response
+           parsed_result = parse_llm_response(llm_response)
+
+           # Update the result with the parsed data
+           result[:score] = parsed_result[:score]
+           result[:reasoning] = parsed_result[:reasoning]
+           result[:passed] = parsed_result[:score] >= 0.7
+
+           result
+         end
+
+         # Extract the answer from the response
+         # @param response [Hash] Response data
+         # @return [String] Extracted answer
+         def self.extract_answer(response)
+           return response["answer"] if response["answer"]
+           return response["text"] if response["text"]
+           return response["content"] if response["content"]
+           return response["output"] if response["output"]
+           return response.to_s
+         end
+
+         # Create a prompt for the LLM to grade the response
+         # @param input [Hash] Input data
+         # @param answer [String] Actual answer
+         # @param expected_answer [String] Expected answer
+         # @return [String] Prompt for the LLM
+         def self.create_grading_prompt(input, answer, expected_answer)
+           <<~PROMPT
+             You are an expert evaluator. Your task is to grade the quality and correctness of a response.
+
+             Question: #{input["question"]}
+
+             Expected Answer: #{expected_answer}
+
+             Actual Response: #{answer}
+
+             Please evaluate the response based on:
+             1. Correctness: Is the information accurate?
+             2. Completeness: Does it fully address the question?
+             3. Clarity: Is it well-explained and easy to understand?
+
+             Provide your evaluation in the following format:
+
+             Score: [a number between 0.0 and 1.0]
+             Reasoning: [your detailed explanation]
+           PROMPT
+         end
+
+         # Call the LLM for grading
+         # @param prompt [String] Prompt for the LLM
+         # @return [String] LLM response
+         def self.call_llm(prompt)
+           # Check if OpenAI is configured
+           if defined?(OpenAI) && ENV["OPENAI_API_KEY"].present?
+             client = OpenAI::Client.new(access_token: ENV["OPENAI_API_KEY"])
+             response = client.chat(
+               parameters: {
+                 model: ENV.fetch("LANGSMITH_EVAL_MODEL", "gpt-3.5-turbo"),
+                 messages: [{ role: "user", content: prompt }],
+                 temperature: 0.0
+               }
+             )
+             return response.dig("choices", 0, "message", "content")
+           end
+
+           # Check if Anthropic is configured
+           if defined?(Anthropic) && ENV["ANTHROPIC_API_KEY"].present?
+             client = Anthropic::Client.new(api_key: ENV["ANTHROPIC_API_KEY"])
+             response = client.messages.create(
+               model: ENV.fetch("LANGSMITH_EVAL_MODEL", "claude-2"),
+               max_tokens: 1024,
+               messages: [{ role: "user", content: prompt }]
+             )
+             return response.content.first.text
+           end
+
+           # Fall back to a simple evaluation
+           "Score: 0.5\nReasoning: Unable to perform LLM-based evaluation. Please configure an LLM provider."
+         end
+
+         # Parse the LLM response
+         # @param response [String] LLM response
+         # @return [Hash] Parsed result
+         def self.parse_llm_response(response)
+           result = {
+             score: 0.5,
+             reasoning: "Unable to parse LLM response"
+           }
+
+           # Extract score
+           if response =~ /Score:\s*([\d\.]+)/i
+             result[:score] = $1.to_f
+             # Ensure score is between 0 and 1
+             result[:score] = [0.0, [1.0, result[:score]].min].max
+           end
+
+           # Extract reasoning
+           if response =~ /Reasoning:\s*(.+)/im
+             result[:reasoning] = $1.strip
+           end
+
+           result
+         end
+       end
+     end
+   end
+ end
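
The regex parser at the bottom is worth sanity-checking in isolation; given a well-formed grading reply it extracts the score (clamped to 0.0–1.0) and the reasoning text:

    reply = "Score: 0.9\nReasoning: Accurate and complete, though slightly verbose."
    LangsmithrbRails::Evals::Checks::LlmGraded.parse_llm_response(reply)
    # => { score: 0.9, reasoning: "Accurate and complete, though slightly verbose." }
    # A reply the regexes cannot match falls back to score 0.5 with an
    # "Unable to parse LLM response" reasoning.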
data/lib/generators/langsmithrb_rails/evals/templates/datasets/sample.yml
@@ -0,0 +1,60 @@
+ # Sample evaluation dataset
+ # This file defines a set of inputs and expected outputs for LLM evaluation
+
+ name: sample_dataset
+ description: A sample dataset for evaluating LLM responses
+ version: 1.0.0
+
+ # Each item in the dataset represents a test case
+ items:
+   - id: question_1
+     input:
+       question: "What is the capital of France?"
+     expected_output:
+       answer: "Paris"
+     metadata:
+       category: geography
+       difficulty: easy
+
+   - id: question_2
+     input:
+       question: "Who wrote 'Pride and Prejudice'?"
+     expected_output:
+       answer: "Jane Austen"
+     metadata:
+       category: literature
+       difficulty: easy
+
+   - id: question_3
+     input:
+       question: "What is the formula for calculating the area of a circle?"
+     expected_output:
+       answer: "A = πr²"
+     metadata:
+       category: mathematics
+       difficulty: medium
+
+   - id: question_4
+     input:
+       question: "Explain how a transformer neural network works."
+     expected_output:
+       answer_contains:
+         - "attention mechanism"
+         - "self-attention"
+         - "encoder"
+         - "decoder"
+     metadata:
+       category: machine_learning
+       difficulty: hard
+
+   - id: question_5
+     input:
+       question: "Write a function to check if a string is a palindrome."
+     expected_output:
+       code_contains:
+         - "function"
+         - "return"
+         - "reverse"
+     metadata:
+       category: programming
+       difficulty: medium
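
Finally, a sketch of how a dataset like this can be fed through a check outside the bundled rake task (lib/tasks/langsmith_evals.rake is the supported entry point; this loop, and the call_target stand-in for the generated http.rb/ruby.rb targets, are assumptions for illustration only):

    require "yaml"

    dataset = YAML.load_file("config/langsmith/evals/datasets/sample.yml")

    dataset["items"].each do |item|
      # call_target is a hypothetical stand-in for one of the generated targets.
      response = call_target(item["input"])
      result = LangsmithrbRails::Evals::Checks::Correctness.evaluate(
        item["input"], response, item["expected_output"]
      )
      puts "#{item['id']}: score=#{result[:score]} passed=#{result[:passed]}"
    end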