rubyllm-observ 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +778 -0
  3. data/Rakefile +49 -0
  4. data/app/assets/javascripts/observ/application.js +12 -0
  5. data/app/assets/javascripts/observ/controllers/autoscroll_controller.js +33 -0
  6. data/app/assets/javascripts/observ/controllers/chat_form_controller.js +93 -0
  7. data/app/assets/javascripts/observ/controllers/copy_controller.js +43 -0
  8. data/app/assets/javascripts/observ/controllers/dashboard_controller.js +58 -0
  9. data/app/assets/javascripts/observ/controllers/drawer_controller.js +58 -0
  10. data/app/assets/javascripts/observ/controllers/expandable_controller.js +33 -0
  11. data/app/assets/javascripts/observ/controllers/filter_controller.js +36 -0
  12. data/app/assets/javascripts/observ/controllers/index.js +52 -0
  13. data/app/assets/javascripts/observ/controllers/json_viewer_controller.js +260 -0
  14. data/app/assets/javascripts/observ/controllers/message_form_controller.js +58 -0
  15. data/app/assets/javascripts/observ/controllers/prompt_variables_controller.js +64 -0
  16. data/app/assets/javascripts/observ/controllers/text_select_controller.js +14 -0
  17. data/app/assets/stylesheets/observ/_annotations.scss +127 -0
  18. data/app/assets/stylesheets/observ/_card.scss +52 -0
  19. data/app/assets/stylesheets/observ/_chat.scss +156 -0
  20. data/app/assets/stylesheets/observ/_components.scss +460 -0
  21. data/app/assets/stylesheets/observ/_dashboard.scss +40 -0
  22. data/app/assets/stylesheets/observ/_datasets.scss +697 -0
  23. data/app/assets/stylesheets/observ/_drawer.scss +273 -0
  24. data/app/assets/stylesheets/observ/_json_viewer.scss +120 -0
  25. data/app/assets/stylesheets/observ/_layout.scss +256 -0
  26. data/app/assets/stylesheets/observ/_metrics.scss +99 -0
  27. data/app/assets/stylesheets/observ/_observations.scss +160 -0
  28. data/app/assets/stylesheets/observ/_pagination.scss +143 -0
  29. data/app/assets/stylesheets/observ/_prompts.scss +365 -0
  30. data/app/assets/stylesheets/observ/_table.scss +53 -0
  31. data/app/assets/stylesheets/observ/_variables.scss +53 -0
  32. data/app/assets/stylesheets/observ/application.scss +15 -0
  33. data/app/controllers/observ/annotations_controller.rb +144 -0
  34. data/app/controllers/observ/application_controller.rb +8 -0
  35. data/app/controllers/observ/chats_controller.rb +58 -0
  36. data/app/controllers/observ/dashboard_controller.rb +159 -0
  37. data/app/controllers/observ/dataset_items_controller.rb +85 -0
  38. data/app/controllers/observ/dataset_run_items_controller.rb +84 -0
  39. data/app/controllers/observ/dataset_runs_controller.rb +110 -0
  40. data/app/controllers/observ/datasets_controller.rb +74 -0
  41. data/app/controllers/observ/messages_controller.rb +26 -0
  42. data/app/controllers/observ/observations_controller.rb +59 -0
  43. data/app/controllers/observ/prompt_versions_controller.rb +148 -0
  44. data/app/controllers/observ/prompts_controller.rb +205 -0
  45. data/app/controllers/observ/sessions_controller.rb +45 -0
  46. data/app/controllers/observ/traces_controller.rb +86 -0
  47. data/app/forms/observ/prompt_form.rb +96 -0
  48. data/app/helpers/observ/application_helper.rb +9 -0
  49. data/app/helpers/observ/chats_helper.rb +47 -0
  50. data/app/helpers/observ/dashboard_helper.rb +154 -0
  51. data/app/helpers/observ/datasets_helper.rb +62 -0
  52. data/app/helpers/observ/pagination_helper.rb +38 -0
  53. data/app/jobs/observ/application_job.rb +4 -0
  54. data/app/jobs/observ/dataset_runner_job.rb +49 -0
  55. data/app/mailers/observ/application_mailer.rb +6 -0
  56. data/app/models/concerns/observ/agent_phaseable.rb +124 -0
  57. data/app/models/concerns/observ/agent_selectable.rb +50 -0
  58. data/app/models/concerns/observ/chat_enhancements.rb +109 -0
  59. data/app/models/concerns/observ/message_enhancements.rb +31 -0
  60. data/app/models/concerns/observ/observability_instrumentation.rb +124 -0
  61. data/app/models/concerns/observ/prompt_management.rb +320 -0
  62. data/app/models/concerns/observ/trace_association.rb +9 -0
  63. data/app/models/observ/annotation.rb +23 -0
  64. data/app/models/observ/application_record.rb +5 -0
  65. data/app/models/observ/dataset.rb +51 -0
  66. data/app/models/observ/dataset_item.rb +41 -0
  67. data/app/models/observ/dataset_run.rb +104 -0
  68. data/app/models/observ/dataset_run_item.rb +111 -0
  69. data/app/models/observ/generation.rb +56 -0
  70. data/app/models/observ/null_prompt.rb +59 -0
  71. data/app/models/observ/observation.rb +38 -0
  72. data/app/models/observ/prompt.rb +315 -0
  73. data/app/models/observ/score.rb +51 -0
  74. data/app/models/observ/session.rb +131 -0
  75. data/app/models/observ/span.rb +13 -0
  76. data/app/models/observ/trace.rb +135 -0
  77. data/app/presenters/observ/agent_select_presenter.rb +59 -0
  78. data/app/services/observ/agent_executor_service.rb +174 -0
  79. data/app/services/observ/agent_provider.rb +60 -0
  80. data/app/services/observ/agent_selection_service.rb +53 -0
  81. data/app/services/observ/chat_instrumenter.rb +523 -0
  82. data/app/services/observ/dataset_runner_service.rb +153 -0
  83. data/app/services/observ/evaluator_runner_service.rb +58 -0
  84. data/app/services/observ/evaluators/base_evaluator.rb +51 -0
  85. data/app/services/observ/evaluators/contains_evaluator.rb +53 -0
  86. data/app/services/observ/evaluators/exact_match_evaluator.rb +23 -0
  87. data/app/services/observ/evaluators/json_structure_evaluator.rb +44 -0
  88. data/app/services/observ/prompt_manager/cache_statistics.rb +82 -0
  89. data/app/services/observ/prompt_manager/caching.rb +167 -0
  90. data/app/services/observ/prompt_manager/comparison.rb +49 -0
  91. data/app/services/observ/prompt_manager/version_management.rb +96 -0
  92. data/app/services/observ/prompt_manager.rb +40 -0
  93. data/app/services/observ/trace_text_formatter.rb +349 -0
  94. data/app/validators/observ/prompt_config_validator.rb +187 -0
  95. data/app/views/kaminari/_first_page.html.erb +11 -0
  96. data/app/views/kaminari/_gap.html.erb +8 -0
  97. data/app/views/kaminari/_last_page.html.erb +11 -0
  98. data/app/views/kaminari/_next_page.html.erb +11 -0
  99. data/app/views/kaminari/_page.html.erb +12 -0
  100. data/app/views/kaminari/_paginator.html.erb +25 -0
  101. data/app/views/kaminari/_prev_page.html.erb +11 -0
  102. data/app/views/kaminari/observ/_first_page.html.erb +11 -0
  103. data/app/views/kaminari/observ/_gap.html.erb +8 -0
  104. data/app/views/kaminari/observ/_last_page.html.erb +11 -0
  105. data/app/views/kaminari/observ/_next_page.html.erb +11 -0
  106. data/app/views/kaminari/observ/_page.html.erb +12 -0
  107. data/app/views/kaminari/observ/_paginator.html.erb +25 -0
  108. data/app/views/kaminari/observ/_prev_page.html.erb +11 -0
  109. data/app/views/layouts/observ/application.html.erb +88 -0
  110. data/app/views/observ/annotations/_annotation.html.erb +13 -0
  111. data/app/views/observ/annotations/_form.html.erb +28 -0
  112. data/app/views/observ/annotations/index.html.erb +28 -0
  113. data/app/views/observ/annotations/sessions_index.html.erb +48 -0
  114. data/app/views/observ/annotations/traces_index.html.erb +48 -0
  115. data/app/views/observ/chats/_form.html.erb +45 -0
  116. data/app/views/observ/chats/index.html.erb +67 -0
  117. data/app/views/observ/chats/new.html.erb +17 -0
  118. data/app/views/observ/chats/show.html.erb +34 -0
  119. data/app/views/observ/dashboard/index.html.erb +236 -0
  120. data/app/views/observ/dataset_items/_form.html.erb +49 -0
  121. data/app/views/observ/dataset_items/edit.html.erb +18 -0
  122. data/app/views/observ/dataset_items/index.html.erb +95 -0
  123. data/app/views/observ/dataset_items/new.html.erb +18 -0
  124. data/app/views/observ/dataset_run_items/_score_close_drawer.html.erb +4 -0
  125. data/app/views/observ/dataset_run_items/_score_drawer.html.erb +75 -0
  126. data/app/views/observ/dataset_run_items/_score_success.html.erb +29 -0
  127. data/app/views/observ/dataset_run_items/_scores_cell.html.erb +19 -0
  128. data/app/views/observ/dataset_run_items/details_drawer.turbo_stream.erb +80 -0
  129. data/app/views/observ/dataset_run_items/score_drawer.turbo_stream.erb +7 -0
  130. data/app/views/observ/dataset_runs/index.html.erb +108 -0
  131. data/app/views/observ/dataset_runs/new.html.erb +57 -0
  132. data/app/views/observ/dataset_runs/review.html.erb +155 -0
  133. data/app/views/observ/dataset_runs/show.html.erb +166 -0
  134. data/app/views/observ/datasets/_form.html.erb +62 -0
  135. data/app/views/observ/datasets/_items_tab.html.erb +66 -0
  136. data/app/views/observ/datasets/_runs_tab.html.erb +82 -0
  137. data/app/views/observ/datasets/edit.html.erb +32 -0
  138. data/app/views/observ/datasets/index.html.erb +105 -0
  139. data/app/views/observ/datasets/new.html.erb +18 -0
  140. data/app/views/observ/datasets/show.html.erb +67 -0
  141. data/app/views/observ/messages/_content.html.erb +1 -0
  142. data/app/views/observ/messages/_form.html.erb +33 -0
  143. data/app/views/observ/messages/_message.html.erb +14 -0
  144. data/app/views/observ/messages/_tool_calls.html.erb +10 -0
  145. data/app/views/observ/messages/create.turbo_stream.erb +9 -0
  146. data/app/views/observ/observations/index.html.erb +97 -0
  147. data/app/views/observ/observations/show_generation.html.erb +195 -0
  148. data/app/views/observ/observations/show_span.html.erb +93 -0
  149. data/app/views/observ/prompts/_diff_content.html.erb +16 -0
  150. data/app/views/observ/prompts/_form.html.erb +111 -0
  151. data/app/views/observ/prompts/_new_form.html.erb +102 -0
  152. data/app/views/observ/prompts/_prompt_actions.html.erb +4 -0
  153. data/app/views/observ/prompts/_prompt_content_highlighted.html.erb +4 -0
  154. data/app/views/observ/prompts/_version_actions.html.erb +40 -0
  155. data/app/views/observ/prompts/compare.html.erb +155 -0
  156. data/app/views/observ/prompts/edit.html.erb +17 -0
  157. data/app/views/observ/prompts/index.html.erb +108 -0
  158. data/app/views/observ/prompts/new.html.erb +17 -0
  159. data/app/views/observ/prompts/show.html.erb +138 -0
  160. data/app/views/observ/prompts/versions.html.erb +87 -0
  161. data/app/views/observ/sessions/annotations_drawer.turbo_stream.erb +25 -0
  162. data/app/views/observ/sessions/drawer_test.turbo_stream.erb +49 -0
  163. data/app/views/observ/sessions/index.html.erb +91 -0
  164. data/app/views/observ/sessions/show.html.erb +251 -0
  165. data/app/views/observ/traces/add_to_dataset_drawer.turbo_stream.erb +48 -0
  166. data/app/views/observ/traces/annotations_drawer.turbo_stream.erb +25 -0
  167. data/app/views/observ/traces/index.html.erb +87 -0
  168. data/app/views/observ/traces/show.html.erb +285 -0
  169. data/app/views/observ/traces/text_output_drawer.turbo_stream.erb +48 -0
  170. data/app/views/shared/_drawer.html.erb +26 -0
  171. data/config/routes.rb +80 -0
  172. data/db/migrate/001_create_observ_sessions.rb +21 -0
  173. data/db/migrate/002_create_observ_traces.rb +25 -0
  174. data/db/migrate/003_create_observ_observations.rb +42 -0
  175. data/db/migrate/004_add_message_id_to_observ_traces.rb +7 -0
  176. data/db/migrate/005_create_observ_prompts.rb +21 -0
  177. data/db/migrate/006_fix_prompt_config_strings.rb +23 -0
  178. data/db/migrate/007_create_observ_annotations.rb +12 -0
  179. data/db/migrate/009_add_prompt_fields_to_observ_chats.rb +11 -0
  180. data/db/migrate/010_create_observ_datasets.rb +15 -0
  181. data/db/migrate/011_create_observ_dataset_items.rb +17 -0
  182. data/db/migrate/012_create_observ_dataset_runs.rb +22 -0
  183. data/db/migrate/013_create_observ_dataset_run_items.rb +16 -0
  184. data/db/migrate/014_create_observ_scores.rb +26 -0
  185. data/lib/generators/observ/add_phase_tracking/add_phase_tracking_generator.rb +150 -0
  186. data/lib/generators/observ/add_phase_tracking/templates/migration.rb.tt +6 -0
  187. data/lib/generators/observ/install/USAGE +27 -0
  188. data/lib/generators/observ/install/install_generator.rb +270 -0
  189. data/lib/generators/observ/install_chat/install_chat_generator.rb +313 -0
  190. data/lib/generators/observ/install_chat/templates/agents/base_agent.rb.tt +147 -0
  191. data/lib/generators/observ/install_chat/templates/agents/simple_agent.rb.tt +55 -0
  192. data/lib/generators/observ/install_chat/templates/concerns/observ_chat_enhancements.rb.tt +34 -0
  193. data/lib/generators/observ/install_chat/templates/concerns/observ_message_enhancements.rb.tt +18 -0
  194. data/lib/generators/observ/install_chat/templates/initializers/observability.rb.tt +20 -0
  195. data/lib/generators/observ/install_chat/templates/jobs/chat_response_job.rb.tt +56 -0
  196. data/lib/generators/observ/install_chat/templates/migrations/add_agent_class_name.rb.tt +6 -0
  197. data/lib/generators/observ/install_chat/templates/migrations/add_observability_session_id.rb.tt +6 -0
  198. data/lib/generators/observ/install_chat/templates/tools/think_tool.rb.tt +29 -0
  199. data/lib/generators/observ/install_chat/templates/views/messages/_content.html.erb.tt +1 -0
  200. data/lib/observ/asset_installer.rb +130 -0
  201. data/lib/observ/asset_syncer.rb +104 -0
  202. data/lib/observ/configuration.rb +108 -0
  203. data/lib/observ/engine.rb +50 -0
  204. data/lib/observ/index_file_generator.rb +142 -0
  205. data/lib/observ/instrumenter/ruby_llm.rb +6 -0
  206. data/lib/observ/version.rb +3 -0
  207. data/lib/observ.rb +29 -0
  208. data/lib/tasks/observ_tasks.rake +75 -0
  209. metadata +453 -0
@@ -0,0 +1,523 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Observ
4
+ class ChatInstrumenter
5
+ attr_reader :session, :chat, :current_trace, :current_tool_span
6
+
7
# Sets up an instrumenter for one chat within one observability session.
#
# session - object used to create traces (see #create_trace)
# chat    - chat object whose `ask` method and event hooks get instrumented
# context - Hash merged into the metadata of every trace/generation created here
def initialize(session, chat, context: {})
  @session, @chat, @context = session, chat, context

  # Nothing is active and nothing is wrapped until #instrument! runs.
  @current_trace = @current_tool_span = @original_ask_method = nil
  @instrumented = false
end
16
+
17
# Wraps the chat's `ask` method and registers the event handlers.
# Does nothing (and returns nil) when this instrumenter already ran.
def instrument!
  unless @instrumented
    wrap_ask_method
    setup_event_handlers
    @instrumented = true

    Rails.logger.info "[Observability] Instrumented chat for session #{session.session_id}"
  end
end
26
+
27
# Creates a trace on the session and remembers it as the current trace.
#
# name     - trace name (defaults to "chat_exchange")
# input    - value stored as the trace input
# metadata - Hash merged over the instrumenter's context; its keys win on conflict
#
# Returns the trace returned by session.create_trace.
def create_trace(name: "chat_exchange", input: nil, metadata: {})
  trace_metadata = @context.merge(metadata)
  @current_trace = session.create_trace(name: name, input: input, metadata: trace_metadata)
end
34
+
35
# Finalizes the current trace with the given output and clears it.
# No-op when there is no current trace.
def finalize_current_trace(output: nil)
  active_trace = @current_trace
  return unless active_trace

  active_trace.finalize(output: output)
  @current_trace = nil
end
41
+
42
+ private
43
+
44
# Replaces the chat's `ask` with a singleton method that routes every call
# through #handle_ask_call. The original bound method is kept in
# @original_ask_method; a second call is a no-op.
def wrap_ask_method
  return if @original_ask_method

  @original_ask_method = chat.method(:ask)
  # Captured in a local because `self` inside the singleton method is the chat.
  owner = self

  chat.define_singleton_method(:ask) do |*call_args, **call_kwargs, &call_block|
    owner.send(:handle_ask_call, self, call_args, call_kwargs, call_block)
  end
end
54
+
55
# Runs one intercepted `ask` call under observability.
#
# Reuses the trace in @current_trace when there is one; otherwise creates a
# "chat.ask" trace that lives only for this call (an "ephemeral" trace).
# A generation named "llm_call" is recorded around the original `ask`, which
# is invoked with the caller's arguments and block unchanged.
#
# chat_instance - the chat object whose `ask` was wrapped
# args          - positional arguments given to `ask`; args[0] is treated as the user message
# kwargs        - keyword arguments given to `ask`; kwargs[:with] is treated as attachments
# block         - block given to `ask` (may be nil)
#
# Returns whatever the original `ask` returned.
# Re-raises any StandardError after recording it through #handle_error.
def handle_ask_call(chat_instance, args, kwargs, block)
  user_message = args[0]
  attachments = kwargs[:with]

  # Track if this is an ephemeral trace (created just for this call)
  is_ephemeral_trace = @current_trace.nil?

  trace = @current_trace || create_trace(
    name: "chat.ask",
    input: format_input(user_message, attachments),
    metadata: {
      has_attachments: attachments.present?,
      attachment_count: Array(attachments).size
    }
  )

  model_id = extract_model_id(chat_instance)

  # Extract prompt metadata from the chat's agent (if available)
  prompt_metadata = extract_prompt_metadata(chat_instance)

  generation = trace.create_generation(
    name: "llm_call",
    metadata: @context.merge(kwargs.slice(:temperature, :max_tokens)),
    model: model_id,
    model_parameters: extract_model_parameters(chat_instance),
    **prompt_metadata
  )

  messages_snapshot = capture_messages(chat_instance)
  generation.set_input(user_message, messages: messages_snapshot)

  call_start_time = Time.current
  result = @original_ask_method.call(*args, **kwargs, &block)

  finalize_generation(generation, result, call_start_time)

  if is_ephemeral_trace
    link_trace_to_message(trace, chat_instance, call_start_time)
    trace.finalize(output: result.content)
    @current_trace = nil
  end

  result
rescue StandardError => e
  handle_error(e, trace, generation)

  # An ephemeral trace belongs to this call only. Without this cleanup a
  # failed call would leave it in @current_trace, and every later `ask`
  # would attach to the stale, never-finalized trace.
  if is_ephemeral_trace && trace
    begin
      trace.finalize(output: nil)
    rescue StandardError => finalize_error
      Rails.logger.warn "[Observability] Failed to finalize trace after error: #{finalize_error.message}"
    end
    @current_trace = nil
  end

  raise
end
103
+
104
# Registers every chat event hook used by the instrumenter, in order:
# tool calls, tool results, then message start/end.
def setup_event_handlers
  setup_tool_call_handler
  setup_tool_result_handler
  setup_message_handlers
end
109
+
110
# Subscribes to the chat's tool-call event and forwards each tool call to
# #handle_tool_call on this instrumenter.
def setup_tool_call_handler
  # Bound method object, so the callback does not depend on what `self` is
  # when the chat invokes the block.
  on_tool_call = method(:handle_tool_call)

  chat.on_tool_call { |tool_call| on_tool_call.call(tool_call) }
end
117
+
118
# Subscribes to the chat's tool-result event and forwards each result to
# #handle_tool_result on this instrumenter.
def setup_tool_result_handler
  # Bound method object, so the callback does not depend on what `self` is
  # when the chat invokes the block.
  on_tool_result = method(:handle_tool_result)

  chat.on_tool_result { |tool_result| on_tool_result.call(tool_result) }
end
125
+
126
# Registers debug-level logging for the chat's message lifecycle events.
# The callbacks only log; they do not touch traces or spans.
# (The unused `instrumenter = self` capture was removed.)
def setup_message_handlers
  chat.on_new_message do
    Rails.logger.debug "[Observability] New message started"
  end

  chat.on_end_message do |message|
    Rails.logger.debug "[Observability] Message completed: #{message.role}"
  end
end
137
+
138
# Opens a span named "tool:<name>" on the current trace for a tool call and
# stores it in @current_tool_span. Ignored when no trace is active.
def handle_tool_call(tool_call)
  active_trace = @current_trace
  return unless active_trace

  tool_name = tool_call.name
  span_metadata = {
    tool_name: tool_name,
    tool_call_id: tool_call.id,
    level: "INFO"
  }

  @current_tool_span = active_trace.create_span(
    name: "tool:#{tool_name}",
    metadata: span_metadata,
    input: format_tool_arguments(tool_call.arguments)
  )

  Rails.logger.info "[Observability] Tool call started: #{tool_name}"
end
153
+
154
# Closes the span opened by #handle_tool_call with the formatted tool result
# and clears @current_tool_span. Ignored when no trace or no tool span is active.
def handle_tool_result(result)
  open_span = @current_tool_span
  return unless @current_trace && open_span

  open_span.finalize(output: format_tool_result(result))

  Rails.logger.info "[Observability] Tool call completed: #{open_span.name}"
  @current_tool_span = nil
end
164
+
165
# Finalizes a generation with everything that can be extracted from the
# LLM result: output, token usage, cost, finish reason, provider metadata
# and a trimmed raw response.
#
# This is best-effort: if the detailed finalize fails, a minimal finalize
# (output + input/output token counts) is attempted. If that fails too, the
# failure is logged and swallowed so that observability never breaks the
# chat call itself.
#
# generation      - generation object created on the trace
# result          - value returned by the original `ask`
# call_start_time - time captured just before the LLM call; passed through
#                   as completion_start_time
def finalize_generation(generation, result, call_start_time)
  usage = extract_usage(result)
  provider_metadata = extract_provider_metadata(result)
  finish_reason = extract_finish_reason(result)
  cost = calculate_cost(result)
  raw_response = extract_raw_response(result)

  generation.finalize(
    output: result.content,
    usage: usage,
    cost_usd: cost,
    finish_reason: finish_reason,
    completion_start_time: call_start_time,
    provider_metadata: provider_metadata,
    raw_response: raw_response
  )
rescue StandardError => e
  Rails.logger.error "[Observability] Failed to finalize generation: #{e.message}"

  begin
    generation.finalize(
      output: result.content,
      usage: { input_tokens: result.input_tokens || 0, output_tokens: result.output_tokens || 0 }
    )
  rescue StandardError => fallback_error
    # Previously hidden by a `rescue nil` modifier; now at least visible in logs.
    Rails.logger.error "[Observability] Fallback finalize failed: #{fallback_error.message}"
    nil
  end
end
188
+
189
# Records an exception on the trace as an "error" span and marks the
# generation as failed. Does nothing when there is no trace.
#
# This runs inside the caller's `rescue`, so it must never raise: a failure
# while recording the error would replace the original exception. Both
# recording steps are therefore guarded and only logged on failure.
#
# error      - the exception being handled
# trace      - trace to attach the error span to (may be nil)
# generation - generation to mark as failed (may be nil)
def handle_error(error, trace, generation)
  return unless trace

  begin
    error_span = trace.create_span(
      name: "error",
      metadata: {
        error_type: error.class.name,
        level: "ERROR"
      },
      input: {
        error_message: error.message,
        backtrace: error.backtrace&.first(10)
      }.to_json
    )
    error_span.finalize(output: { error_captured: true }.to_json)
  rescue StandardError => span_error
    Rails.logger.error "[Observability] Failed to record error span: #{span_error.message}"
  end

  begin
    generation&.update(status_message: "FAILED", finish_reason: "error")
  rescue StandardError => update_error
    Rails.logger.error "[Observability] Failed to mark generation as failed: #{update_error.message}"
  end

  Rails.logger.error "[Observability] Error captured: #{error.class.name} - #{error.message}"
end
209
+
210
# Returns prompt metadata for the generation, taken from the agent class in
# the instrumenter context (@context[:agent_class]) when that class responds
# to `prompt_metadata`.
#
# The caller double-splats the result into create_generation, so this always
# returns a Hash with Symbol keys: nil or non-hash metadata becomes {}, and
# String keys are symbolized.
#
# chat_instance - accepted for interface compatibility; not read here
#
# Returns a Hash (possibly empty). Never raises; failures are logged at debug.
def extract_prompt_metadata(chat_instance)
  # Try to get the agent class from context
  agent_class = @context[:agent_class]
  return {} unless agent_class.respond_to?(:prompt_metadata)

  metadata = agent_class.prompt_metadata
  Rails.logger.debug "[Observability] Extracted prompt metadata: #{metadata.inspect}"

  return {} unless metadata.respond_to?(:to_h)

  metadata.to_h.transform_keys(&:to_sym)
rescue StandardError => e
  Rails.logger.debug "[Observability] Could not extract prompt metadata: #{e.message}"
  {}
end
226
+
227
# Resolves a model identifier for the chat.
#
# Uses chat_instance.model and, in order of preference, its `model_id`,
# its `id`, or its string form. Returns "unknown" when the chat exposes no
# `model` or when `model` is nil (previously a nil model produced "").
def extract_model_id(chat_instance)
  return "unknown" unless chat_instance.respond_to?(:model)

  model = chat_instance.model
  return "unknown" if model.nil?

  if model.respond_to?(:model_id)
    model.model_id
  elsif model.respond_to?(:id)
    model.id
  else
    model.to_s
  end
end
241
+
242
# Extracts the model parameters configured on the chat.
#
# The parameters are read from the object held in the chat's @chat instance
# variable, through its `params` reader or, failing that, its @params
# instance variable. Only a fixed whitelist of keys is kept and nil values
# are dropped.
#
# Numeric parameters given as numeric-looking Strings are converted to
# Integer/Float. `stop` and `response_format` are passed through untouched:
# a stop sequence such as "42" is text, not a number.
#
# Returns a Hash (possibly empty). Never raises; failures are logged at debug.
def extract_model_parameters(chat_instance)
  # Ensure agent is configured (sets params if not already set)
  chat_instance.ensure_agent_configured if chat_instance.respond_to?(:ensure_agent_configured)

  # Access the internal chat object
  llm_chat = chat_instance.instance_variable_get(:@chat)
  return {} unless llm_chat

  params = if llm_chat.respond_to?(:params)
    llm_chat.params
  elsif llm_chat.instance_variable_defined?(:@params)
    llm_chat.instance_variable_get(:@params)
  else
    {}
  end

  params ||= {}

  # Only include relevant model parameters
  extracted = params.slice(
    :temperature,
    :max_tokens,
    :top_p,
    :frequency_penalty,
    :presence_penalty,
    :stop,
    :response_format,
    :seed
  ).compact

  # Keys whose values are numbers; only these are coerced from Strings.
  numeric_keys = %i[temperature max_tokens top_p frequency_penalty presence_penalty seed]

  extracted.each_with_object({}) do |(key, value), coerced|
    coerced[key] =
      if numeric_keys.include?(key) && value.is_a?(String) && value.match?(/\A-?\d+\.?\d*\z/)
        value.include?(".") ? value.to_f : value.to_i
      else
        value
      end
  end
rescue StandardError => e
  Rails.logger.debug "[Observability] Could not extract model parameters: #{e.message}"
  {}
end
297
+
298
# Takes a snapshot of the chat's messages as an Array of
# { role:, content: } Hashes, with content passed through #truncate_content.
# Returns [] when the chat has no mappable `messages` or when anything fails
# (failures are logged at warn).
def capture_messages(chat_instance)
  mappable = chat_instance.respond_to?(:messages) && chat_instance.messages.respond_to?(:map)
  return [] unless mappable

  chat_instance.messages.map do |message|
    { role: message.role.to_s, content: truncate_content(message.content) }
  end
rescue StandardError => e
  Rails.logger.warn "[Observability] Failed to capture messages: #{e.message}"
  []
end
312
+
313
# Builds the token-usage Hash for a result.
#
# Always contains :input_tokens, :output_tokens and :total_tokens (nil counts
# are treated as 0). When the raw response body is a Hash with a "usage"
# entry, positive "cached_tokens" and "reasoning_tokens" from its
# "*_tokens_details" sub-hashes are added as :cached_input_tokens and
# :reasoning_tokens.
def extract_usage(result)
  prompt_count = result.input_tokens || 0
  completion_count = result.output_tokens || 0
  usage = {
    input_tokens: prompt_count,
    output_tokens: completion_count,
    total_tokens: prompt_count + completion_count
  }

  return usage unless result.respond_to?(:raw) && result.raw.respond_to?(:body)

  raw_body = result.raw.body
  return usage unless raw_body.is_a?(Hash) && raw_body["usage"]

  raw_usage = raw_body["usage"]

  prompt_details = raw_usage["prompt_tokens_details"]
  if prompt_details
    cached = prompt_details["cached_tokens"]
    usage[:cached_input_tokens] = cached if cached && cached > 0
  end

  completion_details = raw_usage["completion_tokens_details"]
  if completion_details
    reasoning = completion_details["reasoning_tokens"]
    usage[:reasoning_tokens] = reasoning if reasoning && reasoning > 0
  end

  usage
end
340
+
341
# Collects provider-level metadata from the raw response.
#
# From a Hash body: "id", "system_fingerprint" and "model". From headers:
# the request id plus processing time and remaining rate limits (converted
# with to_i). The result's own model_id is added when available. Absent
# values are left out; returns {} when there is no raw response.
def extract_provider_metadata(result)
  raw = result.respond_to?(:raw) ? result.raw : nil
  return {} unless raw

  metadata = {}

  if raw.respond_to?(:body) && raw.body.is_a?(Hash)
    body = raw.body
    { request_id: "id", system_fingerprint: "system_fingerprint", model_version: "model" }.each do |key, field|
      metadata[key] = body[field] if body[field]
    end
  end

  if raw.respond_to?(:headers) && raw.headers
    headers = raw.headers
    metadata[:x_request_id] = headers["x-request-id"] if headers["x-request-id"]

    integer_headers = {
      processing_ms: "openai-processing-ms",
      ratelimit_remaining_requests: "x-ratelimit-remaining-requests",
      ratelimit_remaining_tokens: "x-ratelimit-remaining-tokens"
    }
    integer_headers.each do |key, header_name|
      metadata[key] = headers[header_name].to_i if headers[header_name]
    end
  end

  metadata[:model_id] = result.model_id if result.respond_to?(:model_id)

  metadata.compact
end
367
+
368
# Reads choices[0].finish_reason from the raw response body.
# Returns nil when there is no raw response or its body is not a Hash.
def extract_finish_reason(result)
  raw = result.raw if result.respond_to?(:raw)
  return nil unless raw

  body = raw.body if raw.respond_to?(:body)
  return nil unless body.is_a?(Hash)

  body.dig("choices", 0, "finish_reason")
end
375
+
376
# Estimates the USD cost of a result from its token counts and the
# per-million-token prices of the model found via RubyLLM.models.find.
#
# Returns 0.0 when the result has no model_id, when the model has no input
# price, or when anything raises (logged at warn). A missing output price is
# treated as 0, so the input cost is still reported instead of the whole
# calculation failing and falling back to 0.0.
#
# Returns a Float rounded to 6 decimals.
def calculate_cost(result)
  return 0.0 unless result.respond_to?(:model_id) && result.model_id

  model_info = RubyLLM.models.find(result.model_id)
  input_price = model_info&.input_price_per_million
  return 0.0 unless input_price

  output_price = model_info.output_price_per_million || 0

  input_tokens = result.input_tokens || 0
  output_tokens = result.output_tokens || 0

  input_cost = input_tokens * input_price / 1_000_000.0
  output_cost = output_tokens * output_price / 1_000_000.0

  (input_cost + output_cost).round(6)
rescue StandardError => e
  Rails.logger.warn "[Observability] Failed to calculate cost: #{e.message}"
  0.0
end
393
+
394
# Builds a trimmed copy of the raw response: :status, :body and :headers,
# each only when the raw object exposes it.
#
# A Hash body goes through #truncate_large_hash. A String body is parsed as
# JSON and truncated the same way; when it is not valid JSON, its first
# characters (range 0..1000) are kept instead. Headers are reduced by
# #extract_relevant_headers.
#
# Returns nil when there is no raw response or nothing could be collected.
def extract_raw_response(result)
  return nil unless result.respond_to?(:raw) && result.raw

  raw = result.raw
  raw_data = {}

  raw_data[:status] = raw.status if raw.respond_to?(:status)

  if raw.respond_to?(:body)
    body = raw.body

    case body
    when Hash
      raw_data[:body] = truncate_large_hash(body)
    when String
      raw_data[:body] = begin
        truncate_large_hash(JSON.parse(body))
      rescue JSON::ParserError
        body[0..1000]
      end
    end
  end

  raw_data[:headers] = extract_relevant_headers(raw.headers) if raw.respond_to?(:headers)

  raw_data.empty? ? nil : raw_data
end
419
+
420
# Picks a fixed set of response headers worth storing.
#
# Each header is first looked up by its lowercase name. When that finds
# nothing and the headers object can be enumerated with `each_pair`, the
# lookup is retried case-insensitively, so a plain Hash keyed like
# "X-Request-Id" is matched too. (The previous fallback downcased names
# that were already lowercase, so it could never match anything new.)
#
# headers - header collection supporting `[]` (nil allowed)
#
# Returns a Hash keyed by the lowercase header names; {} for nil headers.
def extract_relevant_headers(headers)
  return {} unless headers

  interesting_headers = %w[
    x-request-id
    openai-processing-ms
    x-ratelimit-remaining-requests
    x-ratelimit-remaining-tokens
    x-ratelimit-limit-requests
    x-ratelimit-limit-tokens
    openai-organization
    openai-version
    content-type
  ]

  # Built lazily: only needed when a direct lookup misses.
  lowercase_index = nil

  interesting_headers.each_with_object({}) do |header, relevant|
    value = headers[header]

    unless value
      lowercase_index ||=
        if headers.respond_to?(:each_pair)
          headers.each_pair.map { |name, header_value| [name.to_s.downcase, header_value] }.to_h
        else
          {}
        end
      value = lowercase_index[header]
    end

    relevant[header] = value if value
  end
end
443
+
444
# Builds the trace input Hash for a user message.
#
# Always has :text. When attachments are given, :attachments lists one entry
# per attachment: { path: } for Strings, { type: <class name> } otherwise.
def format_input(message, attachments)
  input = { text: message }
  return input unless attachments

  input[:attachments] = Array(attachments).map do |attachment|
    attachment.is_a?(String) ? { path: attachment } : { type: attachment.class.name }
  end

  input
end
460
+
461
# Prepares tool-call arguments for storage as span input.
# A Hash with fewer than 100 entries is returned as-is; anything else is
# serialized with to_json, falling back to to_s when serialization fails.
def format_tool_arguments(arguments)
  small_hash = arguments.is_a?(Hash) && arguments.size < 100
  small_hash ? arguments : arguments.to_json
rescue StandardError
  arguments.to_s
end
468
+
469
# Prepares a tool result for storage as span output.
#
# Hash             -> #truncate_large_hash
# String           -> #truncate_content
# RubyLLM::Content -> { text:, has_attachments: }
# anything else    -> its string form, cut with the range 0..5000
def format_tool_result(result)
  if result.is_a?(Hash)
    truncate_large_hash(result)
  elsif result.is_a?(String)
    truncate_content(result)
  elsif result.is_a?(RubyLLM::Content)
    {
      text: truncate_content(result.text),
      has_attachments: result.attachments.present?
    }
  else
    result.to_s[0..5000]
  end
end
484
+
485
# Limits content to max_length characters.
#
# nil stays nil. Content whose text is within the limit is returned
# unchanged (same object). Longer content is cut and suffixed with a marker
# stating the original length.
#
# Non-String content is measured and cut by its to_s form instead of being
# asked for #length directly. Message content that is not a String (and has
# no #length) therefore no longer raises, which in #capture_messages used to
# discard the whole message snapshot.
#
# content    - String, nil, or any object responding to to_s
# max_length - maximum number of characters kept (default 10_000)
def truncate_content(content, max_length: 10_000)
  return nil if content.nil?

  text = content.is_a?(String) ? content : content.to_s
  return content if text.length <= max_length

  "#{text[0...max_length]}... [truncated, original length: #{text.length}]"
end
491
+
492
+ def truncate_large_hash(hash)
493
+ hash.transform_values do |value|
494
+ if value.is_a?(String) && value.length > 10_000
495
+ truncate_content(value)
496
+ elsif value.is_a?(Hash)
497
+ truncate_large_hash(value)
498
+ elsif value.is_a?(Array) && value.size > 100
499
+ value[0..99] + [ "... #{value.size - 100} more items" ]
500
+ else
501
+ value
502
+ end
503
+ end
504
+ end
505
+
506
# Links the trace to the newest assistant message created at or after
# call_start_time by storing that message's id on the trace.
# Skipped when the chat has no `messages`; failures are logged at warn and
# never raised.
def link_trace_to_message(trace, chat_instance, call_start_time)
  return unless chat_instance.respond_to?(:messages)

  recent_assistant_messages = chat_instance.messages
    .where(role: "assistant")
    .where("created_at >= ?", call_start_time)
    .order(created_at: :desc)
  assistant_message = recent_assistant_messages.first

  if assistant_message
    trace.update(message_id: assistant_message.id)
    Rails.logger.info "[Observability] Linked trace #{trace.trace_id} to message #{assistant_message.id}"
  end
rescue StandardError => e
  Rails.logger.warn "[Observability] Failed to link trace to message: #{e.message}"
end
522
+ end
523
+ end
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Observ
4
+ # Service responsible for executing dataset evaluations
5
+ #
6
+ # This service runs an agent against all items in a dataset run,
7
+ # creating traces for each execution and tracking results.
8
+ #
9
+ # Usage:
10
+ # run = DatasetRun.find(1)
11
+ # DatasetRunnerService.new(run).call
12
+ #
13
+ # The service:
14
+ # - Updates run status to :running at start
15
+ # - Processes each dataset item through the AgentExecutorService
16
+ # - Creates a session and trace for each item execution
17
+ # - Records errors for failed items
18
+ # - Updates metrics after completion
19
+ # - Sets final status to :completed or :failed
20
+ #
21
+ class DatasetRunnerService
22
+ attr_reader :dataset_run, :dataset
23
+
24
# dataset_run - the run to execute; its dataset is read once and kept
def initialize(dataset_run)
  @dataset_run = dataset_run
  @dataset = @dataset_run.dataset
end
28
+
29
# Executes the run: marks it :running, processes every item, refreshes the
# run metrics and sets the final status.
# Any StandardError is recorded on the run via #handle_run_failure and then
# re-raised.
def call
  begin
    dataset_run.update!(status: :running)

    process_all_items

    dataset_run.update_metrics!
    determine_final_status
  rescue StandardError => run_error
    handle_run_failure(run_error)
    raise
  end
end
40
+
41
+ private
42
+
43
# Runs #process_item for every run item, iterating with find_each and with
# each item's dataset_item included in the query.
def process_all_items
  run_items = dataset_run.run_items.includes(:dataset_item)
  run_items.find_each { |run_item| process_item(run_item) }
end
48
+
49
# Executes the agent for a single run item inside its own session and trace.
# A StandardError raised by the agent (or while recording success) is
# recorded on the item via #finalize_failed_item rather than raised.
# Errors while creating the session or trace are not rescued here.
def process_item(run_item)
  session = create_session_for_item(run_item)
  trace = create_trace_for_item(session, run_item)

  begin
    agent_result = execute_agent(run_item.dataset_item.input, session)
    finalize_successful_item(run_item, trace, agent_result)
  rescue StandardError => item_error
    finalize_failed_item(run_item, trace, item_error)
  end
end
60
+
61
# Creates the observability session for one run item. The session's user_id
# identifies the run, and its metadata identifies dataset, run and item.
def create_session_for_item(run_item)
  session_metadata = {
    dataset_id: dataset.id,
    dataset_run_id: dataset_run.id,
    dataset_item_id: run_item.dataset_item_id,
    source: "dataset_evaluation"
  }

  Observ::Session.create!(user_id: "dataset_run_#{dataset_run.id}", metadata: session_metadata)
end
72
+
73
# Creates the "dataset_evaluation" trace for one run item on the given
# session, using the dataset item's input and tagging the trace with the
# dataset and run names.
def create_trace_for_item(session, run_item)
  trace_metadata = {
    dataset_id: dataset.id,
    dataset_name: dataset.name,
    dataset_run_id: dataset_run.id,
    dataset_run_name: dataset_run.name,
    dataset_item_id: run_item.dataset_item_id,
    agent_class: dataset.agent_class
  }
  trace_tags = [ "dataset_evaluation", dataset.name, dataset_run.name ]

  session.create_trace(
    name: "dataset_evaluation",
    input: run_item.dataset_item.input,
    metadata: trace_metadata,
    tags: trace_tags
  )
end
88
+
89
# Runs the dataset's agent on the input through AgentExecutorService, bound
# to the given observability session, and returns the executor's result.
def execute_agent(input, session)
  run_context = { dataset_id: dataset.id, dataset_run_id: dataset_run.id }

  AgentExecutorService
    .new(dataset.agent, observability_session: session, context: run_context)
    .call(input)
end
100
+
101
# Records a successful execution: finalizes the trace with the extracted
# output, then attaches the trace to the run item and clears its error.
def finalize_successful_item(run_item, trace, result)
  trace.finalize(output: extract_output(result))
  run_item.update!(trace: trace, error: nil)
end
110
+
111
# Records a failed execution: finalizes the trace without output but with
# the error details in its metadata, then stores the trace and an
# "ErrorClass: message" string on the run item.
def finalize_failed_item(run_item, trace, error)
  error_class = error.class.name
  error_message = error.message

  trace.finalize(output: nil, metadata: { error: error_message, error_class: error_class })
  run_item.update!(trace: trace, error: "#{error_class}: #{error_message}")
end
122
+
123
# Normalizes an agent result for storage as trace output.
# Strings and Hashes are returned unchanged; other objects are converted
# with to_h when they support it, otherwise with to_s.
def extract_output(result)
  return result if result.is_a?(String) || result.is_a?(Hash)

  result.respond_to?(:to_h) ? result.to_h : result.to_s
end
133
+
134
# Sets the run's final status: :failed when the number of failed items
# equals the total number of items, :completed otherwise.
def determine_final_status
  every_item_failed = dataset_run.failed_items == dataset_run.total_items
  dataset_run.update!(status: every_item_failed ? :failed : :completed)
end
141
+
142
# Marks the run as :failed and records the error in its metadata
# (:error, :error_class and :failed_at as an ISO 8601 timestamp), keeping
# any metadata already present.
#
# Runs inside #call's `rescue`, so it tolerates a run whose metadata is nil:
# calling `merge` on nil would raise NoMethodError and replace the original
# error that is about to be re-raised.
def handle_run_failure(error)
  existing_metadata = dataset_run.metadata || {}

  dataset_run.update!(
    status: :failed,
    metadata: existing_metadata.merge(
      error: error.message,
      error_class: error.class.name,
      failed_at: Time.current.iso8601
    )
  )
end
152
+ end
153
+ end