rails_error_dashboard 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +101 -6
- data/app/controllers/rails_error_dashboard/application_controller.rb +8 -2
- data/app/controllers/rails_error_dashboard/errors_controller.rb +66 -14
- data/app/helpers/rails_error_dashboard/application_helper.rb +42 -10
- data/app/views/layouts/rails_error_dashboard.html.erb +307 -0
- data/app/views/rails_error_dashboard/errors/_ai_help_panel.html.erb +36 -0
- data/app/views/rails_error_dashboard/errors/_breadcrumbs_group.html.erb +64 -5
- data/app/views/rails_error_dashboard/errors/_error_row.html.erb +2 -2
- data/app/views/rails_error_dashboard/errors/_instance_variables.html.erb +1 -0
- data/app/views/rails_error_dashboard/errors/_llm_summary.html.erb +97 -0
- data/app/views/rails_error_dashboard/errors/_local_variables.html.erb +1 -0
- data/app/views/rails_error_dashboard/errors/_modals.html.erb +1 -1
- data/app/views/rails_error_dashboard/errors/_show_scripts.html.erb +21 -20
- data/app/views/rails_error_dashboard/errors/_sidebar_metadata.html.erb +33 -19
- data/app/views/rails_error_dashboard/errors/_timeline.html.erb +1 -2
- data/app/views/rails_error_dashboard/errors/analytics.html.erb +5 -1
- data/app/views/rails_error_dashboard/errors/correlation.html.erb +16 -4
- data/app/views/rails_error_dashboard/errors/database_health_summary.html.erb +7 -3
- data/app/views/rails_error_dashboard/errors/diagnostic_dumps.html.erb +1 -1
- data/app/views/rails_error_dashboard/errors/index.html.erb +7 -1
- data/app/views/rails_error_dashboard/errors/overview.html.erb +2 -2
- data/app/views/rails_error_dashboard/errors/platform_comparison.html.erb +3 -1
- data/app/views/rails_error_dashboard/errors/releases.html.erb +3 -1
- data/app/views/rails_error_dashboard/errors/settings.html.erb +0 -1
- data/app/views/rails_error_dashboard/errors/show.html.erb +12 -2
- data/app/views/rails_error_dashboard/errors/swallowed_exceptions.html.erb +5 -1
- data/app/views/rails_error_dashboard/errors/user_impact.html.erb +3 -1
- data/config/routes.rb +1 -0
- data/lib/generators/rails_error_dashboard/install/templates/initializer.rb +27 -0
- data/lib/rails_error_dashboard/configuration.rb +101 -1
- data/lib/rails_error_dashboard/engine.rb +14 -0
- data/lib/rails_error_dashboard/integrations/llm_middleware.rb +276 -0
- data/lib/rails_error_dashboard/integrations/llm_span_processor.rb +181 -0
- data/lib/rails_error_dashboard/integrations/o_tel.rb +45 -0
- data/lib/rails_error_dashboard/queries/analytics_stats.rb +4 -1
- data/lib/rails_error_dashboard/queries/error_correlation.rb +17 -13
- data/lib/rails_error_dashboard/queries/errors_list.rb +14 -0
- data/lib/rails_error_dashboard/services/cascade_detector.rb +28 -18
- data/lib/rails_error_dashboard/services/llm_client.rb +368 -0
- data/lib/rails_error_dashboard/services/llm_cost_estimator.rb +85 -0
- data/lib/rails_error_dashboard/services/llm_summary.rb +91 -0
- data/lib/rails_error_dashboard/services/markdown_error_formatter.rb +87 -0
- data/lib/rails_error_dashboard/subscribers/llm_call_subscriber.rb +150 -0
- data/lib/rails_error_dashboard/value_objects/llm_call_event.rb +92 -0
- data/lib/rails_error_dashboard/version.rb +1 -1
- data/lib/rails_error_dashboard.rb +8 -0
- metadata +13 -3
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RailsErrorDashboard
  module Integrations
    # Faraday middleware that captures LLM calls to OpenAI and Anthropic APIs
    # as breadcrumbs. The Tier 2 path — for hosts using `ruby-openai` or
    # `anthropic-sdk-ruby` directly without OpenTelemetry instrumentation.
    #
    # Install in the host app:
    #
    #   # Anthropic SDK
    #   Anthropic::Client.new do |f|
    #     f.use RailsErrorDashboard::Integrations::LlmMiddleware
    #   end
    #
    #   # ruby-openai
    #   OpenAI::Client.new do |f|
    #     f.use RailsErrorDashboard::Integrations::LlmMiddleware
    #   end
    #
    # IMPORTANT — does NOT subclass ::Faraday::Middleware. Doing so would
    # NameError at file-load time on hosts without Faraday. Faraday accepts
    # any object that responds to `#call(env)` and is initialized with `app`.
    # Hosts that don't use OpenAI/Anthropic SDKs simply won't reference this
    # class and never load the constant.
    #
    # HOST APP SAFETY:
    # - Wraps the upstream call in rescue, but ALWAYS re-raises (we are in
    #   the host's request path — swallowing would break their app logic)
    # - Our own bookkeeping (response parsing, breadcrumb emission) is wrapped
    #   separately in rescue StandardError and only logged at debug level
    # - Provider detection never raises; an unreadable URL is treated as
    #   "not an LLM call"
    # - No work happens unless enable_llm_observability AND enable_breadcrumbs
    # - Non-LLM URLs (anything but api.openai.com / api.anthropic.com) skip
    #   straight through with one host-string comparison
    # - Streaming responses (SSE) skipped — token counts only available in
    #   the final stream event, which we'd need to buffer to read
    class LlmMiddleware
      OPENAI_HOSTS = [ "api.openai.com" ].freeze
      ANTHROPIC_HOSTS = [ "api.anthropic.com" ].freeze

      # Matches exception class names that denote a timeout, e.g.
      # "Faraday::TimeoutError", "Net::ReadTimeout", "Errno::ETIMEDOUT".
      TIMEOUT_CLASS_PATTERN = /timeout|timedout/i.freeze

      # @param app [#call] the next middleware/adapter in the Faraday stack
      def initialize(app)
        @app = app
      end

      # Runs the wrapped request and records one breadcrumb describing it.
      #
      # @param env [#url, #body] the Faraday request environment
      # @return [Object] whatever the wrapped app returned, untouched
      # @raise [StandardError] anything the wrapped app raised (always re-raised)
      def call(env)
        return @app.call(env) unless RailsErrorDashboard.configuration.enable_llm_observability
        return @app.call(env) unless RailsErrorDashboard.configuration.enable_breadcrumbs

        provider = detect_provider(env)
        return @app.call(env) unless provider

        request_body = safe_parse_body(env.body)
        model = request_model(request_body)
        started_at = monotonic_ms

        response = nil
        upstream_error = nil
        begin
          response = @app.call(env)
        rescue StandardError => e
          upstream_error = e
          raise
        ensure
          # Record the breadcrumb whether the call succeeded, returned an HTTP
          # error, or raised mid-flight. NEVER raise from this block — the
          # host's app.call has either returned or is propagating an exception
          # via `raise` above, and we must not interfere with either path.
          begin
            duration_ms = (monotonic_ms - started_at).round(2)
            emit_breadcrumb(provider, model, request_body, response, upstream_error, duration_ms)
          rescue StandardError => emit_error
            RailsErrorDashboard::Logger.debug("[RailsErrorDashboard] LlmMiddleware.emit failed: #{emit_error.message}")
          end
        end

        response
      end

      # Forwards connection teardown to the wrapped app when it supports it.
      # NOTE(review): Faraday's own middleware base class forwards #close down
      # the stack; presumably Connection#close reaches this object when it is
      # the outermost middleware — confirm against the Faraday version in use.
      #
      # @return [Object, nil] the wrapped app's result, or nil when unsupported
      def close
        @app.close if @app.respond_to?(:close)
      end

      private

      # Maps the request host onto a provider name. Comparison is
      # case-insensitive because URI does not normalize host casing on parse.
      #
      # @return [String, nil] "openai", "anthropic", or nil for any other host
      def detect_provider(env)
        url = env.url
        host = url.respond_to?(:host) ? url.host : nil
        return nil unless host

        host = host.to_s.downcase
        return "openai" if OPENAI_HOSTS.include?(host)
        return "anthropic" if ANTHROPIC_HOSTS.include?(host)
        nil
      rescue StandardError
        # We run before the host's request is sent; failing to identify the
        # provider must degrade to a plain pass-through, never an exception.
        nil
      end

      # Reads the model name from a parsed request body. Accepts both string
      # and symbol keys: when this middleware sits in front of a JSON-encoding
      # middleware the body is still the caller's Hash, typically symbol-keyed.
      #
      # @return [String, nil]
      def request_model(request_body)
        return nil unless request_body.is_a?(Hash)
        request_body["model"] || request_body[:model]
      end

      # Best-effort JSON decoding shared by request and response handling.
      #
      # @param body [Hash, String, nil, Object]
      # @return [Object] the Hash itself, the decoded JSON value, or {} when
      #   the body is nil, empty, or not valid JSON
      def safe_parse_body(body)
        return body if body.is_a?(Hash)
        return {} if body.nil? || (body.respond_to?(:empty?) && body.empty?)
        JSON.parse(body.to_s)
      rescue StandardError
        {}
      end

      # @return [Boolean] true when the response declares an SSE content type
      def streaming_response?(response)
        return false unless response

        headers = response.respond_to?(:headers) ? response.headers : nil
        return false unless headers.respond_to?(:[])

        # Plain-Hash headers are case-sensitive, so try the canonical casing too.
        content_type = headers["content-type"] || headers["Content-Type"]
        content_type.is_a?(String) && content_type.include?("text/event-stream")
      end

      # Chooses which kind of event to record for the finished call.
      def emit_breadcrumb(provider, model, request_body, response, upstream_error, duration_ms)
        if upstream_error
          add_crumb(build_error_event(provider, model, upstream_error, duration_ms))
          return
        end

        # No response and no captured error means app.call was unwound by
        # something `rescue StandardError` does not see (a `throw`, or an
        # Exception outside StandardError). The outcome is unknown, so record
        # nothing rather than a misleading :success event.
        return if response.nil?

        # Streaming — token counts aren't available without buffering the
        # stream, which would defeat the SDK's streaming behavior. Skip for
        # v0.7.0; a future release can add an SSE parser if demand warrants.
        if streaming_response?(response)
          RailsErrorDashboard::Logger.debug("[RailsErrorDashboard] LlmMiddleware skipping streaming response (#{provider})")
          return
        end

        status = response.respond_to?(:status) ? response.status : nil
        body = parse_response_body(response)

        if status && status >= 400
          add_crumb(build_http_error_event(provider, model, status, body, duration_ms))
          return
        end

        add_crumb(build_success_event(provider, model, request_body, body, duration_ms))
      end

      # @return [Object] decoded response body, or {} when unreadable
      def parse_response_body(response)
        safe_parse_body(response.respond_to?(:body) ? response.body : nil)
      rescue StandardError
        {}
      end

      # Builds the event for a completed, non-error response. The model
      # reported by the response wins over the one named in the request.
      def build_success_event(provider, request_model, request_body, response_body, duration_ms)
        response_model = response_body.is_a?(Hash) ? response_body["model"] : nil
        model = response_model || request_model

        input_tokens, output_tokens = extract_tokens(provider, response_body)
        tool_calls_requested = extract_tool_calls(provider, response_body)

        cost = Services::LlmCostEstimator.estimate(
          provider: provider,
          model: model,
          input_tokens: input_tokens,
          output_tokens: output_tokens
        )

        ValueObjects::LlmCallEvent.new(
          provider: provider,
          model: model || "unknown",
          status: :success,
          input_tokens: input_tokens,
          output_tokens: output_tokens,
          duration_ms: duration_ms,
          cost_usd_estimate: cost,
          tool_arguments: tool_calls_metadata(tool_calls_requested)
        )
      end

      # Builds the event for a response whose HTTP status is >= 400.
      def build_http_error_event(provider, model, status, body, duration_ms)
        ValueObjects::LlmCallEvent.new(
          provider: provider,
          model: model || "unknown",
          status: :error,
          duration_ms: duration_ms,
          error_class: "HTTP #{status}",
          error_message: extract_error_message(body)
        )
      end

      # Builds the event for a call that raised before producing a response.
      def build_error_event(provider, model, exception, duration_ms)
        ValueObjects::LlmCallEvent.new(
          provider: provider,
          model: model || "unknown",
          status: exception_status(exception),
          duration_ms: duration_ms,
          error_class: exception.class.name,
          error_message: exception.message
        )
      end

      # @return [Symbol] :timeout when the exception class name looks like a
      #   timeout, :error otherwise
      def exception_status(exception)
        exception.class.name.to_s.match?(TIMEOUT_CLASS_PATTERN) ? :timeout : :error
      end

      # @return [Array(Integer|nil, Integer|nil)] (input_tokens, output_tokens)
      def extract_tokens(provider, body)
        return [ nil, nil ] unless body.is_a?(Hash)
        usage = body["usage"]
        return [ nil, nil ] unless usage.is_a?(Hash)

        case provider
        when "openai"
          [ usage["prompt_tokens"], usage["completion_tokens"] ]
        when "anthropic"
          [ usage["input_tokens"], usage["output_tokens"] ]
        else
          [ nil, nil ]
        end
      end

      # Returns an Array of tool-call descriptors: [{ name: "...", id: "..." }, ...]
      # Empty when the model didn't request any tools, or when the body does
      # not have the expected shape (every level is type-checked so a
      # malformed payload yields [] instead of raising).
      def extract_tool_calls(provider, body)
        return [] unless body.is_a?(Hash)

        case provider
        when "openai"
          choices = body["choices"]
          return [] unless choices.is_a?(Array)

          first_choice = choices.first
          return [] unless first_choice.is_a?(Hash)

          message = first_choice["message"]
          return [] unless message.is_a?(Hash)

          tool_calls = message["tool_calls"]
          return [] unless tool_calls.is_a?(Array)

          tool_calls.filter_map do |tc|
            next unless tc.is_a?(Hash)
            function = tc["function"]
            name = function.is_a?(Hash) ? function["name"] : nil
            name ? { name: name, id: tc["id"] } : nil
          end
        when "anthropic"
          content = body["content"]
          return [] unless content.is_a?(Array)

          content.filter_map do |c|
            next unless c.is_a?(Hash) && c["type"] == "tool_use"
            { name: c["name"], id: c["id"] }
          end
        else
          []
        end
      end

      # Compact summary of tool calls — packed into the
      # `tool_arguments` field on LlmCallEvent so it lands in breadcrumb
      # metadata under `:tool_arguments`. (We reuse the existing slot rather
      # than adding a new field for v0.7.0; UI in 4.1 reads it back.)
      # Returns nil when no tools were requested so the field omits from JSON.
      #
      # Format: "tools:<count> <up to 3 names, comma-joined> +<n> more".
      # Segments with nothing to show are dropped, so no stray spaces appear
      # when tool names are missing.
      def tool_calls_metadata(tool_calls)
        return nil if tool_calls.nil? || tool_calls.empty?

        names = tool_calls.first(3).filter_map { |tc| tc[:name] }
        name_list = names.empty? ? nil : names.join(",")
        suffix = tool_calls.size > 3 ? "+#{tool_calls.size - 3} more" : nil
        [ "tools:#{tool_calls.size}", name_list, suffix ].compact.join(" ")
      end

      # @return [String, nil] the provider's error message, falling back to
      #   its error type
      def extract_error_message(body)
        return nil unless body.is_a?(Hash)
        # OpenAI:    { "error": { "message": "...", "type": "...", "code": "..." } }
        # Anthropic: { "error": { "type": "...", "message": "..." }, "type": "error" }
        err = body["error"]
        return nil unless err.is_a?(Hash)
        err["message"] || err["type"]
      end

      # Hands the event to the breadcrumb collector under "llm" or "llm_tool".
      def add_crumb(event)
        category = event.tool_call? ? "llm_tool" : "llm"
        Services::BreadcrumbCollector.add(
          category,
          event.to_breadcrumb_message,
          duration_ms: event.duration_ms,
          metadata: event.to_breadcrumb_metadata
        )
      end

      # @return [Float] monotonic clock reading in milliseconds
      def monotonic_ms
        Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1000.0
      end
    end
  end
end
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RailsErrorDashboard
  module Integrations
    # OpenTelemetry SpanProcessor that maps GenAI semantic-convention spans
    # into LLM breadcrumbs. Registered with `OpenTelemetry.tracer_provider`
    # when the host app already runs OTel (ruby_llm, thoughtbot/instrumentation,
    # etc. all emit GenAI spans automatically).
    #
    # IMPORTANT — does NOT subclass ::OpenTelemetry::SDK::Trace::SpanProcessor.
    # That would NameError at file-load time on hosts without the SDK. Ruby's
    # OTel SDK accepts any duck-typed processor — name + arity is the contract.
    #
    # Reads attribute keys per the GenAI semconv (current + deprecated aliases).
    # Spec: https://opentelemetry.io/docs/specs/semconv/gen-ai/
    #
    # HOST APP SAFETY:
    # - on_finish wraps the entire body in rescue StandardError and returns nil
    # - No work happens unless enable_llm_observability AND enable_breadcrumbs
    # - Non-GenAI spans return immediately (cheapest possible path)
    # - Never raises, never blocks the tracer pipeline
    class LlmSpanProcessor
      class << self
        # Idempotently register a single shared LlmSpanProcessor instance with
        # the host's OpenTelemetry tracer provider. Called from Engine
        # `after_initialize` when `enable_llm_observability` is on.
        #
        # Returns false (and does nothing) when:
        # - OTel SDK isn't loaded (`Integrations::OTel.available?` is false)
        # - `enable_llm_observability` is off
        # - The active tracer provider is the default `ProxyTracerProvider`
        #   (SDK loaded but `OpenTelemetry::SDK.configure` never called) —
        #   detected by absence of `add_span_processor`
        # - Already registered in this process (Spring reload safety)
        # - `add_span_processor` raises (host app safety — never crash boot)
        #
        # @return [Boolean] true if a processor was newly registered, false otherwise
        def register!
          return false if @registered
          return false unless RailsErrorDashboard.configuration.enable_llm_observability
          return false unless OTel.available?

          provider = ::OpenTelemetry.tracer_provider
          return false unless provider.respond_to?(:add_span_processor)

          provider.add_span_processor(new)
          @registered = true
          true
        rescue StandardError => e
          RailsErrorDashboard::Logger.debug("[RailsErrorDashboard] LlmSpanProcessor.register! failed: #{e.message}")
          false
        end

        # Test hook — clear the registered flag so re-registration is possible
        # in a fresh spec example. Does NOT remove the processor from the
        # tracer provider (OTel SDK offers no symmetric `remove_span_processor`).
        def reset!
          @registered = false
        end

        # @return [Boolean]
        def registered?
          @registered == true
        end
      end

      # Attribute keys — current GenAI semconv, with deprecated aliases.
      PROVIDER_KEYS     = [ "gen_ai.provider.name", "gen_ai.system" ].freeze
      MODEL_KEYS        = [ "gen_ai.response.model", "gen_ai.request.model" ].freeze
      INPUT_TOKEN_KEYS  = [ "gen_ai.usage.input_tokens", "gen_ai.usage.prompt_tokens" ].freeze
      OUTPUT_TOKEN_KEYS = [ "gen_ai.usage.output_tokens", "gen_ai.usage.completion_tokens" ].freeze
      TOOL_NAME_KEY     = "gen_ai.tool.name"
      OPERATION_KEY     = "gen_ai.operation.name"
      ERROR_TYPE_KEY    = "error.type"

      # Fallback for OpenTelemetry::Trace::Status::ERROR, used only when that
      # constant is not reachable, so this file still loads without OTel.
      # NOTE(review): value 2 is taken from the opentelemetry-api gem — confirm.
      STATUS_ERROR_CODE = 2

      # Required SpanProcessor interface — no-op. We only act when the span
      # is fully populated (attributes/timestamps/status), which is on_finish.
      def on_start(_span, _parent_context)
        nil
      end

      # Required SpanProcessor interface. Must never raise.
      #
      # Emits one breadcrumb for a finished GenAI span; does nothing for any
      # other span or when the feature flags are off.
      #
      # @param span [#attributes, #start_timestamp, #end_timestamp] finished span
      # @return [nil, Object] nil on skip/failure, else the collector's result
      def on_finish(span)
        return unless RailsErrorDashboard.configuration.enable_llm_observability
        return unless RailsErrorDashboard.configuration.enable_breadcrumbs

        attrs = safe_attributes(span)
        return if attrs.empty?
        return unless gen_ai_span?(attrs)

        event = build_event(span, attrs)
        category = event.tool_call? ? "llm_tool" : "llm"

        Services::BreadcrumbCollector.add(
          category,
          event.to_breadcrumb_message,
          duration_ms: event.duration_ms,
          metadata: event.to_breadcrumb_metadata
        )
      rescue StandardError => e
        RailsErrorDashboard::Logger.debug("[RailsErrorDashboard] LlmSpanProcessor.on_finish failed: #{e.message}")
        nil
      end

      # OTel SDK Export::SUCCESS == 0. Hardcoded so this file loads without OTel.
      def force_flush(timeout: nil)
        0
      end

      def shutdown(timeout: nil)
        0
      end

      private

      # @return [Hash] the span's attributes, or {} when unavailable
      def safe_attributes(span)
        attrs = span.attributes
        attrs.is_a?(Hash) ? attrs : {}
      rescue StandardError
        {}
      end

      # @return [Object, nil] the span's status object, or nil when unavailable
      def safe_status(span)
        span.respond_to?(:status) ? span.status : nil
      rescue StandardError
        nil
      end

      # Cheap pre-filter — only inspect spans that actually carry GenAI semconv.
      def gen_ai_span?(attrs)
        PROVIDER_KEYS.any? { |k| attrs.key?(k) } ||
          MODEL_KEYS.any? { |k| attrs.key?(k) } ||
          attrs.key?(OPERATION_KEY) ||
          attrs.key?(TOOL_NAME_KEY)
      end

      # Translates span attributes (plus the span's own status) into an
      # LlmCallEvent. A span counts as failed when it carries `error.type`
      # OR its status code is ERROR — instrumentation that only sets the span
      # status would otherwise be recorded as a success.
      def build_event(span, attrs)
        provider      = first_attr(attrs, PROVIDER_KEYS)
        model         = first_attr(attrs, MODEL_KEYS)
        input_tokens  = first_attr(attrs, INPUT_TOKEN_KEYS)
        output_tokens = first_attr(attrs, OUTPUT_TOKEN_KEYS)
        tool_name     = attrs[TOOL_NAME_KEY] || (attrs[OPERATION_KEY] == "execute_tool" ? attrs[OPERATION_KEY] : nil)
        error_type    = attrs[ERROR_TYPE_KEY]
        span_status   = safe_status(span)

        status = error_type || error_status?(span_status) ? :error : :success
        duration_ms = compute_duration_ms(span)

        cost = nil
        if status == :success && tool_name.nil? && model
          cost = Services::LlmCostEstimator.estimate(
            provider: provider,
            model: model,
            input_tokens: input_tokens,
            output_tokens: output_tokens
          )
        end

        event_attrs = {
          provider: provider || "unknown",
          model: model || "unknown",
          status: status,
          input_tokens: input_tokens,
          output_tokens: output_tokens,
          duration_ms: duration_ms,
          error_class: error_type,
          tool_name: tool_name,
          cost_usd_estimate: cost
        }

        # Only pass error_message when there is one, so successful events are
        # constructed with exactly the same keywords as before.
        message = status == :error ? status_description(span_status) : nil
        event_attrs[:error_message] = message if message

        ValueObjects::LlmCallEvent.new(**event_attrs)
      end

      # @return [Boolean] true when the status object reports the ERROR code
      def error_status?(span_status)
        return false unless span_status.respond_to?(:code)
        span_status.code == error_status_code
      end

      # @return [Integer] the SDK's ERROR status code, or the hardcoded fallback
      def error_status_code
        if defined?(::OpenTelemetry::Trace::Status::ERROR)
          ::OpenTelemetry::Trace::Status::ERROR
        else
          STATUS_ERROR_CODE
        end
      end

      # @return [String, nil] non-empty status description, if the status has one
      def status_description(span_status)
        return nil unless span_status.respond_to?(:description)
        description = span_status.description
        description.is_a?(String) && !description.empty? ? description : nil
      end

      # Returns the first non-nil value among the given keys, in order, so a
      # present-but-nil preferred key still falls through to its alias.
      def first_attr(attrs, keys)
        keys.each do |k|
          value = attrs[k]
          return value unless value.nil?
        end
        nil
      end

      # OTel timestamps are nanoseconds since epoch. Convert to ms; guard nils.
      def compute_duration_ms(span)
        start_ns = span.start_timestamp
        end_ns   = span.end_timestamp
        return nil unless start_ns && end_ns
        ((end_ns - start_ns) / 1_000_000.0).round(2)
      rescue StandardError
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RailsErrorDashboard
  module Integrations
    # Detection shim for the OpenTelemetry SDK. The LLM-observability feature
    # registers a SpanProcessor against `OpenTelemetry.tracer_provider` when
    # the host app already runs OTel — for ruby_llm and thoughtbot users
    # this is the zero-config path. When OTel is absent, we silently skip
    # the SpanProcessor (the Faraday middleware path still works).
    #
    # `opentelemetry-sdk` is an OPTIONAL dependency. This module must never
    # raise, never require the gem itself, and never assume the host has it.
    module OTel
      class << self
        # Returns true when the OpenTelemetry SDK is loaded and its
        # SpanProcessor class is reachable. The constant is used purely as a
        # marker that the tracing SDK is present.
        # Memoized — host apps don't dynamically load gems mid-process.
        # Rescues any unexpected error to a hard false: a broken partial
        # install must never block a request in the host app.
        # @return [Boolean]
        def available?
          return @available unless @available.nil?
          @available = detect
        rescue StandardError
          @available = false
        end

        # Test hook — clears the memoized result so specs can flip
        # OpenTelemetry constants in/out between examples.
        def reset!
          @available = nil
        end

        private

        # Probes the constants with `defined?`, which yields nil for a
        # missing constant instead of raising NameError.
        # @return [Boolean]
        def detect
          sdk_loaded = defined?(::OpenTelemetry) && defined?(::OpenTelemetry::SDK)
          sdk_loaded && defined?(::OpenTelemetry::SDK::Trace::SpanProcessor) ? true : false
        end
      end
    end
  end
end
|
|
@@ -89,7 +89,10 @@ module RailsErrorDashboard
|
|
|
89
89
|
end
|
|
90
90
|
|
|
91
91
|
# Error counts bucketed by hour of day.
#
# group_by_hour_of_day buckets into 0..23 to show diurnal patterns
# (when in the day errors peak). The chart title says "Errors by Hour
# of Day" — group_by_hour produced a chronological time series instead.
def errors_by_hour
  hourly_buckets = base_query.group_by_hour_of_day(:occurred_at)
  hourly_buckets.count
end
|
|
94
97
|
|
|
95
98
|
def top_affected_users
|
|
@@ -37,11 +37,13 @@ module RailsErrorDashboard
|
|
|
37
37
|
versions.each_with_object({}) do |(version, count), result|
|
|
38
38
|
errors = base_query.where(app_version: version)
|
|
39
39
|
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
#
|
|
44
|
-
|
|
40
|
+
# Pluck error_types once; both the unique-types count and the critical
|
|
41
|
+
# count classify off this string array. Avoids loading full ErrorLog
|
|
42
|
+
# records into memory just to compute counts (severity is a Ruby-side
|
|
43
|
+
# method on error_type, not a column).
|
|
44
|
+
types_for_version = errors.pluck(:error_type)
|
|
45
|
+
error_types = types_for_version.uniq.size
|
|
46
|
+
critical_count = types_for_version.count { |t| Services::SeverityClassifier.classify(t) == :critical }
|
|
45
47
|
|
|
46
48
|
# Get platforms for this version
|
|
47
49
|
platforms = errors.distinct.pluck(:platform).compact
|
|
@@ -180,15 +182,17 @@ module RailsErrorDashboard
|
|
|
180
182
|
error_types = base_query.distinct.pluck(:error_type)
|
|
181
183
|
return {} if error_types.count < 2
|
|
182
184
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
185
|
+
# Bucket counts by (error_type, hour-of-day) in a single SQL GROUP BY.
|
|
186
|
+
# Groupdate's group_by_hour_of_day generates database-portable SQL
|
|
187
|
+
# (PG/MySQL/SQLite) so we don't load full ErrorLog records into Ruby.
|
|
188
|
+
# Result shape: { [hour_int, error_type] => count }
|
|
189
|
+
grouped = base_query
|
|
190
|
+
.group_by_hour_of_day(:occurred_at)
|
|
191
|
+
.group(:error_type)
|
|
192
|
+
.count
|
|
189
193
|
|
|
190
|
-
|
|
191
|
-
|
|
194
|
+
hourly_distributions = error_types.each_with_object({}) do |error_type, h|
|
|
195
|
+
h[error_type] = (0..23).map { |hour| grouped[[ hour, error_type ]] || 0 }
|
|
192
196
|
end
|
|
193
197
|
|
|
194
198
|
# Calculate correlation between error type pairs
|
|
@@ -30,6 +30,8 @@ module RailsErrorDashboard
|
|
|
30
30
|
query = filter_by_platform(query)
|
|
31
31
|
query = filter_by_application(query)
|
|
32
32
|
query = filter_by_user_id(query)
|
|
33
|
+
query = filter_by_app_version(query)
|
|
34
|
+
query = filter_by_git_sha(query)
|
|
33
35
|
query = filter_by_search(query)
|
|
34
36
|
query = filter_by_severity(query)
|
|
35
37
|
query = filter_by_timeframe(query)
|
|
@@ -44,6 +46,18 @@ module RailsErrorDashboard
|
|
|
44
46
|
query
|
|
45
47
|
end
|
|
46
48
|
|
|
49
|
+
# Restricts the query to the app version named in the filters.
# Returns the query unchanged when no version filter is given or when
# ErrorLog has no `app_version` column.
def filter_by_app_version(query)
  if @filters[:app_version].present? && ErrorLog.column_names.include?("app_version")
    query.where(app_version: @filters[:app_version])
  else
    query
  end
end
|
|
54
|
+
|
|
55
|
+
# Restricts the query to the git SHA named in the filters.
# Returns the query unchanged when no SHA filter is given or when
# ErrorLog has no `git_sha` column.
def filter_by_git_sha(query)
  if @filters[:git_sha].present? && ErrorLog.column_names.include?("git_sha")
    query.where(git_sha: @filters[:git_sha])
  else
    query
  end
end
|
|
60
|
+
|
|
47
61
|
def filter_by_error_type(query)
|
|
48
62
|
return query unless @filters[:error_type].present?
|
|
49
63
|
|
|
@@ -28,27 +28,37 @@ module RailsErrorDashboard
|
|
|
28
28
|
def detect_cascades
|
|
29
29
|
return { detected: 0, updated: 0 } unless can_detect?
|
|
30
30
|
|
|
31
|
-
#
|
|
31
|
+
# Pluck (error_log_id, occurred_at) for every occurrence in the window
|
|
32
|
+
# ordered chronologically. Using pluck instead of loading full
|
|
33
|
+
# ActiveRecord rows keeps memory bounded to ~16 bytes/row instead of
|
|
34
|
+
# ~5KB/row, which matters because the host app schedules this job and
|
|
35
|
+
# the lookback window may contain a lot of occurrences.
|
|
32
36
|
start_time = @lookback_hours.hours.ago
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
37
|
+
rows = ErrorOccurrence
|
|
38
|
+
.where("occurred_at >= ?", start_time)
|
|
39
|
+
.order(:occurred_at)
|
|
40
|
+
.pluck(:error_log_id, :occurred_at)
|
|
41
|
+
|
|
42
|
+
# Two-pointer sweep: occurrences are time-sorted, so for each parent we
|
|
43
|
+
# only advance the child pointer forward through occurrences within the
|
|
44
|
+
# detection window. O(N + pairs) instead of O(N) inner SQL queries.
|
|
36
45
|
patterns_found = Hash.new { |h, k| h[k] = { delays: [], count: 0 } }
|
|
37
46
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
47
|
+
rows.each_with_index do |(parent_id, parent_time), i|
|
|
48
|
+
window_end = parent_time + DETECTION_WINDOW
|
|
49
|
+
j = i + 1
|
|
50
|
+
while j < rows.length
|
|
51
|
+
child_id, child_time = rows[j]
|
|
52
|
+
break if child_time > window_end
|
|
53
|
+
|
|
54
|
+
# Match the original SQL `occurred_at > parent` — strict, so two
|
|
55
|
+
# occurrences with identical timestamps don't form a cascade pair.
|
|
56
|
+
if child_id != parent_id && child_time > parent_time
|
|
57
|
+
key = [ parent_id, child_id ]
|
|
58
|
+
patterns_found[key][:delays] << (child_time - parent_time).to_f
|
|
59
|
+
patterns_found[key][:count] += 1
|
|
60
|
+
end
|
|
61
|
+
j += 1
|
|
52
62
|
end
|
|
53
63
|
end
|
|
54
64
|
|