rails_error_dashboard 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 128bf5d8e84190cf45290fc1000c5a0bc45dd27aea13f7cc6447d5795ad5059f
4
- data.tar.gz: 94e96577de22affd2fecbd2f7d723823840021b8fbcc3df10c1cf5092c150248
3
+ metadata.gz: 002c2e5d338c585425599802271a90c5bac425dc86867da1aa10a0c7a478d1e8
4
+ data.tar.gz: 077d7d7a7f957bee799cf39c96632532e3c74259e83310605c6068d57436aec3
5
5
  SHA512:
6
- metadata.gz: 4743bd41df374b04291b6887e7b08e0889ec7093bd312ba361b93156be9fbeb6bf4a103a35e8ff65b33cf0d07161c771ababbad1bcdd1d77e930bb3265b80f63
7
- data.tar.gz: 79dd710f629a9c3f6d3eab49405a13efb08b25f3f6212fe6b70d1f98839bffe5e36ba55524a77c99c3bcaf00ade39ac61e3dd2563179797e1ea5e1cfa03696e7
6
+ metadata.gz: 8e6ef9f87aae8c200e997bf761184eb37c86d0637dfdd383fd6cc003f3c3b9fa30b7906fbaf6377670d932a60da5a9046202d4a48ab63264ce916c861ac20b91
7
+ data.tar.gz: 67858acc12ad29cb412b0c049b776b7193c9ce8d45febfa4bc23e079a1afd7ff64a2a85bd81516c999cc57861631ed3f89ea62bc36544370811e8cff858c8678
data/README.md CHANGED
@@ -475,6 +475,32 @@ end
475
475
  [Plugin System guide →](docs/PLUGIN_SYSTEM.md)
476
476
  </details>
477
477
 
478
+ <details>
479
+ <summary><strong>OpenTelemetry Export — Emit Gem Operations as Spans</strong></summary>
480
+
481
+ Send the gem's error-capture pipeline as OpenTelemetry spans to your existing Datadog, Honeycomb, or Jaeger collector. Each stage of the capture path — DB write, breadcrumb harvest, system health snapshot, and notification dispatch — becomes a named child span so you can audit gem overhead from your own observability dashboards.
482
+
483
+ - Off by default — zero impact unless you opt in
484
+ - No-op when the OTel API gem isn't loaded
485
+ - Per-span-kind opt-in: enable only the stages you care about
486
+ - Every span individually rescue-wrapped — never raises into host code
487
+ - Boot-time warning if `enable_otel_export = true` but `opentelemetry-api` isn't in the Gemfile
488
+
489
+ ```ruby
490
+ # Gemfile — only the API gem is required; the SDK is optional
491
+ gem "opentelemetry-api"
492
+
493
+ # config/initializers/rails_error_dashboard.rb
494
+ config.enable_otel_export = true
495
+ config.otel_service_name = "my-app" # falls back to application_name
496
+ config.otel_spans = [:capture, :breadcrumbs, :health, :notifications] # all (default)
497
+ # config.otel_spans = [:capture] # parent span only
498
+ ```
499
+
500
+ Span names follow the `rails_error_dashboard.<operation>` convention, e.g. `rails_error_dashboard.capture_error`. Two attributes are attached to every span — `rails_error_dashboard.version` and `rails_error_dashboard.service_name` — use them to filter the gem's traffic in your dashboards.
501
+
502
+ </details>
503
+
478
504
  ---
479
505
 
480
506
  ## Quick Start
@@ -539,6 +565,8 @@ end
539
565
 
540
566
  **Multi-App Support** — Track errors from multiple Rails apps in a single shared database. Auto-detects app name, supports per-app filtering. [Multi-App guide →](docs/MULTI_APP_PERFORMANCE.md)
541
567
 
568
+ **OpenTelemetry Export** — Emit error-capture operations as OTel spans to Datadog, Honeycomb, or Jaeger. Add `gem "opentelemetry-api"` and set `config.enable_otel_export = true`. See the **OpenTelemetry Export — Emit Gem Operations as Spans** details section above for full options.
569
+
542
570
  ---
543
571
 
544
572
  ## Documentation
@@ -518,6 +518,26 @@ module RailsErrorDashboard
518
518
  @pagy, @channels = pagy(:offset, all_channels, limit: params[:per_page] || 25)
519
519
  end
520
520
 
521
+ def llm_health_summary
522
+ unless RailsErrorDashboard.configuration.enable_llm_observability &&
523
+ RailsErrorDashboard.configuration.enable_breadcrumbs
524
+ @feature_disabled = true
525
+ @days = days_param(default: 30)
526
+ @models = []
527
+ @totals = Queries::LlmHealthSummary.blank_totals
528
+ @pagy = nil
529
+ return
530
+ end
531
+
532
+ days = days_param(default: 30)
533
+ @days = days
534
+ result = Queries::LlmHealthSummary.call(days, application_id: @current_application_id)
535
+ @totals = result[:totals]
536
+ all_models = result[:models]
537
+
538
+ @pagy, @models = pagy(:offset, all_models, limit: params[:per_page] || 25)
539
+ end
540
+
521
541
  def activestorage_health_summary
522
542
  unless RailsErrorDashboard.configuration.enable_activestorage_tracking &&
523
543
  RailsErrorDashboard.configuration.enable_breadcrumbs
@@ -1002,6 +1002,9 @@ tr[data-red-row-href]:hover .sev-bar { opacity: 1 !important; }
1002
1002
  <% if RailsErrorDashboard.configuration.enable_activestorage_tracking && RailsErrorDashboard.configuration.enable_breadcrumbs %>
1003
1003
  <% health_items << { path: activestorage_health_summary_errors_path(nav_params), icon: 'bi-cloud-arrow-up', label: 'ActiveStorage' } %>
1004
1004
  <% end %>
1005
+ <% if RailsErrorDashboard.configuration.enable_llm_observability && RailsErrorDashboard.configuration.enable_breadcrumbs %>
1006
+ <% health_items << { path: llm_health_summary_errors_path(nav_params), icon: 'bi-cpu-fill', label: 'LLM' } %>
1007
+ <% end %>
1005
1008
 
1006
1009
  <% if health_items.any? %>
1007
1010
  <div style="margin-bottom: var(--space-2);" id="navHealthSection">
@@ -0,0 +1,249 @@
1
+ <% content_for :page_title, "LLM Health" %>
2
+
3
+ <div>
4
+ <div class="d-flex justify-content-between align-items-center mb-4">
5
+ <h1 style="font-size: 20px; font-weight: 700; margin: 0;">
6
+ <i class="bi bi-cpu-fill me-2"></i>
7
+ LLM Health
8
+ </h1>
9
+
10
+ <% unless @feature_disabled %>
11
+ <div class="btn-group" role="group">
12
+ <%= link_to llm_health_summary_errors_path(days: 7), class: "btn btn-sm #{@days == 7 ? 'btn-primary' : 'btn-outline-primary'}" do %>
13
+ 7 Days
14
+ <% end %>
15
+ <%= link_to llm_health_summary_errors_path(days: 30), class: "btn btn-sm #{@days == 30 ? 'btn-primary' : 'btn-outline-primary'}" do %>
16
+ 30 Days
17
+ <% end %>
18
+ <%= link_to llm_health_summary_errors_path(days: 90), class: "btn btn-sm #{@days == 90 ? 'btn-primary' : 'btn-outline-primary'}" do %>
19
+ 90 Days
20
+ <% end %>
21
+ </div>
22
+ <% end %>
23
+ </div>
24
+
25
+ <% if @feature_disabled %>
26
+ <div class="red-empty-state">
27
+ <i class="bi bi-cpu-fill display-1 text-muted mb-3"></i>
28
+ <div class="red-empty-state-title">LLM Observability Not Enabled</div>
29
+ <p class="text-muted">
30
+ To see per-model LLM stats here, enable LLM observability and breadcrumbs.
31
+ </p>
32
+ <div class="card mx-auto" style="max-width: 600px;">
33
+ <div class="card-body text-start">
34
+ <h6>Enable in <code>config/initializers/rails_error_dashboard.rb</code>:</h6>
35
+ <pre class="mb-0" style="background: var(--bg-subtle); padding: var(--space-3); border-radius: var(--radius-md); font-size: 12px;"><code>RailsErrorDashboard.configure do |config|
36
+ config.enable_breadcrumbs = true
37
+ config.enable_llm_observability = true
38
+ end</code></pre>
39
+ </div>
40
+ </div>
41
+ </div>
42
+
43
+ <% elsif @totals[:total_calls] == 0 && @totals[:total_tool_calls] == 0 %>
44
+ <div class="red-empty-state">
45
+ <i class="bi bi-cpu-fill display-1 text-success mb-3"></i>
46
+ <div class="red-empty-state-title">No LLM Calls Detected</div>
47
+ <p class="text-muted">
48
+ No LLM API calls were captured in error breadcrumbs over the last <%= @days %> days.
49
+ Instrument LLM calls in one of three ways:
50
+ </p>
51
+
52
+ <div class="card mx-auto" style="max-width: 720px;">
53
+ <div class="card-body p-0">
54
+ <ul class="nav nav-tabs" role="tablist" id="llmPathTabs">
55
+ <li class="nav-item" role="presentation">
56
+ <button class="nav-link active" data-bs-toggle="tab" data-bs-target="#llmPathOtel" type="button" role="tab">
57
+ <i class="bi bi-diagram-3 me-1"></i> OpenTelemetry
58
+ </button>
59
+ </li>
60
+ <li class="nav-item" role="presentation">
61
+ <button class="nav-link" data-bs-toggle="tab" data-bs-target="#llmPathFaraday" type="button" role="tab">
62
+ <i class="bi bi-arrow-left-right me-1"></i> Faraday / RubyLLM
63
+ </button>
64
+ </li>
65
+ <li class="nav-item" role="presentation">
66
+ <button class="nav-link" data-bs-toggle="tab" data-bs-target="#llmPathManual" type="button" role="tab">
67
+ <i class="bi bi-pencil-square me-1"></i> Manual
68
+ </button>
69
+ </li>
70
+ </ul>
71
+
72
+ <div class="tab-content p-3 text-start">
73
+ <div class="tab-pane fade show active" id="llmPathOtel" role="tabpanel">
74
+ <p class="mb-2"><small>Already running an OTel collector? Drop in this gem and we'll subscribe automatically.</small></p>
75
+ <pre class="mb-2" style="background: var(--bg-subtle); padding: var(--space-3); border-radius: var(--radius-md); font-size: 12px;"><code># Gemfile
76
+ gem "opentelemetry-instrumentation-ruby_llm"
77
+
78
+ # Then nothing else — OTel spans flow into RED breadcrumbs.</code></pre>
79
+ <small class="text-muted">Requires <code>enable_llm_observability = true</code>.</small>
80
+ </div>
81
+
82
+ <div class="tab-pane fade" id="llmPathFaraday" role="tabpanel">
83
+ <p class="mb-2"><small>Using <code>ruby_llm</code>, <code>openai</code>, <code>anthropic</code>, or any Faraday-based SDK? Insert the middleware once.</small></p>
84
+ <pre class="mb-2" style="background: var(--bg-subtle); padding: var(--space-3); border-radius: var(--radius-md); font-size: 12px;"><code>Faraday.default_connection_options.builder.tap do |b|
85
+ b.use RailsErrorDashboard::Integrations::LlmMiddleware
86
+ end</code></pre>
87
+ <small class="text-muted">RED auto-detects provider/model and tokens from the request/response.</small>
88
+ </div>
89
+
90
+ <div class="tab-pane fade" id="llmPathManual" role="tabpanel">
91
+ <p class="mb-2"><small>Direct <code>Net::HTTP</code>, gRPC, local inference, or anything else — instrument by hand.</small></p>
92
+ <pre class="mb-2" style="background: var(--bg-subtle); padding: var(--space-3); border-radius: var(--radius-md); font-size: 12px;"><code>ActiveSupport::Notifications.instrument(
93
+ "red.llm_call",
94
+ provider: "anthropic",
95
+ model: "claude-3-5-sonnet",
96
+ input_tokens: 1200,
97
+ output_tokens: 350
98
+ ) do
99
+ call_anthropic(...)
100
+ end</code></pre>
101
+ <small class="text-muted">Cost is auto-estimated from tokens — payload can override with <code>cost_usd_estimate:</code>.</small>
102
+ </div>
103
+ </div>
104
+ </div>
105
+ </div>
106
+ </div>
107
+
108
+ <% else %>
109
+ <div class="row mb-4">
110
+ <div class="col-md-3">
111
+ <div class="card text-center">
112
+ <div class="card-body">
113
+ <div class="display-6 text-primary"><%= @totals[:total_calls] %></div>
114
+ <small class="text-muted">LLM Calls</small>
115
+ </div>
116
+ </div>
117
+ </div>
118
+ <div class="col-md-3">
119
+ <div class="card text-center">
120
+ <div class="card-body">
121
+ <div class="display-6 text-secondary"><%= @totals[:model_count] %></div>
122
+ <small class="text-muted">Models</small>
123
+ </div>
124
+ </div>
125
+ </div>
126
+ <div class="col-md-3">
127
+ <div class="card text-center">
128
+ <div class="card-body">
129
+ <% err_color = @totals[:error_rate] >= 10 ? "danger" : (@totals[:error_rate] >= 5 ? "warning" : "success") %>
130
+ <div class="display-6 text-<%= err_color %>"><%= @totals[:unique_error_count] %></div>
131
+ <small class="text-muted">Errors with LLM (<%= @totals[:error_rate] %>%)</small>
132
+ </div>
133
+ </div>
134
+ </div>
135
+ <div class="col-md-3">
136
+ <div class="card text-center">
137
+ <div class="card-body">
138
+ <div class="display-6 text-info">$<%= format("%.2f", @totals[:total_cost_usd]) %></div>
139
+ <small class="text-muted">Total Cost</small>
140
+ </div>
141
+ </div>
142
+ </div>
143
+ </div>
144
+
145
+ <div class="card mb-4">
146
+ <div class="card-header d-flex justify-content-between align-items-center">
147
+ <h5 class="mb-0">
148
+ <i class="bi bi-cpu-fill text-primary me-2"></i>
149
+ Performance by Model
150
+ <span class="badge bg-primary"><%= @totals[:model_count] %></span>
151
+ </h5>
152
+ <small class="text-muted"><%== @pagy.info_tag %></small>
153
+ </div>
154
+ <div class="card-body p-0">
155
+ <div class="table-responsive">
156
+ <table class="table table-hover mb-0">
157
+ <thead class="table-light">
158
+ <tr>
159
+ <th>Provider · Model</th>
160
+ <th width="90">Calls</th>
161
+ <th width="80">Tools</th>
162
+ <th width="130">Avg tokens</th>
163
+ <th width="110">Avg latency</th>
164
+ <th width="100">Error rate</th>
165
+ <th width="100">Cost</th>
166
+ <th>Top error</th>
167
+ <th width="140">Last seen</th>
168
+ </tr>
169
+ </thead>
170
+ <tbody>
171
+ <% @models.each do |entry| %>
172
+ <tr>
173
+ <td>
174
+ <small class="text-muted"><%= entry[:provider] %></small><br>
175
+ <code><%= entry[:model] %></code>
176
+ </td>
177
+ <td><%= entry[:call_count] %></td>
178
+ <td>
179
+ <% if entry[:tool_call_count].positive? %>
180
+ <span class="badge bg-info"><%= entry[:tool_call_count] %></span>
181
+ <% else %>
182
+ <span class="text-muted">—</span>
183
+ <% end %>
184
+ </td>
185
+ <td>
186
+ <% if entry[:avg_input_tokens] || entry[:avg_output_tokens] %>
187
+ <small>
188
+ in <strong><%= number_with_delimiter(entry[:avg_input_tokens] || 0) %></strong><br>
189
+ out <strong><%= number_with_delimiter(entry[:avg_output_tokens] || 0) %></strong>
190
+ </small>
191
+ <% else %>
192
+ <span class="text-muted">—</span>
193
+ <% end %>
194
+ </td>
195
+ <td>
196
+ <% if entry[:avg_duration_ms] %>
197
+ <% lat_color = entry[:avg_duration_ms] > 5000 ? "text-danger" : entry[:avg_duration_ms] > 2000 ? "text-warning" : "text-muted" %>
198
+ <span class="<%= lat_color %>"><%= number_with_delimiter(entry[:avg_duration_ms].round) %>ms</span>
199
+ <% else %>
200
+ <span class="text-muted">—</span>
201
+ <% end %>
202
+ </td>
203
+ <td>
204
+ <% sev_color = entry[:severity] == :danger ? "danger" : (entry[:severity] == :warning ? "warning" : "success") %>
205
+ <span class="badge bg-<%= sev_color %>"><%= entry[:error_rate] %>%</span>
206
+ <% if entry[:error_count].positive? %>
207
+ <br><small class="text-muted"><%= entry[:error_count] %> failed</small>
208
+ <% end %>
209
+ </td>
210
+ <td>
211
+ <% if entry[:cost_usd_sum].positive? %>
212
+ $<%= format("%.4f", entry[:cost_usd_sum]) %>
213
+ <% else %>
214
+ <span class="text-muted">—</span>
215
+ <% end %>
216
+ </td>
217
+ <td>
218
+ <% if entry[:top_error_class] %>
219
+ <code style="font-size: 11px;"><%= entry[:top_error_class] %></code>
220
+ <small class="text-muted">×<%= entry[:top_error_class_count] %></small>
221
+ <% else %>
222
+ <span class="text-muted">—</span>
223
+ <% end %>
224
+ </td>
225
+ <td><%= local_time_ago(entry[:last_seen]) %></td>
226
+ </tr>
227
+ <% end %>
228
+ </tbody>
229
+ </table>
230
+ </div>
231
+ </div>
232
+ <div class="card-footer border-top d-flex justify-content-between align-items-center">
233
+ <div>
234
+ <small class="text-muted">
235
+ <i class="bi bi-lightbulb text-warning"></i> High error rates on a model often signal quota or rate-limit issues. Slow latency (&gt;5s) suggests provider degradation.
236
+ </small>
237
+ <small class="ms-3">
238
+ <a href="https://github.com/thoughtbot/opentelemetry-instrumentation-ruby_llm" target="_blank" rel="noopener" class="text-decoration-none">
239
+ <i class="bi bi-book"></i> Instrumentation Guide <i class="bi bi-box-arrow-up-right" style="font-size: 0.7em;"></i>
240
+ </a>
241
+ </small>
242
+ </div>
243
+ <div>
244
+ <%== @pagy.series_nav(:bootstrap) if @pagy.pages > 1 %>
245
+ </div>
246
+ </div>
247
+ </div>
248
+ <% end %>
249
+ </div>
data/config/routes.rb CHANGED
@@ -38,6 +38,7 @@ RailsErrorDashboard::Engine.routes.draw do
38
38
  get :rack_attack_summary
39
39
  get :actioncable_health_summary
40
40
  get :activestorage_health_summary
41
+ get :llm_health_summary
41
42
  get :releases
42
43
  get :user_impact
43
44
  get :diagnostic_dumps
@@ -483,6 +483,41 @@ RailsErrorDashboard.configure do |config|
483
483
  # config.llm_max_output_tokens = 900
484
484
  # config.llm_system_prompt = "Prefer concise answers with file-level next steps."
485
485
 
486
+ # ============================================================================
487
+ # OPENTELEMETRY EXPORT (OUTBOUND)
488
+ # ============================================================================
489
+ #
490
+ # Emit gem operations as OpenTelemetry spans so the host's existing
491
+ # Datadog / Honeycomb / Jaeger / Grafana Tempo pipeline gets a trace
492
+ # of every error capture. Useful for:
493
+ # - Auditing "when did this error get captured?" against deploy events
494
+ # - Measuring how much time the gem spends in the capture path
495
+ # - Proving the <5ms host-safety budget from operator dashboards
496
+ #
497
+ # Emits four spans per error capture:
498
+ # rails_error_dashboard.capture_error — parent, wraps everything
499
+ # rails_error_dashboard.breadcrumb_collection — buffer drain (~µs)
500
+ # rails_error_dashboard.system_health_snapshot — GC.stat etc. (<1ms)
501
+ # rails_error_dashboard.notification_dispatch — Slack/email enqueue
502
+ #
503
+ # Disabled by default. Requires the host app to already run OpenTelemetry
504
+ # (the gem does NOT add an opentelemetry-* runtime dependency). When OTel
505
+ # is absent, every span call is a zero-overhead no-op.
506
+ #
507
+ # config.enable_otel_export = true
508
+ # config.otel_service_name = "my-app" # Falls back to application_name when nil
509
+ #
510
+ # Per-span opt-out: list only the span kinds you want emitted; any kind left out
511
+ # is disabled without code changes. Useful when e.g. notification dispatch is slow due
512
+ # to outbound HTTP and you don't want it polluting your trace dashboards.
513
+ #
514
+ # config.otel_spans = [:capture, :breadcrumbs, :health, :notifications] # all (default)
515
+ # config.otel_spans = [:capture] # parent only
516
+ # config.otel_spans = [:capture, :health] # parent + health
517
+ #
518
+ # No PII or request bodies in span attributes — just metadata + timing.
519
+ # Safe to enable on production OTel pipelines.
520
+
486
521
  # ============================================================================
487
522
  # ISSUE TRACKING (GitHub / GitLab / Codeberg)
488
523
  # ============================================================================
@@ -17,6 +17,21 @@ module RailsErrorDashboard
17
17
  end
18
18
  end
19
19
 
20
+ # Build the base OTel span attributes available before any work happens.
21
+ # Kept as a module-level helper so both sync and async paths can call it.
22
+ # @return [Hash<String, Object>]
23
+ def self.build_capture_span_attributes(exception, was_async:)
24
+ msg = exception.message.to_s
25
+ {
26
+ "error.type" => exception.class.name,
27
+ "error.message" => msg.length > 200 ? "#{msg[0, 200]}…" : msg,
28
+ "rails_error_dashboard.environment" => (defined?(Rails) && Rails.env.to_s) || "unknown",
29
+ "rails_error_dashboard.was_async" => was_async
30
+ }
31
+ rescue StandardError
32
+ { "error.type" => "unknown", "rails_error_dashboard.was_async" => was_async }
33
+ end
34
+
20
35
  # Queue error logging as a background job
21
36
  def self.call_async(exception, context = {})
22
37
  # Serialize exception data for the job
@@ -68,7 +83,17 @@ module RailsErrorDashboard
68
83
  # Enqueue the async job using ActiveJob
69
84
  # The queue adapter (:sidekiq, :solid_queue, :async) is configured separately
70
85
  begin
71
- AsyncErrorLoggingJob.perform_later(exception_data, context)
86
+ # OTel: emit a capture span around the enqueue itself. The real capture
87
+ # work runs in the job (which starts its own root span via .new(...).call).
88
+ # For the async path the span here measures *enqueue latency only* — used
89
+ # to detect queue-adapter backpressure or Redis slowness.
90
+ Integrations::Tracer.in_span(
91
+ "capture_error",
92
+ kind: :capture,
93
+ attributes: build_capture_span_attributes(exception, was_async: true)
94
+ ) do |_span|
95
+ AsyncErrorLoggingJob.perform_later(exception_data, context)
96
+ end
72
97
  rescue => e
73
98
  # Queue adapter failed (e.g., Redis down for Sidekiq). Fall back to
74
99
  # sync logging so the error is still captured. Without this rescue,
@@ -118,13 +143,31 @@ module RailsErrorDashboard
118
143
  end
119
144
 
120
145
  def call
146
+ # OTel: parent capture span. Wraps the entire sync capture path so
147
+ # operators can audit how long error capture takes from their existing
148
+ # tracing pipeline. Child spans (breadcrumbs, health, notifications)
149
+ # nest under this one automatically via OTel context propagation.
150
+ #
151
+ # The span lives INSIDE the rescue-protected method body — if the span itself raises
152
+ # somehow, the outer rescue still catches it and returns nil. Defense
153
+ # in depth. When the block raises, the Tracer façade records the
154
+ # exception on the span and re-raises so the rescue can swallow it.
155
+ Integrations::Tracer.in_span(
156
+ "capture_error",
157
+ kind: :capture,
158
+ attributes: self.class.build_capture_span_attributes(@exception, was_async: false)
159
+ ) do |span|
121
160
  # Check if this exception should be logged (ignore list + sampling)
122
- return nil unless Services::ExceptionFilter.should_log?(@exception)
161
+ if !Services::ExceptionFilter.should_log?(@exception)
162
+ span&.set_attribute("rails_error_dashboard.filtered", true)
163
+ next nil
164
+ end
123
165
 
124
166
  error_context = ValueObjects::ErrorContext.new(@context, @context[:source])
125
167
 
126
168
  # Find or create application (cached lookup)
127
169
  application = find_or_create_application
170
+ span&.set_attribute("rails_error_dashboard.application", application.name.to_s) if application.respond_to?(:name)
128
171
 
129
172
  # Build error attributes
130
173
  truncated_backtrace = Services::BacktraceProcessor.truncate(@exception.backtrace)
@@ -262,6 +305,14 @@ module RailsErrorDashboard
262
305
  # This ensures accurate occurrence tracking
263
306
  error_log = ErrorLog.find_or_increment_by_hash(error_hash, attributes.merge(error_hash: error_hash))
264
307
 
308
+ # OTel: now that the error_log exists, attach its id + dedup flag + severity
309
+ # to the parent capture span so operators can correlate to dashboard URLs.
310
+ if span && error_log
311
+ span.set_attribute("rails_error_dashboard.error_id", error_log.id) if error_log.id
312
+ span.set_attribute("rails_error_dashboard.deduplicated", error_log.occurrence_count.to_i > 1)
313
+ span.set_attribute("rails_error_dashboard.severity", error_log.severity.to_s) if error_log.respond_to?(:severity) && error_log.severity
314
+ end
315
+
265
316
  # Track individual error occurrence for co-occurrence analysis (if table exists)
266
317
  if defined?(ErrorOccurrence) && ErrorOccurrence.table_exists?
267
318
  begin
@@ -298,6 +349,7 @@ module RailsErrorDashboard
298
349
  check_baseline_anomaly(error_log)
299
350
 
300
351
  error_log
352
+ end
301
353
  rescue => e
302
354
  # Don't let error logging cause more errors - fail silently
303
355
  # CRITICAL: Log but never propagate exception
@@ -189,6 +189,14 @@ module RailsErrorDashboard
189
189
  attr_accessor :llm_observability_content_capture # Capture prompt/completion text (default: false — PII risk)
190
190
  attr_accessor :llm_pricing_overrides # Hash of { "model-name" => { input: usd_per_1m, output: usd_per_1m } }
191
191
 
192
+ # OpenTelemetry outbound export — emit gem operations as OTel spans for
193
+ # Datadog/Honeycomb/Jaeger. Requires the host app to already run OTel.
194
+ # When OTel is absent OR enable_otel_export is false, all emit calls
195
+ # are no-ops with zero overhead.
196
+ attr_accessor :enable_otel_export # Master switch (default: false)
197
+ attr_accessor :otel_service_name # Falls back to application_name when nil
198
+ attr_accessor :otel_spans # Array of enabled span kinds — see Integrations::Tracer::ALL_SPAN_KINDS
199
+
192
200
  # Dashboard UI appearance
193
201
  attr_accessor :accent_color # :crimson (default), :ruby, :ember, :violet
194
202
 
@@ -372,6 +380,13 @@ module RailsErrorDashboard
372
380
  @llm_observability_content_capture = false
373
381
  @llm_pricing_overrides = {}
374
382
 
383
+ # OTel outbound export defaults — OFF (opt-in). All four span kinds enabled
384
+ # by default once master switch flips on; users can pass a subset to opt out
385
+ # of e.g. notification spans without code changes.
386
+ @enable_otel_export = false
387
+ @otel_service_name = nil
388
+ @otel_spans = %i[capture breadcrumbs health notifications]
389
+
375
390
  # Internal logging defaults - SILENT by default
376
391
  @enable_internal_logging = false # Opt-in for debugging
377
392
  @log_level = :silent # Silent by default, use :debug, :info, :warn, :error, or :silent
@@ -553,6 +568,28 @@ module RailsErrorDashboard
553
568
  @enable_llm_observability = false
554
569
  end
555
570
 
571
+ # Validate OTel export config — coerce or warn rather than raise so a
572
+ # config typo never blocks the host app from booting.
573
+ if enable_otel_export
574
+ unless otel_spans.is_a?(Array)
575
+ warnings << "otel_spans must be an Array of symbols (e.g. [:capture, :breadcrumbs]). " \
576
+ "Resetting to all-enabled."
577
+ @otel_spans = %i[capture breadcrumbs health notifications]
578
+ end
579
+
580
+ invalid = otel_spans - %i[capture breadcrumbs health notifications]
581
+ if invalid.any?
582
+ warnings << "otel_spans contains unknown kinds: #{invalid.inspect}. Allowed: " \
583
+ "[:capture, :breadcrumbs, :health, :notifications]. Ignoring unknown values."
584
+ @otel_spans = otel_spans - invalid
585
+ end
586
+
587
+ if @otel_spans.empty?
588
+ warnings << "enable_otel_export = true but otel_spans is empty — no spans will be emitted. " \
589
+ "Set otel_spans to enable at least one of [:capture, :breadcrumbs, :health, :notifications]."
590
+ end
591
+ end
592
+
556
593
  # Skip credential/service-dependent validations during Docker builds.
557
594
  # SECRET_KEY_BASE_DUMMY=1 means no credentials or external services available.
558
595
  build_env = ENV["SECRET_KEY_BASE_DUMMY"].present?
@@ -107,6 +107,20 @@ module RailsErrorDashboard
107
107
  # capability, so this is safe to call unconditionally.
108
108
  RailsErrorDashboard::Integrations::LlmSpanProcessor.register!
109
109
 
110
+ # Outbound OTel export — warn at boot if the feature is enabled but
111
+ # the OTel API isn't loaded. The Tracer façade silently no-ops in that
112
+ # state, so without this warning users could enable the feature and
113
+ # see zero spans without knowing why. Don't auto-disable — the user
114
+ # may install OTel later in the boot sequence.
115
+ if RailsErrorDashboard.configuration.enable_otel_export &&
116
+ !RailsErrorDashboard::Integrations::Tracer.otel_api_loaded?
117
+ Rails.logger.warn(
118
+ "[RailsErrorDashboard] enable_otel_export = true but the OpenTelemetry API " \
119
+ "(opentelemetry-api gem) isn't loaded. Outbound spans will not emit. " \
120
+ "Add `gem \"opentelemetry-api\"` (or the full opentelemetry-sdk) to your Gemfile."
121
+ )
122
+ end
123
+
110
124
  # Subscribe to red.llm_call / red.llm_tool_call AS::Notifications — Tier 3
111
125
  # path for hosts using direct Net::HTTP / gRPC / local inference servers
112
126
  # that aren't covered by OTel or the Faraday middleware.
@@ -0,0 +1,195 @@
# frozen_string_literal: true

module RailsErrorDashboard
  module Integrations
    # OpenTelemetry tracer façade for the outbound direction — emits spans
    # from the gem's capture path so host operators can audit error tracking
    # latency from their existing Datadog/Honeycomb/Jaeger pipeline.
    #
    # Symmetric counterpart to LlmSpanProcessor (which is INBOUND — pulls
    # OTel spans INTO RED breadcrumbs). This module pushes OUTBOUND: gem
    # operations OUT to the host's tracer provider.
    #
    # Designed to be called from hot paths unconditionally. When OTel is
    # absent or the feature is off, `in_span` runs the block with a no-op
    # span object — call sites do NOT branch on availability.
    #
    # HOST APP SAFETY (HOST_APP_SAFETY.md):
    #   - No-op when `enable_otel_export = false` OR OTel API not loaded
    #   - Per-span-kind opt-in/out via config.otel_spans
    #   - Tracer instance memoized per-process (rebuild on `reset!`)
    #   - Tracer/SDK failures are rescued — they never raise into host code
    #   - The caller's block runs EXACTLY ONCE, whatever the tracer does
    #   - Block return value is preserved even when the tracer errors
    #   - Exceptions raised by the block re-raise after being recorded
    #
    # Configuration:
    #   config.enable_otel_export = true                # master switch (default false)
    #   config.otel_service_name  = "my-app"            # falls back to application_name
    #   config.otel_spans         = [:capture, :breadcrumbs, :health, :notifications]
    #
    # Usage from capture-path code:
    #
    #   Tracer.in_span("capture_error", kind: :capture,
    #                  attributes: { error_type: exception.class.name }) do |span|
    #     # ... do the work ...
    #     span&.set_attribute("rails_error_dashboard.error_id", error.id)
    #   end
    #
    # The span object yielded may be the real OTel span or a NoopSpan.
    # Always use safe-nav (`span&.`) or guard with `span.respond_to?(:...)`.
    module Tracer
      INSTRUMENTATION_NAME = "rails_error_dashboard"
      ALL_SPAN_KINDS = %i[capture breadcrumbs health notifications].freeze

      # No-op stand-in returned to the block when tracing is off or unavailable.
      # Mimics the OTel Span interface (set_attribute, add_event, record_exception)
      # so call sites don't branch.
      class NoopSpan
        def set_attribute(_key, _value); self; end
        def add_event(_name, attributes: nil); self; end
        def record_exception(_exception, attributes: nil); self; end
        def status=(_status); end
        def finish; self; end
        def context; nil; end
      end

      NOOP_SPAN = NoopSpan.new.freeze

      class << self
        # Yields a span object to the block. Returns the block's return value.
        # Records exceptions raised by the block on the span and re-raises them.
        #
        # The block is guaranteed to run exactly once:
        #   - tracing off / tracer unavailable -> block runs with NOOP_SPAN and
        #     any exception it raises propagates untouched
        #   - tracer fails BEFORE the block starts -> block runs with NOOP_SPAN
        #   - tracer fails AFTER the block finished -> the block's result is returned
        #   - block raises -> exception is recorded on the span and re-raised
        #
        # @param name [String] short span name (will be namespaced with INSTRUMENTATION_NAME)
        # @param kind [Symbol] one of ALL_SPAN_KINDS — checked against config.otel_spans
        # @param attributes [Hash<String,Object>] attached to the span at creation
        # @yieldparam span [NoopSpan, ::OpenTelemetry::Trace::Span] real or no-op
        # @return [Object] whatever the block returns
        # @raise [StandardError] whatever the block raises
        def in_span(name, kind: :capture, attributes: {})
          # These yields are deliberately OUTSIDE any rescue: with tracing off,
          # a failing block must raise straight to the caller and never be re-run.
          return yield(NOOP_SPAN) unless emit?(kind)

          tr = tracer
          return yield(NOOP_SPAN) unless tr

          # Track how far the caller's block got so the rescue below can tell
          # "tracer broke" apart from "block raised" and never run the block twice.
          block_started = false
          block_finished = false
          block_error = nil
          block_result = nil

          begin
            full_name = "#{INSTRUMENTATION_NAME}.#{name}"
            merged = base_attributes.merge(safe_stringify(attributes))

            tr.in_span(full_name, attributes: merged) do |span|
              block_started = true
              begin
                block_result = yield span
                block_finished = true
                block_result
              rescue StandardError => e
                block_error = e
                record_block_exception(span, e)
                raise
              end
            end
          rescue StandardError => e
            # The caller's own exception always wins — hand it back unchanged.
            raise block_error if block_error

            # Tracer internals failed (e.g. OTel SDK threw while starting or
            # finishing the span). The host app must never see a crash caused
            # by the tracer.
            Logger.debug("[RailsErrorDashboard] Tracer.in_span(#{name.inspect}) failed: #{e.class}: #{e.message}")

            # Work already completed: keep its result instead of redoing it.
            return block_result if block_finished

            # Block was entered but neither finished nor raised a StandardError;
            # re-running it could duplicate side effects, so surface the failure.
            raise if block_started

            # Tracer failed before the block ever ran — run it once, untraced.
            yield NOOP_SPAN
          end
        end

        # Returns true when the OTel API is loaded AND the master switch is on
        # AND the given span kind is in the enabled set. Cheap — called on every
        # in_span invocation, including in the hot path.
        # @param kind [Symbol]
        # @return [Boolean]
        def emit?(kind)
          config = RailsErrorDashboard.configuration
          return false unless config.enable_otel_export
          return false unless otel_api_loaded?

          enabled_kinds = config.otel_spans
          return false if enabled_kinds.nil? || enabled_kinds.empty?
          enabled_kinds.include?(kind)
        rescue StandardError
          false
        end

        # Reset memoized tracer + availability — for spec isolation only.
        def reset!
          @tracer = nil
          @otel_api_loaded = nil
        end

        # Returns true if the OTel API gem is loaded (NOT the SDK). The API alone
        # is enough — it ships a ProxyTracerProvider that's a no-op when no SDK
        # is configured, which is the behavior we want.
        # @return [Boolean]
        def otel_api_loaded?
          return @otel_api_loaded unless @otel_api_loaded.nil?
          @otel_api_loaded = !!(defined?(::OpenTelemetry) &&
                                ::OpenTelemetry.respond_to?(:tracer_provider))
        rescue StandardError
          @otel_api_loaded = false
        end

        private

        # Memoized tracer instance. Returns nil on any failure so the caller
        # falls back to no-op behavior.
        # @return [::OpenTelemetry::Trace::Tracer, nil]
        def tracer
          return @tracer if @tracer
          return nil unless otel_api_loaded?

          @tracer = ::OpenTelemetry.tracer_provider.tracer(
            INSTRUMENTATION_NAME,
            RailsErrorDashboard::VERSION
          )
        rescue StandardError => e
          Logger.debug("[RailsErrorDashboard] Tracer initialization failed: #{e.class}: #{e.message}")
          nil
        end

        # Attributes attached to every span — service-name and gem version
        # let operators filter the gem's traffic out of their dashboards.
        def base_attributes
          config = RailsErrorDashboard.configuration
          {
            "rails_error_dashboard.version" => RailsErrorDashboard::VERSION,
            "rails_error_dashboard.service_name" => config.otel_service_name ||
                                                    config.application_name ||
                                                    "unknown"
          }
        rescue StandardError
          {}
        end

        # OTel attribute values must be strings, bools, numerics, or arrays of those.
        # Coerce hash values to strings as a safety net — host code passing arbitrary
        # objects (e.g. a Symbol or an Exception) won't crash the SDK. Keys are
        # stringified and nil values are dropped.
        def safe_stringify(attrs)
          return {} unless attrs.is_a?(Hash)
          attrs.each_with_object({}) do |(k, v), acc|
            key = k.to_s
            acc[key] = case v
            when String, Numeric, TrueClass, FalseClass then v
            when Array
              v.map { |x| x.is_a?(String) || x.is_a?(Numeric) || x == true || x == false ? x : x.to_s }
            when nil then nil
            else v.to_s
            end
          end.compact
        rescue StandardError
          {}
        end

        # OTel semconv for exceptions:
        #   span.record_exception(exception) -- adds an "exception" event
        #   span.status = OpenTelemetry::Trace::Status.error("message")
        # Best-effort: any failure while recording is swallowed so the caller's
        # original exception is the one that propagates.
        def record_block_exception(span, exception)
          return unless span.respond_to?(:record_exception)
          span.record_exception(exception)

          if defined?(::OpenTelemetry::Trace::Status) &&
             ::OpenTelemetry::Trace::Status.respond_to?(:error)
            span.status = ::OpenTelemetry::Trace::Status.error(exception.message.to_s[0, 200])
          end
        rescue StandardError
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,220 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RailsErrorDashboard
4
+ module Queries
5
# Query: Aggregate LLM call breadcrumbs across all errors, grouped by
# "provider · model". Scans error_logs breadcrumbs JSON, filters for "llm"
# and "llm_tool" category crumbs, and computes per-model stats: call count,
# tool count, avg tokens, avg latency, error rate, cost, top error class.
#
# Sorted by error rate desc, then unique-error count desc, then call volume
# desc — so the model causing the most errors floats to the top.
#
# Cost is read straight from the breadcrumb metadata (already estimated at
# capture time by LlmCallSubscriber via LlmCostEstimator). No re-estimation.
#
# Malformed breadcrumb payloads (invalid JSON, a JSON document that is not
# an array, non-Hash crumbs, non-Hash metadata) are skipped row by row so a
# single bad record cannot blank out the whole summary.
class LlmHealthSummary
  DANGER_THRESHOLD = 10.0 # error rate %
  WARNING_THRESHOLD = 5.0

  # Breadcrumb categories ("c") that count as LLM activity.
  LLM_CATEGORIES = %w[llm llm_tool].freeze
  # Metadata statuses that count as a failed call.
  FAILURE_STATUSES = %w[error timeout].freeze
  # Intermediate sums/counters removed from each row before it is returned.
  ACCUMULATOR_KEYS = %i[
    input_tokens_sum input_tokens_seen
    output_tokens_sum output_tokens_seen
    duration_sum duration_seen
    error_classes
  ].freeze

  # @param days [Integer] look-back window in days
  # @param application_id [Integer, nil] optional application filter
  # @return [Hash] `{ models: Array<Hash>, totals: Hash }`
  def self.call(days = 30, application_id: nil)
    new(days, application_id: application_id).call
  end

  # Public helper so controllers can render an empty-state shell without
  # running the query (e.g., when the feature is disabled).
  #
  # @return [Hash] zeroed totals
  def self.blank_totals
    {
      total_calls: 0,
      total_tool_calls: 0,
      model_count: 0,
      unique_error_count: 0,
      error_rate: 0.0,
      total_cost_usd: 0.0
    }
  end

  def initialize(days = 30, application_id: nil)
    @days = days
    @application_id = application_id
    @start_date = days.days.ago
  end

  # @return [Hash] `{ models: Array<Hash>, totals: Hash }`
  def call
    models = aggregated_models
    {
      models: models,
      totals: totals_for(models)
    }
  end

  private

  # Error logs inside the window that carry a breadcrumbs payload,
  # optionally narrowed to one application.
  def base_query
    scope = ErrorLog.where("occurred_at >= ?", @start_date)
                    .where.not(breadcrumbs: nil)
    scope = scope.where(application_id: @application_id) if @application_id.present?
    scope
  end

  # Walks matching error logs in batches and folds every LLM crumb into a
  # per-"provider·model" entry. Returns [] (after logging) on any failure.
  def aggregated_models
    results = {}

    base_query.select(:id, :breadcrumbs, :occurred_at).find_each(batch_size: 500) do |error_log|
      llm_crumbs(error_log.breadcrumbs).each do |crumb|
        # Guard against a non-Hash "meta" so String#[] / Integer#[] are
        # never called with metadata keys.
        meta = crumb["meta"].is_a?(Hash) ? crumb["meta"] : {}
        provider = meta["provider"].to_s.presence || "unknown"
        model = meta["model"].to_s.presence || "unknown"

        entry = (results["#{provider}·#{model}"] ||= new_entry(provider, model))
        accumulate(entry, crumb, meta)

        entry[:error_ids] << error_log.id
        entry[:last_seen] = [ entry[:last_seen], error_log.occurred_at ].compact.max
      end
    end

    finalize(results)
  rescue => e
    Rails.logger.error("[RailsErrorDashboard] LlmHealthSummary query failed: #{e.class}: #{e.message}")
    []
  end

  # Parses a raw breadcrumbs payload and keeps only the LLM crumbs.
  def llm_crumbs(raw)
    parse_breadcrumbs(raw).select { |crumb| LLM_CATEGORIES.include?(crumb["c"]) }
  end

  # Folds one crumb's counters and metrics into its model entry.
  def accumulate(entry, crumb, meta)
    counter = crumb["c"] == "llm_tool" ? :tool_call_count : :call_count
    entry[counter] += 1
    entry[:error_count] += 1 if FAILURE_STATUSES.include?(meta["status"].to_s)

    # BreadcrumbCollector#truncate_metadata stringifies every metadata
    # value (input_tokens "42", cost_usd "0.0003", etc.), so we coerce
    # back to numeric here. nil and blank values skip the accumulator
    # entirely so they don't pollute averages.
    if (input_tokens = meta["input_tokens"]).present?
      entry[:input_tokens_sum] += input_tokens.to_i
      entry[:input_tokens_seen] += 1
    end
    if (output_tokens = meta["output_tokens"]).present?
      entry[:output_tokens_sum] += output_tokens.to_i
      entry[:output_tokens_seen] += 1
    end

    accumulate_duration(entry, meta["duration_ms"] || crumb["d"])

    if (cost = meta["cost_usd"]).present?
      entry[:cost_usd_sum] += cost.to_f
    end

    error_class = meta["error_class"]
    if error_class.is_a?(String) && !error_class.empty?
      entry[:error_classes][error_class] = (entry[:error_classes][error_class] || 0) + 1
    end
  end

  # Adds a latency sample; blank and non-positive durations are ignored.
  def accumulate_duration(entry, duration_raw)
    return unless duration_raw.present?

    duration = duration_raw.to_f
    return unless duration > 0

    entry[:duration_sum] += duration
    entry[:duration_seen] += 1
  end

  # Fresh accumulator for one provider/model pair.
  def new_entry(provider, model)
    {
      provider: provider,
      model: model,
      call_count: 0,
      tool_call_count: 0,
      error_count: 0,
      input_tokens_sum: 0,
      input_tokens_seen: 0,
      output_tokens_sum: 0,
      output_tokens_seen: 0,
      duration_sum: 0.0,
      duration_seen: 0,
      cost_usd_sum: 0.0,
      error_classes: {},
      error_ids: [],
      last_seen: nil
    }
  end

  # Turns raw accumulators into display rows (rates, averages, severity,
  # top error class), strips the intermediate counters, and sorts rows by
  # error rate, unique errors, then call volume — all descending.
  def finalize(results)
    rows = results.values

    rows.each do |row|
      row[:error_ids] = row[:error_ids].uniq
      row[:unique_error_count] = row[:error_ids].size

      attempts = row[:call_count] + row[:tool_call_count]
      row[:error_rate] = attempts.positive? ? (row[:error_count].to_f / attempts * 100).round(2) : 0.0
      row[:severity] = severity_for(row[:error_rate])

      row[:avg_input_tokens] = avg(row[:input_tokens_sum], row[:input_tokens_seen])
      row[:avg_output_tokens] = avg(row[:output_tokens_sum], row[:output_tokens_seen])
      row[:avg_duration_ms] = row[:duration_seen].positive? ? (row[:duration_sum] / row[:duration_seen]).round(2) : nil
      row[:cost_usd_sum] = row[:cost_usd_sum].round(4)

      top_class, top_count = row[:error_classes].max_by { |_, count| count }
      row[:top_error_class] = top_class
      row[:top_error_class_count] = top_count

      # Drop accumulators — view doesn't need them
      ACCUMULATOR_KEYS.each { |key| row.delete(key) }
    end

    rows.sort_by { |row| [ -row[:error_rate], -row[:unique_error_count], -row[:call_count] ] }
  end

  # Rounded integer mean, or nil when there were no samples.
  def avg(sum, count)
    return nil if count.zero?
    (sum.to_f / count).round
  end

  def severity_for(error_rate)
    return :danger if error_rate >= DANGER_THRESHOLD
    return :warning if error_rate >= WARNING_THRESHOLD
    :success
  end

  # Cross-model totals. Anything that is not a non-empty Array yields the
  # blank totals (the type check runs first so nil never receives #empty?).
  def totals_for(models)
    return blank_totals unless models.is_a?(Array) && !models.empty?

    total_calls = models.sum { |m| m[:call_count] }
    total_tool_calls = models.sum { |m| m[:tool_call_count] }
    total_errors = models.sum { |m| m[:error_count] }
    total_attempts = total_calls + total_tool_calls
    unique_error_ids = models.flat_map { |m| m[:error_ids] }.uniq

    {
      total_calls: total_calls,
      total_tool_calls: total_tool_calls,
      model_count: models.size,
      unique_error_count: unique_error_ids.size,
      error_rate: total_attempts.positive? ? (total_errors.to_f / total_attempts * 100).round(2) : 0.0,
      total_cost_usd: models.sum { |m| m[:cost_usd_sum] }.round(4)
    }
  end

  def blank_totals
    self.class.blank_totals
  end

  # Decodes a breadcrumbs payload into an Array of Hash crumbs.
  #
  # Accepts a JSON String or an already-decoded Array. Returns [] for nil,
  # blank or invalid JSON, and for JSON documents that are not arrays
  # (e.g. "null", "{}"). Non-Hash elements are dropped so callers can
  # index crumbs by key safely.
  def parse_breadcrumbs(raw)
    parsed = raw.is_a?(String) ? JSON.parse(raw) : raw
    return [] unless parsed.is_a?(Array)

    parsed.select { |crumb| crumb.is_a?(Hash) }
  rescue JSON::ParserError, TypeError
    []
  end
end
219
+ end
220
+ end
@@ -120,12 +120,33 @@ module RailsErrorDashboard
120
120
  # Harvest breadcrumbs from the current buffer and clear it
121
121
  # @return [Array<Hash>] Array of breadcrumb hashes (empty if none)
122
122
  def self.harvest
123
- buffer = Thread.current[THREAD_KEY]
124
- return [] unless buffer
123
+ # OTel: emit a child span around the harvest so operators see the
124
+ # buffer-drain step in the capture trace. Cheap to compute (single
125
+ # Array#size + JSON byte estimate) and contained to LogError invocations
126
+ # via the parent rails_error_dashboard.capture_error span.
127
+ RailsErrorDashboard::Integrations::Tracer.in_span(
128
+ "breadcrumb_collection",
129
+ kind: :breadcrumbs
130
+ ) do |span|
131
+ buffer = Thread.current[THREAD_KEY]
132
+ if buffer.nil?
133
+ span&.set_attribute("breadcrumb_count", 0)
134
+ next []
135
+ end
125
136
 
126
- result = buffer.to_a
127
- buffer.clear
128
- result
137
+ result = buffer.to_a
138
+ buffer.clear
139
+
140
+ # Only pay for attribute computation when a real span is recording.
141
+ # NoopSpan is the singleton returned when OTel is off — skip the work
142
+ # entirely so the harvest path stays free in the common case.
143
+ if span && !span.equal?(RailsErrorDashboard::Integrations::Tracer::NOOP_SPAN)
144
+ span.set_attribute("breadcrumb_count", result.size)
145
+ span.set_attribute("bytes_serialized_estimate", estimate_byte_size(result))
146
+ end
147
+
148
+ result
149
+ end
129
150
  rescue => e
130
151
  RailsErrorDashboard::Logger.debug("[RailsErrorDashboard] BreadcrumbCollector.harvest failed: #{e.message}")
131
152
  []
@@ -222,6 +243,25 @@ module RailsErrorDashboard
222
243
  {}
223
244
  end
224
245
  private_class_method :truncate_metadata
246
+
247
# Approximates the serialized size of a breadcrumb list without actually
# serializing it. Each Hash crumb contributes a flat 12 bytes (timestamp +
# category key overhead) plus the byte length of its message ("m") and of
# every metadata ("meta") value; non-Hash elements contribute nothing.
# Feeds the bytes_serialized_estimate attribute on the OTel
# breadcrumb_collection span.
#
# @param breadcrumbs [Array<Hash>] crumbs keyed by symbols or strings
# @return [Integer] estimated bytes; 0 for non-Array input or on any error
def self.estimate_byte_size(breadcrumbs)
  return 0 unless breadcrumbs.is_a?(Array)

  total = 0
  breadcrumbs.each do |crumb|
    next unless crumb.is_a?(Hash)

    message = crumb[:m] || crumb["m"]
    size = 12 + message.to_s.bytesize

    metadata = crumb[:meta] || crumb["meta"]
    metadata.each_value { |value| size += value.to_s.bytesize } if metadata.is_a?(Hash)

    total += size
  end
  total
rescue StandardError
  0
end
264
+ private_class_method :estimate_byte_size
225
265
  end
226
266
  end
227
267
  end
@@ -12,26 +12,48 @@ module RailsErrorDashboard
12
12
  class ErrorNotificationDispatcher
13
13
  # @param error_log [ErrorLog] The error to notify about
14
14
  def self.call(error_log)
15
- config = RailsErrorDashboard.configuration
15
+ # OTel: emit a child span around the dispatch so operators can see
16
+ # which channels fired for a given error and how long the enqueue
17
+ # itself took. Actual delivery happens in the background jobs (Slack
18
+ # HTTP, SMTP, etc.) — those would need their own instrumentation to
19
+ # measure delivery latency.
20
+ RailsErrorDashboard::Integrations::Tracer.in_span(
21
+ "notification_dispatch",
22
+ kind: :notifications,
23
+ attributes: { "rails_error_dashboard.error_id" => error_log.id.to_i }
24
+ ) do |span|
25
+ config = RailsErrorDashboard.configuration
26
+ fired = []
16
27
 
17
- if config.enable_slack_notifications && config.slack_webhook_url.present?
18
- SlackErrorNotificationJob.perform_later(error_log.id)
19
- end
28
+ if config.enable_slack_notifications && config.slack_webhook_url.present?
29
+ SlackErrorNotificationJob.perform_later(error_log.id)
30
+ fired << "slack"
31
+ end
20
32
 
21
- if config.enable_email_notifications && config.notification_email_recipients.present?
22
- EmailErrorNotificationJob.perform_later(error_log.id)
23
- end
33
+ if config.enable_email_notifications && config.notification_email_recipients.present?
34
+ EmailErrorNotificationJob.perform_later(error_log.id)
35
+ fired << "email"
36
+ end
24
37
 
25
- if config.enable_discord_notifications && config.discord_webhook_url.present?
26
- DiscordErrorNotificationJob.perform_later(error_log.id)
27
- end
38
+ if config.enable_discord_notifications && config.discord_webhook_url.present?
39
+ DiscordErrorNotificationJob.perform_later(error_log.id)
40
+ fired << "discord"
41
+ end
28
42
 
29
- if config.enable_pagerduty_notifications && config.pagerduty_integration_key.present?
30
- PagerdutyErrorNotificationJob.perform_later(error_log.id)
31
- end
43
+ if config.enable_pagerduty_notifications && config.pagerduty_integration_key.present?
44
+ PagerdutyErrorNotificationJob.perform_later(error_log.id)
45
+ fired << "pagerduty"
46
+ end
47
+
48
+ if config.enable_webhook_notifications && config.webhook_urls.present?
49
+ WebhookErrorNotificationJob.perform_later(error_log.id)
50
+ fired << "webhook"
51
+ end
32
52
 
33
- if config.enable_webhook_notifications && config.webhook_urls.present?
34
- WebhookErrorNotificationJob.perform_later(error_log.id)
53
+ if span && !span.equal?(RailsErrorDashboard::Integrations::Tracer::NOOP_SPAN)
54
+ span.set_attribute("channels", fired)
55
+ span.set_attribute("channel_count", fired.size)
56
+ end
35
57
  end
36
58
  end
37
59
  end
@@ -24,7 +24,16 @@ module RailsErrorDashboard
24
24
  # Capture current system health metrics
25
25
  # @return [Hash] Health snapshot (always safe, never raises)
26
26
  def self.capture
27
- new.capture
27
+ # OTel: emit a child span around the snapshot so operators can verify
28
+ # the <1ms health-budget claim from their own tracing dashboard. The
29
+ # snapshot itself is read-only (GC.stat, pool.stat, procfs reads) so
30
+ # the span carries no useful attributes beyond timing.
31
+ RailsErrorDashboard::Integrations::Tracer.in_span(
32
+ "system_health_snapshot",
33
+ kind: :health
34
+ ) do |_span|
35
+ new.capture
36
+ end
28
37
  rescue => e
29
38
  RailsErrorDashboard::Logger.debug("[RailsErrorDashboard] SystemHealthSnapshot.capture failed: #{e.message}")
30
39
  { captured_at: Time.current.iso8601 }
@@ -1,3 +1,3 @@
1
1
  module RailsErrorDashboard
2
- VERSION = "0.7.1"
2
+ VERSION = "0.8.0"
3
3
  end
@@ -21,6 +21,7 @@ begin; require "turbo-rails"; rescue LoadError; end
21
21
  require "rails_error_dashboard/value_objects/error_context"
22
22
  require "rails_error_dashboard/value_objects/llm_call_event"
23
23
  require "rails_error_dashboard/integrations/o_tel"
24
+ require "rails_error_dashboard/integrations/tracer"
24
25
  require "rails_error_dashboard/integrations/llm_span_processor"
25
26
  require "rails_error_dashboard/integrations/llm_middleware"
26
27
  require "rails_error_dashboard/helpers/user_model_detector"
@@ -131,6 +132,7 @@ require "rails_error_dashboard/queries/job_health_summary"
131
132
  require "rails_error_dashboard/queries/database_health_summary"
132
133
  require "rails_error_dashboard/queries/swallowed_exception_summary"
133
134
  require "rails_error_dashboard/queries/rack_attack_summary"
135
+ require "rails_error_dashboard/queries/llm_health_summary"
134
136
  require "rails_error_dashboard/queries/release_timeline"
135
137
  require "rails_error_dashboard/error_reporter"
136
138
  require "rails_error_dashboard/middleware/error_catcher"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rails_error_dashboard
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Anjan Jagirdar
@@ -312,6 +312,7 @@ files:
312
312
  - app/views/rails_error_dashboard/errors/diagnostic_dumps.html.erb
313
313
  - app/views/rails_error_dashboard/errors/index.html.erb
314
314
  - app/views/rails_error_dashboard/errors/job_health_summary.html.erb
315
+ - app/views/rails_error_dashboard/errors/llm_health_summary.html.erb
315
316
  - app/views/rails_error_dashboard/errors/n_plus_one_summary.html.erb
316
317
  - app/views/rails_error_dashboard/errors/overview.html.erb
317
318
  - app/views/rails_error_dashboard/errors/platform_comparison.html.erb
@@ -397,6 +398,7 @@ files:
397
398
  - lib/rails_error_dashboard/integrations/llm_middleware.rb
398
399
  - lib/rails_error_dashboard/integrations/llm_span_processor.rb
399
400
  - lib/rails_error_dashboard/integrations/o_tel.rb
401
+ - lib/rails_error_dashboard/integrations/tracer.rb
400
402
  - lib/rails_error_dashboard/logger.rb
401
403
  - lib/rails_error_dashboard/manual_error_reporter.rb
402
404
  - lib/rails_error_dashboard/middleware/error_catcher.rb
@@ -421,6 +423,7 @@ files:
421
423
  - lib/rails_error_dashboard/queries/errors_list.rb
422
424
  - lib/rails_error_dashboard/queries/filter_options.rb
423
425
  - lib/rails_error_dashboard/queries/job_health_summary.rb
426
+ - lib/rails_error_dashboard/queries/llm_health_summary.rb
424
427
  - lib/rails_error_dashboard/queries/mttr_stats.rb
425
428
  - lib/rails_error_dashboard/queries/n_plus_one_summary.rb
426
429
  - lib/rails_error_dashboard/queries/platform_comparison.rb
@@ -508,7 +511,7 @@ metadata:
508
511
  funding_uri: https://github.com/sponsors/AnjanJ
509
512
  post_install_message: |
510
513
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
511
- RED (Rails Error Dashboard) v0.7.1
514
+ RED (Rails Error Dashboard) v0.8.0
512
515
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
513
516
 
514
517
  First install:
@@ -544,7 +547,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
544
547
  - !ruby/object:Gem::Version
545
548
  version: '0'
546
549
  requirements: []
547
- rubygems_version: 3.6.9
550
+ rubygems_version: 4.0.3
548
551
  specification_version: 4
549
552
  summary: Self-hosted error tracking and exception monitoring for Rails. Free, forever.
550
553
  test_files: []