solid_events 0.1.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -0
- data/LICENSE.txt +21 -0
- data/README.md +406 -10
- data/Rakefile +9 -5
- data/app/controllers/solid_events/api_controller.rb +782 -0
- data/app/controllers/solid_events/application_controller.rb +4 -0
- data/app/controllers/solid_events/incidents_controller.rb +38 -0
- data/app/controllers/solid_events/saved_views_controller.rb +43 -0
- data/app/controllers/solid_events/traces_controller.rb +731 -0
- data/app/helpers/solid_events/traces_helper.rb +79 -0
- data/app/jobs/solid_events/evaluate_incidents_job.rb +11 -0
- data/app/jobs/solid_events/prune_job.rb +26 -0
- data/app/models/solid_events/causal_edge.rb +9 -0
- data/app/models/solid_events/error_link.rb +11 -0
- data/app/models/solid_events/event.rb +11 -0
- data/app/models/solid_events/incident.rb +68 -0
- data/app/models/solid_events/incident_event.rb +13 -0
- data/app/models/solid_events/journey.rb +62 -0
- data/app/models/solid_events/record.rb +11 -0
- data/app/models/solid_events/record_link.rb +11 -0
- data/app/models/solid_events/saved_view.rb +11 -0
- data/app/models/solid_events/summary.rb +11 -0
- data/app/models/solid_events/trace.rb +85 -0
- data/app/views/layouts/solid_events/_style.html.erb +39 -0
- data/app/views/layouts/solid_events/application.html.erb +21 -14
- data/app/views/solid_events/incidents/events.html.erb +60 -0
- data/app/views/solid_events/traces/hot_path.html.erb +63 -0
- data/app/views/solid_events/traces/index.html.erb +532 -0
- data/app/views/solid_events/traces/show.html.erb +216 -0
- data/app/views/solid_events/traces/timeline.html.erb +54 -0
- data/config/locales/en.yml +4 -0
- data/config/routes.rb +35 -0
- data/db/migrate/20260216010000_create_solid_events_tables.rb +51 -0
- data/db/migrate/20260216020000_create_solid_events_summaries.rb +33 -0
- data/db/migrate/20260216030000_add_dimensions_to_solid_events_summaries.rb +20 -0
- data/db/migrate/20260216040000_add_request_id_to_solid_events_summaries.rb +8 -0
- data/db/migrate/20260216050000_add_sql_metrics_to_solid_events_summaries.rb +8 -0
- data/db/migrate/20260216060000_add_deploy_dimensions_to_solid_events_summaries.rb +17 -0
- data/db/migrate/20260216070000_create_solid_events_incidents.rb +30 -0
- data/db/migrate/20260216080000_add_schema_version_to_solid_events_summaries.rb +7 -0
- data/db/migrate/20260216090000_add_assignment_and_mute_to_solid_events_incidents.rb +12 -0
- data/db/migrate/20260216100000_add_resolution_metadata_to_solid_events_incidents.rb +11 -0
- data/db/migrate/20260216110000_add_assignment_audit_to_solid_events_incidents.rb +10 -0
- data/db/migrate/20260216120000_create_solid_events_saved_views.rb +17 -0
- data/db/migrate/20260216130000_create_solid_events_incident_events.rb +19 -0
- data/db/migrate/20260216140000_add_incident_event_lookup_indexes.rb +8 -0
- data/db/migrate/20260216150000_add_causal_links_to_solid_events.rb +15 -0
- data/db/migrate/20260216160000_create_solid_events_journeys_and_causal_edges.rb +45 -0
- data/lib/generators/solid_events/install/USAGE +8 -0
- data/lib/generators/solid_events/install/install_generator.rb +26 -0
- data/lib/generators/solid_events/install/templates/config/initializers/solid_events.rb +84 -0
- data/lib/generators/solid_events/install/templates/db/events_schema.rb +206 -0
- data/lib/solid_events/benchmark.rb +43 -0
- data/lib/solid_events/configuration.rb +167 -0
- data/lib/solid_events/context_scraper.rb +23 -0
- data/lib/solid_events/controller_tracing.rb +94 -0
- data/lib/solid_events/current.rb +15 -0
- data/lib/solid_events/engine.rb +93 -0
- data/lib/solid_events/incident_evaluator.rb +327 -0
- data/lib/solid_events/labeler.rb +21 -0
- data/lib/solid_events/notifiers/slack_webhook_notifier.rb +36 -0
- data/lib/solid_events/subscribers/action_cable_subscriber.rb +48 -0
- data/lib/solid_events/subscribers/controller_subscriber.rb +39 -0
- data/lib/solid_events/subscribers/enqueue_subscriber.rb +30 -0
- data/lib/solid_events/subscribers/error_subscriber.rb +107 -0
- data/lib/solid_events/subscribers/external_http_subscriber.rb +54 -0
- data/lib/solid_events/subscribers/job_subscriber.rb +45 -0
- data/lib/solid_events/subscribers/mailer_subscriber.rb +49 -0
- data/lib/solid_events/subscribers/sql_subscriber.rb +46 -0
- data/lib/solid_events/tracer.rb +672 -0
- data/lib/solid_events/version.rb +3 -1
- data/lib/solid_events.rb +210 -3
- data/lib/tasks/solid_events_tasks.rake +30 -4
- metadata +141 -28
- data/MIT-LICENSE +0 -20
- data/app/assets/config/solid_events_manifest.js +0 -1
- data/app/assets/stylesheets/solid_events/application.css +0 -15
- data/app/helpers/solid_events/application_helper.rb +0 -4
- data/app/jobs/solid_events/application_job.rb +0 -4
- data/app/mailers/solid_events/application_mailer.rb +0 -6
- data/app/models/solid_events/application_record.rb +0 -5
|
@@ -0,0 +1,782 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SolidEvents
|
|
4
|
+
class ApiController < ApplicationController
|
|
5
|
+
before_action :authenticate_api!
|
|
6
|
+
|
|
7
|
+
# Lists incidents newest-first (ordered by id), optionally filtered by the
# status, kind and severity params and paginated through cursor/limit.
# Renders an empty page when the incidents table is not available.
def incidents
  page = []

  if incident_table_available?
    relation = SolidEvents::Incident.order(id: :desc)
    %i[status kind severity].each do |column|
      relation = relation.where(column => params[column]) if params[column].present?
    end
    page = apply_cursor(relation).limit(limit_param)
  end

  render json: {
    data: page.map { |incident| serialize_incident(incident) },
    # The id of the last row on this page doubles as the cursor for the next one.
    next_cursor: page.last&.id
  }
end
|
|
24
|
+
|
|
25
|
+
# Renders one incident together with the canonical events of its related
# traces (at most limit_param of them).
def incident_traces
  record = SolidEvents::Incident.find(params[:id])
  related = incident_related_traces(record).limit(limit_param)

  render json: {
    incident: serialize_incident(record),
    traces: related.map { |trace| trace.canonical_event }
  }
end
|
|
33
|
+
|
|
34
|
+
# Renders the full context payload (see context_payload_for) of the incident
# identified by params[:id].
def incident_context
  render json: context_payload_for(SolidEvents::Incident.find(params[:id]))
end
|
|
38
|
+
|
|
39
|
+
# Renders the recent lifecycle events of one incident, optionally restricted
# to a single action via params[:event_action], paginated by cursor/limit.
def incident_events
  record = SolidEvents::Incident.find(params[:id])
  action_filter = params[:event_action]

  timeline = record.incident_events.recent.limit(limit_param)
  timeline = timeline.where(action: action_filter.to_s) if action_filter.present?
  timeline = apply_cursor(timeline)

  render json: {
    incident: serialize_incident(record),
    data: timeline.map { |entry| serialize_incident_event(entry) },
    next_cursor: timeline.last&.id
  }
end
|
|
50
|
+
|
|
51
|
+
# Summarises the evidence behind an incident from the summaries of up to
# 2,000 related traces: the ten most frequent sources, a count per status,
# the ten most frequent (entity_type, entity_id) pairs, plus the duration
# slice and error rate computed by the shared helpers.
def incident_evidences
  record = SolidEvents::Incident.find(params[:id])
  related = incident_related_traces(record).includes(:summary).limit(2_000)
  summaries = related.filter_map(&:summary)

  top_sources = summaries.map(&:source).tally.sort_by { |_source, hits| -hits }.first(10).to_h
  status_counts = summaries.map(&:status).tally

  # Only summaries carrying both an entity type and an entity id are counted.
  entity_pairs = summaries.filter_map do |summary|
    pair = [summary.entity_type, summary.entity_id]
    pair unless pair.any?(&:blank?)
  end
  top_entities = entity_pairs.tally.sort_by { |_pair, hits| -hits }.first(10).map do |(type, id), hits|
    {entity_type: type, entity_id: id, count: hits}
  end

  render json: {
    incident: serialize_incident(record),
    evidences: {
      by_source: top_sources,
      by_status: status_counts,
      by_entity: top_entities,
      duration_ms: duration_slice_for(summaries),
      error_rate_pct: error_rate_for(summaries)
    }
  }
end
|
|
77
|
+
|
|
78
|
+
# Acknowledges the incident identified by params[:id] and renders its
# serialized form.
def acknowledge_incident
  record = SolidEvents::Incident.find(params[:id])
  record.acknowledge!

  render json: serialize_incident(record)
end
|
|
83
|
+
|
|
84
|
+
# Assigns the incident using the owner, team, assigned_by and assignment_note
# params (blank values are passed as nil) and renders the serialized incident.
def assign_incident
  record = SolidEvents::Incident.find(params[:id])
  assignment = {
    owner: params[:owner].presence,
    team: params[:team].presence,
    assigned_by: params[:assigned_by].presence,
    assignment_note: params[:assignment_note].presence
  }
  record.assign!(**assignment)

  render json: serialize_incident(record)
end
|
|
94
|
+
|
|
95
|
+
# Mutes the incident for params[:minutes] minutes; a missing, non-numeric or
# non-positive value falls back to 60 minutes.
def mute_incident
  record = SolidEvents::Incident.find(params[:id])
  requested = params[:minutes].to_i
  duration = (requested.positive? ? requested : 60).minutes
  record.mute_for!(duration)

  render json: serialize_incident(record)
end
|
|
102
|
+
|
|
103
|
+
# Resolves the incident. When a resolver and/or a resolution note is supplied
# the resolution metadata is recorded (resolver defaults to "system");
# otherwise the plain resolve! transition is used.
def resolve_incident
  record = SolidEvents::Incident.find(params[:id])
  resolver = params[:resolved_by].presence
  note = params[:resolution_note].presence

  if resolver.nil? && note.nil?
    record.resolve!
  else
    record.resolve_with!(resolved_by: resolver || "system", resolution_note: note)
  end

  render json: serialize_incident(record)
end
|
|
112
|
+
|
|
113
|
+
# Reopens the incident identified by params[:id] and renders its serialized
# form.
def reopen_incident
  record = SolidEvents::Incident.find(params[:id])
  record.reopen!

  render json: serialize_incident(record)
end
|
|
118
|
+
|
|
119
|
+
# Renders a single trace: its canonical event, the raw summary attributes
# (nil when the trace has no summary) and its record and error links.
def trace
  record = SolidEvents::Trace.includes(:summary, :events, :record_links, :error_links).find(params[:id])

  linked_records = record.record_links.map do |link|
    {record_type: link.record_type, record_id: link.record_id}
  end
  linked_errors = record.error_links.map do |link|
    {solid_error_id: link.solid_error_id}
  end

  render json: {
    trace: record.canonical_event,
    summary: record.summary&.attributes,
    record_links: linked_records,
    error_links: linked_errors
  }
end
|
|
129
|
+
|
|
130
|
+
# Lists traces newest-first (by id) as canonical events, paginated through
# cursor/limit.
#
# Optional filters, all matched against the joined summary:
#   error_fingerprint - exact match
#   entity_type       - case-insensitive substring match
#   entity_id         - exact match after integer coercion
#   feature_key/value - see apply_feature_slice_filter
def traces
  scope = SolidEvents::Trace.order(id: :desc)
  if params[:error_fingerprint].present?
    scope = scope.left_outer_joins(:summary).where(solid_events_summaries: {error_fingerprint: params[:error_fingerprint]})
  end
  if params[:entity_type].present? || params[:entity_id].present?
    scope = scope.left_outer_joins(:summary)
    if params[:entity_type].present?
      # Arel's `matches` emits ILIKE on PostgreSQL and LIKE on other adapters,
      # so the filter no longer depends on the PostgreSQL-only ILIKE operator.
      entity_type_column = SolidEvents::Summary.arel_table[:entity_type]
      scope = scope.where(entity_type_column.matches("%#{params[:entity_type]}%"))
    end
    scope = scope.where(solid_events_summaries: {entity_id: params[:entity_id].to_i}) if params[:entity_id].present?
  end
  scope = apply_feature_slice_filter(scope)

  scope = apply_cursor(scope)
  traces = scope.includes(:summary).limit(limit_param)
  render json: {
    data: traces.map { |trace| trace.canonical_event },
    next_cursor: traces.last&.id
  }
end
|
|
149
|
+
|
|
150
|
+
# Reports total count, error count and error-rate percentage per value of the
# requested dimension, for the busiest limit_param groups in the metric window.
#
# Error counts are fetched with one grouped query rather than one COUNT query
# per group, and both queries share the same window scope so they agree on
# the window start.
def error_rates
  dimension = metric_dimension_param
  window_scope = summary_scope_for_metrics

  totals = window_scope
    .group(dimension)
    .order(Arel.sql("COUNT(*) DESC"))
    .limit(limit_param)
    .count

  error_counts =
    if totals.empty?
      {}
    else
      window_scope.where(status: "error", dimension => totals.keys).group(dimension).count
    end

  data = totals.map do |value, total|
    # Groups without any error row are absent from the grouped error query.
    error_count = error_counts.fetch(value, 0)
    {
      dimension: dimension,
      value: value,
      total_count: total,
      error_count: error_count,
      error_rate_pct: total.positive? ? ((error_count.to_f / total) * 100.0).round(2) : 0.0
    }
  end

  render json: {window: metric_window_param, dimension: dimension, groups: data}
end
|
|
172
|
+
|
|
173
|
+
# Reports sample count, average and maximum duration (ms) per value of the
# requested dimension, considering only summaries that have a duration and
# keeping the limit_param groups with the most samples.
def latency
  dimension = metric_dimension_param
  aggregates = ["COUNT(*)", "AVG(duration_ms)", "MAX(duration_ms)"].map { |sql| Arel.sql(sql) }

  rows = summary_scope_for_metrics
    .where.not(duration_ms: nil)
    .group(dimension)
    .order(Arel.sql("COUNT(*) DESC"))
    .limit(limit_param)
    .pluck(dimension, *aggregates)

  data = rows.map do |row|
    value, samples, average, maximum = row
    {
      dimension: dimension,
      value: value,
      sample_count: samples.to_i,
      avg_duration_ms: average.to_f.round(2),
      max_duration_ms: maximum.to_f.round(2)
    }
  end

  render json: {window: metric_window_param, dimension: dimension, groups: data}
end
|
|
199
|
+
|
|
200
|
+
# Compares one metric between the current window and the baseline window that
# immediately precedes it, per value of the requested dimension. Rows are
# ordered by current sample count (descending), then by value, and truncated
# to limit_param entries.
def compare_metrics
  dimension = metric_dimension_param
  metric = metric_param
  windows = metric_compare_windows
  base = summary_scope_base_for_metrics

  current_range = windows[:current_start]..windows[:current_end]
  baseline_range = windows[:baseline_start]..windows[:baseline_end]
  current_stats = grouped_metric_stats(scope: base.where(started_at: current_range), dimension: dimension)
  baseline_stats = grouped_metric_stats(scope: base.where(started_at: baseline_range), dimension: dimension)

  # Union of dimension values seen in either window, in first-seen order.
  rows = (current_stats.keys | baseline_stats.keys).map do |value|
    current = current_stats.fetch(value, default_metric_stats)
    baseline = baseline_stats.fetch(value, default_metric_stats)
    current_value = metric_value_for(metric, current)
    baseline_value = metric_value_for(metric, baseline)

    {
      dimension: dimension,
      value: value,
      metric: metric,
      current: current_value,
      baseline: baseline_value,
      delta: (current_value - baseline_value).round(2),
      delta_pct: percent_delta(current_value, baseline_value),
      current_sample_count: current[:total_count],
      baseline_sample_count: baseline[:total_count]
    }
  end
  ranked = rows.sort_by { |row| [-row[:current_sample_count].to_i, row[:value].to_s] }

  render json: {
    dimension: dimension,
    metric: metric,
    current_window: windows[:current_window],
    baseline_window: windows[:baseline_window],
    groups: ranked.first(limit_param)
  }
end
|
|
238
|
+
|
|
239
|
+
# Aggregates an error-rate or average-latency metric per cohort, where the
# cohort of a summary is the value stored under params[:cohort_key] in the
# "context" hash of its payload.
#
# Responds with 422 when cohort_key is missing. params[:cohort_values]
# (comma separated) optionally restricts which cohorts are reported.
#
# At most 10,000 summaries of the metric window are scanned. They are taken
# newest first so the cap keeps the most recent rows instead of an arbitrary,
# database-dependent subset. Average latency only counts rows that actually
# have a duration (matching the latency endpoint, which excludes nil
# durations) instead of treating a missing duration as 0 ms.
def cohort_metrics
  return render json: {error: "cohort_key is required"}, status: :unprocessable_entity if params[:cohort_key].blank?

  metric = metric_param
  cohort_key = params[:cohort_key].to_s
  requested_values = params[:cohort_values].to_s.split(",").map(&:strip).reject(&:blank?)

  rows = summary_scope_for_metrics.order(started_at: :desc).limit(10_000).pluck(:status, :duration_ms, :payload)
  grouped = Hash.new do |hash, key|
    hash[key] = {count: 0, error_count: 0, latency_sum: 0.0, latency_count: 0}
  end

  rows.each do |status, duration_ms, payload|
    context = payload.to_h["context"].to_h
    cohort_value = context[cohort_key]
    next if cohort_value.blank?

    cohort_value = cohort_value.to_s
    next if requested_values.any? && !requested_values.include?(cohort_value)

    stats = grouped[cohort_value]
    stats[:count] += 1
    stats[:error_count] += 1 if status == "error"
    next if duration_ms.nil?

    stats[:latency_sum] += duration_ms.to_f
    stats[:latency_count] += 1
  end

  groups = grouped.map do |cohort_value, stats|
    value = if metric == "latency_avg"
      stats[:latency_count].positive? ? (stats[:latency_sum] / stats[:latency_count]).round(2) : 0.0
    else
      stats[:count].positive? ? ((stats[:error_count].to_f / stats[:count]) * 100.0).round(2) : 0.0
    end

    {
      cohort_key: cohort_key,
      cohort_value: cohort_value,
      metric: metric,
      value: value,
      sample_count: stats[:count],
      error_count: stats[:error_count]
    }
  end.sort_by { |row| [-row[:sample_count], row[:cohort_value]] }

  render json: {
    window: metric_window_param,
    cohort_key: cohort_key,
    metric: metric,
    groups: groups.first(limit_param)
  }
end
|
|
286
|
+
|
|
287
|
+
# Builds journeys on the fly from the newest 2,000 summaries in the metric
# window: summaries are grouped by journey_key_for, each journey reports its
# counts, time span and its latest traces (params[:traces_per_journey],
# clamped to 1..50, default 20). Journeys are returned most recently
# finished first, truncated to limit_param.
def journeys
  relation = summary_scope_for_metrics
  relation = relation.where(request_id: params[:request_id].to_s) if params[:request_id].present?
  relation = relation.where(entity_type: params[:entity_type].to_s) if params[:entity_type].present?
  relation = relation.where(entity_id: params[:entity_id].to_i) if params[:entity_id].present?
  relation = relation.where(status: "error") if errors_only_param?

  per_journey = params[:traces_per_journey].to_i.clamp(1, 50)
  per_journey = 20 if params[:traces_per_journey].blank?

  recent = relation.includes(:trace).order(started_at: :desc).limit(2_000).to_a
  buckets = recent.group_by { |summary| journey_key_for(summary) }
  buckets = buckets.reject { |key, _members| key.blank? }

  journeys = buckets.map do |key, members|
    chronological = members.sort_by(&:started_at)
    latest = chronological.last
    {
      journey_key: key,
      request_id: latest.request_id,
      entity_type: latest.entity_type,
      entity_id: latest.entity_id,
      trace_count: members.size,
      error_count: members.count { |summary| summary.status == "error" },
      started_at: chronological.first.started_at,
      # Fall back to the start time when the latest trace has not finished.
      finished_at: latest.finished_at || latest.started_at,
      traces: chronological.last(per_journey).map { |summary| summary.trace.canonical_event }
    }
  end

  newest_first = journeys.sort_by { |journey| journey[:finished_at] || Time.at(0) }.reverse
  render json: {window: metric_window_param, errors_only: errors_only_param?, journeys: newest_first.first(limit_param)}
end
|
|
322
|
+
|
|
323
|
+
# Lists stored journey rows newest-first (by id), optionally filtered by
# request_id, entity_type and entity_id, paginated through cursor/limit.
# Renders an empty page when the journeys table is not available.
def materialized_journeys
  unless journey_table_available?
    return render json: {data: [], next_cursor: nil}
  end

  relation = SolidEvents::Journey.order(id: :desc)
  relation = relation.where(request_id: params[:request_id].to_s) if params[:request_id].present?
  relation = relation.where(entity_type: params[:entity_type].to_s) if params[:entity_type].present?
  relation = relation.where(entity_id: params[:entity_id].to_i) if params[:entity_id].present?
  page = apply_cursor(relation).limit(limit_param)

  render json: {
    data: page.map { |journey| serialize_materialized_journey(journey) },
    next_cursor: page.last&.id
  }
end
|
|
342
|
+
|
|
343
|
+
# Lists causal edges newest-first (by id), paginated through cursor/limit.
# params[:trace_id] keeps edges that start or end at that trace and
# params[:edge_type] keeps a single edge type. Renders an empty page when
# the causal edges table is not available.
def causal_edges
  unless causal_edges_table_available?
    return render json: {data: [], next_cursor: nil}
  end

  relation = SolidEvents::CausalEdge.order(id: :desc)
  if params[:trace_id].present?
    endpoint_id = params[:trace_id].to_i
    relation = relation.where("from_trace_id = ? OR to_trace_id = ?", endpoint_id, endpoint_id)
  end
  relation = relation.where(edge_type: params[:edge_type].to_s) if params[:edge_type].present?
  page = apply_cursor(relation).limit(limit_param)

  render json: {
    data: page.map { |edge| serialize_causal_edge(edge) },
    next_cursor: page.last&.id
  }
end
|
|
360
|
+
|
|
361
|
+
# Exports traces (newest first, cursor/limit paginated) as canonical events
# together with the applied filters. Responds with 422 unless a JSON export
# was requested.
#
# Filters: status ("ok"/"error" only), window (trace start time),
# error_fingerprint, entity_type (case-insensitive substring), entity_id and
# the feature slice params.
def export_traces
  return render json: {error: "only json export is supported"}, status: :unprocessable_entity unless export_json?

  scope = SolidEvents::Trace.order(id: :desc)
  scope = scope.where(status: params[:status]) if params[:status].in?(%w[ok error])
  # Hash condition instead of a raw "started_at >= ?" fragment: summaries have
  # a started_at column too, so the unqualified name became ambiguous as soon
  # as one of the summary filters below joined that table.
  scope = scope.where(started_at: (window_start_for_metrics..)) if params[:window].present?
  scope = scope.left_outer_joins(:summary).where(solid_events_summaries: {error_fingerprint: params[:error_fingerprint]}) if params[:error_fingerprint].present?
  if params[:entity_type].present? || params[:entity_id].present?
    scope = scope.left_outer_joins(:summary)
    if params[:entity_type].present?
      # Arel's `matches` emits ILIKE on PostgreSQL and LIKE on other adapters,
      # so the filter no longer depends on the PostgreSQL-only ILIKE operator.
      entity_type_column = SolidEvents::Summary.arel_table[:entity_type]
      scope = scope.where(entity_type_column.matches("%#{params[:entity_type]}%"))
    end
    scope = scope.where(solid_events_summaries: {entity_id: params[:entity_id].to_i}) if params[:entity_id].present?
  end
  scope = apply_feature_slice_filter(scope)
  scope = apply_cursor(scope)

  traces = scope.includes(:summary).limit(limit_param)
  render json: {
    exported_at: Time.current.iso8601,
    format: "json",
    filters: export_filters_payload,
    data: traces.map(&:canonical_event)
  }
end
|
|
384
|
+
|
|
385
|
+
# Exports incidents (newest first, cursor/limit paginated) together with the
# applied filters. Responds with 422 unless a JSON export was requested.
# Filters: status, kind, severity and window (on detected_at).
def export_incidents
  unless export_json?
    return render json: {error: "only json export is supported"}, status: :unprocessable_entity
  end

  relation = SolidEvents::Incident.order(id: :desc)
  %i[status kind severity].each do |column|
    relation = relation.where(column => params[column]) if params[column].present?
  end
  relation = relation.where("detected_at >= ?", window_start_for_metrics) if params[:window].present?
  page = apply_cursor(relation).limit(limit_param)

  render json: {
    exported_at: Time.current.iso8601,
    format: "json",
    filters: export_filters_payload,
    data: page.map { |incident| serialize_incident(incident) }
  }
end
|
|
403
|
+
|
|
404
|
+
private
|
|
405
|
+
|
|
406
|
+
# Builds the JSON-ready hash for an incident: one symbol key per exposed
# attribute, in a fixed order, each read from the reader of the same name.
def serialize_incident(incident)
  exposed = %i[
    id kind severity status
    owner team assigned_by assignment_note assigned_at
    source name fingerprint payload
    detected_at last_seen_at acknowledged_at
    resolved_at resolved_by resolution_note
    muted_until
  ]
  exposed.to_h { |attribute| [attribute, incident.public_send(attribute)] }
end
|
|
430
|
+
|
|
431
|
+
# Builds the JSON-ready hash for an incident lifecycle event, one symbol key
# per exposed attribute.
def serialize_incident_event(event)
  %i[id incident_id action actor payload occurred_at].to_h do |attribute|
    [attribute, event.public_send(attribute)]
  end
end
|
|
441
|
+
|
|
442
|
+
# Builds the JSON-ready hash for a stored journey row, one symbol key per
# exposed attribute.
def serialize_materialized_journey(journey)
  exposed = %i[
    id journey_key request_id entity_type entity_id
    last_trace_id trace_count error_count
    started_at finished_at payload
  ]
  exposed.to_h { |attribute| [attribute, journey.public_send(attribute)] }
end
|
|
457
|
+
|
|
458
|
+
# Builds the JSON-ready hash for a causal edge, one symbol key per exposed
# attribute.
def serialize_causal_edge(edge)
  exposed = %i[id from_trace_id from_event_id to_trace_id to_event_id edge_type occurred_at payload]
  exposed.to_h { |attribute| [attribute, edge.public_send(attribute)] }
end
|
|
470
|
+
|
|
471
|
+
# Assembles the context document for an incident: the serialized incident,
# the evidence gathered from its related traces (limited by limit_param)
# including any linked solid errors, the trace query stored in the incident
# payload as suggested filters, and follow-up links.
def context_payload_for(incident)
  related = incident_related_traces(incident).includes(:error_links).limit(limit_param)
  error_ids = related.flat_map { |trace| trace.error_links.map(&:solid_error_id) }.compact.uniq
  filters = incident.payload.to_h["trace_query"].to_h

  incident_json = serialize_incident(incident)
  evidence = {
    trace_count: related.size,
    error_ids: error_ids,
    traces: related.map(&:canonical_event),
    solid_errors: fetch_solid_errors(error_ids)
  }
  links = {
    traces_ui: traces_path(filters),
    incident_traces_api: api_incident_traces_path(incident),
    # NOTE(review): these lifecycle paths are literal strings, unlike the two
    # route-helper links above - confirm they match the engine's mount point.
    incident_lifecycle: {
      acknowledge: "/solid_events/api/incidents/#{incident.id}/acknowledge",
      resolve: "/solid_events/api/incidents/#{incident.id}/resolve",
      reopen: "/solid_events/api/incidents/#{incident.id}/reopen"
    }
  }

  {incident: incident_json, evidence: evidence, suggested_filters: filters, links: links}
end
|
|
496
|
+
|
|
497
|
+
# Returns the relation of traces related to an incident. Explicit trace ids
# in the incident payload ("trace_ids") win; otherwise the stored
# "trace_query" is applied (error_fingerprint, name, source, entity_type,
# entity_id). With neither present the unfiltered recent scope is returned.
def incident_related_traces(incident)
  base = SolidEvents::Trace.recent.includes(:summary)
  payload = incident.payload.to_h
  filters = payload["trace_query"].to_h
  pinned_ids = Array(payload["trace_ids"]).map(&:to_i).uniq
  return base.where(id: pinned_ids) unless pinned_ids.empty?

  fingerprint = filters["error_fingerprint"]
  if fingerprint.present?
    base = base.left_outer_joins(:summary).where(solid_events_summaries: {error_fingerprint: fingerprint})
  end

  # name and source are matched on the trace itself.
  %w[name source].each do |column|
    base = base.where(column => filters[column]) if filters[column].present?
  end

  entity_type = filters["entity_type"]
  entity_id = filters["entity_id"]
  if entity_type.present? || entity_id.present?
    base = base.left_outer_joins(:summary)
    base = base.where(solid_events_summaries: {entity_type: entity_type}) if entity_type.present?
    base = base.where(solid_events_summaries: {entity_id: entity_id.to_i}) if entity_id.present?
  end

  base
end
|
|
523
|
+
|
|
524
|
+
# True when the incidents table exists on the model's connection; any error
# raised while checking is treated as "not available".
def incident_table_available?
  model = SolidEvents::Incident
  model.connection.data_source_exists?(model.table_name)
rescue StandardError
  false
end
|
|
529
|
+
|
|
530
|
+
# True when the journeys table exists on the model's connection; any error
# raised while checking is treated as "not available".
def journey_table_available?
  model = SolidEvents::Journey
  model.connection.data_source_exists?(model.table_name)
rescue StandardError
  false
end
|
|
535
|
+
|
|
536
|
+
# True when the causal edges table exists on the model's connection; any
# error raised while checking is treated as "not available".
def causal_edges_table_available?
  model = SolidEvents::CausalEdge
  model.connection.data_source_exists?(model.table_name)
rescue StandardError
  false
end
|
|
541
|
+
|
|
542
|
+
# Page size for list endpoints: params[:limit] coerced to an integer and
# clamped to 1..200. Defaults to 50 when the param is blank or cannot be
# coerced.
def limit_param
  requested = params[:limit]
  return 50 if requested.blank?

  requested.to_i.clamp(1, 200)
rescue StandardError
  50
end
|
|
549
|
+
|
|
550
|
+
# Applies keyset pagination: keeps only rows whose id is lower than
# params[:cursor]. A missing, non-numeric or non-positive cursor leaves the
# scope untouched.
#
# The condition is expressed as a hash with a beginless range rather than a
# raw SQL fragment so Active Record qualifies the column with the scope's own
# table. The raw fragment was ambiguous on scopes that had joined
# solid_events_summaries (both tables have an id column).
#
# @param scope [ActiveRecord::Relation] relation to paginate
# @return [ActiveRecord::Relation] the same or a narrowed relation
def apply_cursor(scope)
  cursor = params[:cursor].to_i
  return scope if cursor <= 0

  scope.where(id: (...cursor))
end
|
|
556
|
+
|
|
557
|
+
# before_action guard for the API. When no API token is configured (blank)
# every request is allowed. Otherwise the request must present the token in
# the X-Solid-Events-Token header or as an "Authorization: Bearer" value;
# anything else, including an error raised while checking, renders a 401.
def authenticate_api!
  expected = SolidEvents.api_token.to_s
  return if expected.blank?

  header_token = request.headers["X-Solid-Events-Token"].to_s
  authorization = request.headers["Authorization"].to_s
  bearer_token = ""
  bearer_token = authorization.delete_prefix("Bearer ").strip if authorization.start_with?("Bearer ")

  # secure_compare is used for both candidates.
  accepted = [header_token, bearer_token].any? do |candidate|
    ActiveSupport::SecurityUtils.secure_compare(candidate, expected)
  end
  return if accepted

  render json: {error: "unauthorized"}, status: :unauthorized
rescue StandardError
  render json: {error: "unauthorized"}, status: :unauthorized
end
|
|
571
|
+
|
|
572
|
+
# Looks up the SolidErrors::Error rows for the given ids and returns a compact
# hash per error (message truncated to 200 characters). Returns [] when there
# are no ids, when SolidErrors::Error is not defined, or when the lookup
# raises.
def fetch_solid_errors(error_ids)
  return [] if error_ids.empty?
  return [] unless defined?(SolidErrors::Error)

  SolidErrors::Error.where(id: error_ids).map do |error|
    # The occurrence count is only reported when the model exposes occurrences.
    occurrences = error.respond_to?(:occurrences) ? error.occurrences.count : nil

    {
      id: error.id,
      exception_class: error.try(:exception_class),
      message: error.try(:message).to_s.first(200),
      fingerprint: error.try(:fingerprint),
      occurrences_count: occurrences
    }
  end
rescue StandardError
  []
end
|
|
594
|
+
|
|
595
|
+
# Normalises params[:window] to one of "1h", "7d" or "30d"; every other
# value (including a missing param) yields "24h".
def metric_window_param
  requested = params[:window].to_s
  %w[1h 7d 30d].find { |allowed| allowed == requested } || "24h"
end
|
|
603
|
+
|
|
604
|
+
# Returns params[:dimension] when it names one of the supported grouping
# columns, otherwise "source".
def metric_dimension_param
  candidate = params[:dimension].to_s
  return candidate if %w[source name deployment_id service_version environment_name entity_type].include?(candidate)

  "source"
end
|
|
609
|
+
|
|
610
|
+
# Base metric scope restricted to summaries that started inside the requested
# metric window.
def summary_scope_for_metrics
  base = summary_scope_base_for_metrics
  base.where("started_at >= ?", window_start_for_metrics)
end
|
|
613
|
+
|
|
614
|
+
# Summary scope shared by the metric endpoints: all summaries, narrowed by
# the environment_name and service_name params when present and by the
# feature slice params.
def summary_scope_base_for_metrics
  filtered = %i[environment_name service_name].reduce(SolidEvents::Summary.all) do |relation, column|
    params[column].present? ? relation.where(column => params[column]) : relation
  end

  apply_feature_slice_filter_to_summaries(filtered)
end
|
|
621
|
+
|
|
622
|
+
# Narrows a trace scope to traces whose summary payload contains the given
# feature key/value pair. Does nothing unless both params are present and the
# key is one of SolidEvents.feature_slice_keys.
#
# The match is textual: the payload is cast to text and searched for the
# exact fragment "key":"value".
# NOTE(review): this relies on the payload's text form having no whitespace
# around the colon - confirm for the column type in use.
def apply_feature_slice_filter(scope)
  slice_key = params[:feature_key].to_s
  slice_value = params[:feature_value].to_s
  applicable = !slice_key.blank? && !slice_value.blank? && SolidEvents.feature_slice_keys.include?(slice_key)
  return scope unless applicable

  fragment = "%\"#{slice_key}\":\"#{slice_value}\"%"
  scope.left_outer_joins(:summary).where("CAST(solid_events_summaries.payload AS TEXT) LIKE ?", fragment)
end
|
|
631
|
+
|
|
632
|
+
def apply_feature_slice_filter_to_summaries(scope)
|
|
633
|
+
feature_key = params[:feature_key].to_s
|
|
634
|
+
feature_value = params[:feature_value].to_s
|
|
635
|
+
return scope if feature_key.blank? || feature_value.blank?
|
|
636
|
+
return scope unless SolidEvents.feature_slice_keys.include?(feature_key)
|
|
637
|
+
|
|
638
|
+
scope.where("CAST(solid_events_summaries.payload AS TEXT) LIKE ?", "%\"#{feature_key}\":\"#{feature_value}\"%")
|
|
639
|
+
end
|
|
640
|
+
|
|
641
|
+
# Point in time at which the requested metrics window begins.
#
# Reuses duration_for_window instead of repeating the window-to-duration
# table, so both helpers always agree on what each window means.
#
# @return the result of calling #ago on the window's duration
def window_start_for_metrics
  duration_for_window(metric_window_param).ago
end
|
|
653
|
+
|
|
654
|
+
# Resolves params[:metric] to a supported metric name.
# Only "error_rate" and "latency_avg" are accepted; anything else falls back
# to "error_rate".
#
# @return [String]
def metric_param
  case (chosen = params[:metric].to_s)
  when "error_rate", "latency_avg" then chosen
  else "error_rate"
  end
end
|
|
659
|
+
|
|
660
|
+
# Computes the two adjacent time ranges used when comparing metrics.
#
# The current range ends at Time.current and spans params[:window]. The
# baseline range ends where the current range starts and spans
# params[:baseline_window], defaulting to the current window when that param
# is not given.
#
# @return [Hash] :current_window, :baseline_window, :current_start,
#   :current_end, :baseline_start, :baseline_end
def metric_compare_windows
  current = normalized_window(params[:window])
  baseline = normalized_window(params[:baseline_window].presence || current)
  current_span = duration_for_window(current)
  baseline_span = duration_for_window(baseline)

  now = Time.current
  current_from = now - current_span

  {
    current_window: current,
    baseline_window: baseline,
    current_start: current_from,
    current_end: now,
    baseline_start: current_from - baseline_span,
    baseline_end: current_from
  }
end
|
|
679
|
+
|
|
680
|
+
# Maps an arbitrary value onto one of the supported window names.
#
# @param value [#to_s] requested window
# @return [String] "1h", "7d" or "30d" when requested exactly, otherwise "24h"
def normalized_window(value)
  case value.to_s
  when "1h" then "1h"
  when "7d" then "7d"
  when "30d" then "30d"
  else "24h"
  end
end
|
|
688
|
+
|
|
689
|
+
# Translates a window name into its duration.
#
# @param window [String] window name such as "1h", "7d" or "30d"
# @return the matching duration; 24.hours for any other value
def duration_for_window(window)
  spans = {"1h" => 1.hour, "7d" => 7.days, "30d" => 30.days}
  spans.fetch(window) { 24.hours }
end
|
|
697
|
+
|
|
698
|
+
# Aggregates +scope+ per distinct value of +dimension+.
#
# @param scope [#group] relation to aggregate
# @param dimension [String] column to group by
# @return [Hash] dimension value => {total_count:, error_count:, avg_duration_ms:}
def grouped_metric_stats(scope:, dimension:)
  rows = scope.group(dimension).pluck(
    dimension,
    Arel.sql("COUNT(*)"),
    Arel.sql("SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END)"),
    Arel.sql("AVG(duration_ms)")
  )

  rows.to_h do |value, total_count, error_count, avg_duration_ms|
    stats = {
      total_count: total_count.to_i,
      error_count: error_count.to_i,
      # to_f turns a nil average into 0.0
      avg_duration_ms: avg_duration_ms.to_f.round(2)
    }
    [value, stats]
  end
end
|
|
714
|
+
|
|
715
|
+
# Derives the journey grouping key for a summary.
#
# The request id takes precedence; otherwise the entity type/id pair is used
# when both are present.
#
# @param summary [#request_id, #entity_type, #entity_id]
# @return [String, nil] "request:<id>", "entity:<type>:<id>", or nil
def journey_key_for(summary)
  if summary.request_id.present?
    "request:#{summary.request_id}"
  elsif summary.entity_type.present? && summary.entity_id.present?
    "entity:#{summary.entity_type}:#{summary.entity_id}"
  end
end
|
|
721
|
+
|
|
722
|
+
# Casts params[:errors_only] with ActiveModel's boolean type and returns the
# cast result.
def errors_only_param?
  boolean_type = ActiveModel::Type::Boolean.new
  boolean_type.cast(params[:errors_only])
end
|
|
725
|
+
|
|
726
|
+
# True when params[:format] is "json" or blank.
def export_json?
  format_name = params[:format].to_s
  format_name == "json" || format_name.blank?
end
|
|
730
|
+
|
|
731
|
+
# Subset of the request params that is echoed back as the export's filters.
# Only the keys listed below are kept.
def export_filters_payload
  exported_keys = %w[
    status kind severity error_fingerprint entity_type entity_id
    feature_key feature_value window request_id errors_only limit cursor format
  ]

  params.to_unsafe_h.slice(*exported_keys)
end
|
|
737
|
+
|
|
738
|
+
# Summarises the durations of +summaries+, ignoring nil durations.
#
# @param summaries [Enumerable<#duration_ms>]
# @return [Hash] :avg, :max and :p95 rounded to two decimals, plus
#   :sample_count; all zero when no duration is available
def duration_slice_for(summaries)
  samples = summaries.map(&:duration_ms).compact
  if samples.empty?
    return {avg: 0.0, max: 0.0, p95: 0.0, sample_count: 0}
  end

  ordered = samples.sort
  sample_count = ordered.length
  # Position of the 95th percentile in the sorted samples, rounded up.
  p95_position = (0.95 * (sample_count - 1)).ceil
  mean = samples.sum.to_f / samples.length

  {
    avg: mean.round(2),
    max: ordered.last.to_f.round(2),
    p95: ordered[p95_position].to_f.round(2),
    sample_count: sample_count
  }
end
|
|
751
|
+
|
|
752
|
+
# Percentage of +summaries+ whose status is "error", rounded to two decimals.
#
# @param summaries [#empty?, #count, #size]
# @return [Float] 0.0 for an empty collection
def error_rate_for(summaries)
  if summaries.empty?
    0.0
  else
    failed = summaries.count { |entry| entry.status == "error" }
    (failed.to_f / summaries.size * 100.0).round(2)
  end
end
|
|
758
|
+
|
|
759
|
+
# Zeroed stats hash with the same keys grouped_metric_stats produces per
# dimension value. A new hash is built on every call.
def default_metric_stats
  {
    total_count: 0,
    error_count: 0,
    avg_duration_ms: 0.0
  }
end
|
|
762
|
+
|
|
763
|
+
# Extracts the value of +metric+ from a stats hash.
#
# @param metric [String] "latency_avg" selects the average duration; any
#   other value selects the error rate
# @param stats [Hash] with :total_count, :error_count and :avg_duration_ms
# @return [Float] average duration, or error percentage (0.0 when
#   total_count is zero), rounded to two decimals
def metric_value_for(metric, stats)
  return stats[:avg_duration_ms].to_f.round(2) if metric == "latency_avg"

  total = stats[:total_count].to_i
  return 0.0 if total.zero?

  (stats[:error_count].to_f / total * 100.0).round(2)
end
|
|
773
|
+
|
|
774
|
+
# Relative change from +baseline_value+ to +current_value+, in percent.
#
# @param current_value [#to_f]
# @param baseline_value [#to_f]
# @return [Float, nil] change rounded to two decimals; nil when the baseline
#   converts to zero, since the ratio is undefined
def percent_delta(current_value, baseline_value)
  denominator = baseline_value.to_f
  unless denominator.zero?
    change = current_value.to_f - denominator
    (change / denominator * 100.0).round(2)
  end
end
|
|
780
|
+
|
|
781
|
+
end
|
|
782
|
+
end
|