decision_agent 0.1.2 → 0.1.3
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +212 -35
- data/bin/decision_agent +3 -8
- data/lib/decision_agent/agent.rb +19 -26
- data/lib/decision_agent/audit/null_adapter.rb +1 -2
- data/lib/decision_agent/decision.rb +3 -1
- data/lib/decision_agent/dsl/condition_evaluator.rb +4 -3
- data/lib/decision_agent/dsl/rule_parser.rb +4 -6
- data/lib/decision_agent/dsl/schema_validator.rb +27 -31
- data/lib/decision_agent/errors.rb +11 -8
- data/lib/decision_agent/evaluation.rb +3 -1
- data/lib/decision_agent/evaluation_validator.rb +78 -0
- data/lib/decision_agent/evaluators/json_rule_evaluator.rb +26 -0
- data/lib/decision_agent/evaluators/static_evaluator.rb +2 -6
- data/lib/decision_agent/monitoring/alert_manager.rb +282 -0
- data/lib/decision_agent/monitoring/dashboard/public/dashboard.css +381 -0
- data/lib/decision_agent/monitoring/dashboard/public/dashboard.js +471 -0
- data/lib/decision_agent/monitoring/dashboard/public/index.html +161 -0
- data/lib/decision_agent/monitoring/dashboard_server.rb +340 -0
- data/lib/decision_agent/monitoring/metrics_collector.rb +278 -0
- data/lib/decision_agent/monitoring/monitored_agent.rb +71 -0
- data/lib/decision_agent/monitoring/prometheus_exporter.rb +247 -0
- data/lib/decision_agent/replay/replay.rb +12 -22
- data/lib/decision_agent/scoring/base.rb +1 -1
- data/lib/decision_agent/scoring/consensus.rb +5 -5
- data/lib/decision_agent/scoring/weighted_average.rb +1 -1
- data/lib/decision_agent/version.rb +1 -1
- data/lib/decision_agent/versioning/activerecord_adapter.rb +69 -33
- data/lib/decision_agent/versioning/adapter.rb +1 -3
- data/lib/decision_agent/versioning/file_storage_adapter.rb +143 -35
- data/lib/decision_agent/versioning/version_manager.rb +4 -12
- data/lib/decision_agent/web/public/index.html +1 -1
- data/lib/decision_agent/web/server.rb +19 -24
- data/lib/decision_agent.rb +7 -0
- data/lib/generators/decision_agent/install/install_generator.rb +5 -5
- data/lib/generators/decision_agent/install/templates/migration.rb +17 -6
- data/lib/generators/decision_agent/install/templates/rule.rb +3 -3
- data/lib/generators/decision_agent/install/templates/rule_version.rb +13 -7
- data/spec/activerecord_thread_safety_spec.rb +553 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/api_contract_spec.rb +16 -16
- data/spec/audit_adapters_spec.rb +3 -3
- data/spec/comprehensive_edge_cases_spec.rb +86 -86
- data/spec/dsl_validation_spec.rb +83 -83
- data/spec/edge_cases_spec.rb +23 -23
- data/spec/examples/feedback_aware_evaluator_spec.rb +7 -7
- data/spec/examples.txt +548 -0
- data/spec/issue_verification_spec.rb +685 -0
- data/spec/json_rule_evaluator_spec.rb +15 -15
- data/spec/monitoring/alert_manager_spec.rb +378 -0
- data/spec/monitoring/metrics_collector_spec.rb +281 -0
- data/spec/monitoring/monitored_agent_spec.rb +222 -0
- data/spec/monitoring/prometheus_exporter_spec.rb +242 -0
- data/spec/replay_edge_cases_spec.rb +58 -58
- data/spec/replay_spec.rb +11 -11
- data/spec/rfc8785_canonicalization_spec.rb +215 -0
- data/spec/scoring_spec.rb +1 -1
- data/spec/spec_helper.rb +9 -0
- data/spec/thread_safety_spec.rb +482 -0
- data/spec/thread_safety_spec.rb.broken +878 -0
- data/spec/versioning_spec.rb +141 -37
- data/spec/web_ui_rack_spec.rb +135 -0
- metadata +69 -6
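
The headline addition in 0.1.3 is the `DecisionAgent::Monitoring` subsystem (metrics collector, Prometheus exporter, alert manager, and a Sinatra dashboard), whose main sources appear in the hunks below. As rough orientation, here is a minimal wiring sketch based only on the signatures visible in this diff; the `PrometheusExporter` and `AlertManager` constructors are not shown in this excerpt, so those `new` calls are assumptions to be checked against the gem's README:

```ruby
require "decision_agent"

# Shown below: MetricsCollector.new(window_size:) and
# DashboardServer.start!(metrics_collector:, prometheus_exporter:, alert_manager:, port:, host:).
collector = DecisionAgent::Monitoring::MetricsCollector.new(window_size: 3600)

# Assumed constructors -- not part of this excerpt.
exporter = DecisionAgent::Monitoring::PrometheusExporter.new(metrics_collector: collector)
alerts   = DecisionAgent::Monitoring::AlertManager.new(metrics_collector: collector)

# Serves the dashboard UI, /metrics for Prometheus, and the JSON alert/KPI APIs (default port 4568).
DecisionAgent::Monitoring::DashboardServer.start!(
  metrics_collector: collector,
  prometheus_exporter: exporter,
  alert_manager: alerts
)
```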

data/lib/decision_agent/monitoring/dashboard_server.rb (new file)
@@ -0,0 +1,340 @@
+require "sinatra/base"
+require "json"
+
+# Faye/WebSocket is optional for real-time features
+begin
+  require "faye/websocket"
+  WEBSOCKET_AVAILABLE = true
+rescue LoadError
+  WEBSOCKET_AVAILABLE = false
+  warn "Warning: faye-websocket gem not found. Real-time dashboard features will be disabled."
+  warn "Install with: gem install faye-websocket"
+end
+
+module DecisionAgent
+  module Monitoring
+    # Real-time monitoring dashboard server
+    class DashboardServer < Sinatra::Base
+      set :public_folder, File.expand_path("dashboard/public", __dir__)
+      set :views, File.expand_path("dashboard/views", __dir__)
+      set :bind, "0.0.0.0"
+      set :port, 4568
+      set :server, :puma
+
+      # Enable CORS
+      before do
+        headers["Access-Control-Allow-Origin"] = "*"
+        headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
+        headers["Access-Control-Allow-Headers"] = "Content-Type"
+      end
+
+      options "*" do
+        200
+      end
+
+      # Class-level configuration
+      class << self
+        attr_accessor :metrics_collector, :prometheus_exporter, :alert_manager
+        attr_reader :websocket_clients
+
+        def configure_monitoring(metrics_collector:, prometheus_exporter:, alert_manager:)
+          @metrics_collector = metrics_collector
+          @prometheus_exporter = prometheus_exporter
+          @alert_manager = alert_manager
+          @websocket_clients = []
+
+          setup_real_time_updates
+        end
+
+        def setup_real_time_updates
+          # Register observer for real-time metric updates
+          @metrics_collector.add_observer do |event_type, metric|
+            broadcast_to_clients({
+              type: "metric_update",
+              event: event_type,
+              data: metric,
+              timestamp: Time.now.utc.iso8601
+            })
+          end
+
+          # Register alert handler
+          @alert_manager.add_handler do |alert|
+            broadcast_to_clients({
+              type: "alert",
+              data: alert,
+              timestamp: Time.now.utc.iso8601
+            })
+          end
+        end
+
+        def broadcast_to_clients(message)
+          return unless WEBSOCKET_AVAILABLE
+
+          json_message = message.to_json
+          @websocket_clients.each do |client|
+            client.send(json_message) if client.ready_state == Faye::WebSocket::API::OPEN
+          rescue StandardError => e
+            warn "WebSocket send failed: #{e.message}"
+          end
+        end
+
+        def add_websocket_client(ws)
+          @websocket_clients << ws
+        end
+
+        def remove_websocket_client(ws)
+          @websocket_clients.delete(ws)
+        end
+      end
+
+      # Main dashboard page
+      get "/" do
+        send_file File.join(settings.public_folder, "index.html")
+      end
+
+      # WebSocket endpoint for real-time updates
+      get "/ws" do
+        halt 503, { error: "WebSocket support not available. Install faye-websocket gem." }.to_json unless WEBSOCKET_AVAILABLE
+
+        if Faye::WebSocket.websocket?(request.env)
+          ws = Faye::WebSocket.new(request.env)
+
+          ws.on :open do |_event|
+            self.class.add_websocket_client(ws)
+
+            # Send initial state
+            ws.send({
+              type: "connected",
+              message: "Connected to DecisionAgent monitoring",
+              timestamp: Time.now.utc.iso8601
+            }.to_json)
+          end
+
+          ws.on :message do |event|
+            # Handle client messages
+            handle_websocket_message(ws, event.data)
+          end
+
+          ws.on :close do |_event|
+            self.class.remove_websocket_client(ws)
+          end
+
+          ws.rack_response
+        else
+          status 426
+          { error: "WebSocket connection required" }.to_json
+        end
+      end
+
+      # API: Get current statistics
+      get "/api/stats" do
+        content_type :json
+
+        time_range = params[:time_range]&.to_i
+        stats = self.class.metrics_collector.statistics(time_range: time_range)
+
+        stats.to_json
+      end
+
+      # API: Get time series data
+      get "/api/timeseries/:metric_type" do
+        content_type :json
+
+        metric_type = params[:metric_type].to_sym
+        bucket_size = (params[:bucket_size] || 60).to_i
+        time_range = (params[:time_range] || 3600).to_i
+
+        data = self.class.metrics_collector.time_series(
+          metric_type: metric_type,
+          bucket_size: bucket_size,
+          time_range: time_range
+        )
+
+        data.to_json
+      end
+
+      # API: Prometheus metrics endpoint
+      get "/metrics" do
+        content_type PrometheusExporter::CONTENT_TYPE
+        self.class.prometheus_exporter.export
+      end
+
+      # API: Get Prometheus metrics in JSON format
+      get "/api/metrics" do
+        content_type :json
+        self.class.prometheus_exporter.metrics_hash.to_json
+      end
+
+      # API: Register custom KPI
+      post "/api/kpi" do
+        content_type :json
+
+        begin
+          data = JSON.parse(request.body.read, symbolize_names: true)
+
+          self.class.prometheus_exporter.register_kpi(
+            name: data[:name],
+            value: data[:value],
+            labels: data[:labels] || {},
+            help: data[:help]
+          )
+
+          { success: true, message: "KPI registered" }.to_json
+        rescue StandardError => e
+          status 400
+          { error: e.message }.to_json
+        end
+      end
+
+      # API: Get active alerts
+      get "/api/alerts" do
+        content_type :json
+        self.class.alert_manager.active_alerts.to_json
+      end
+
+      # API: Get all alerts
+      get "/api/alerts/all" do
+        content_type :json
+        limit = (params[:limit] || 100).to_i
+        self.class.alert_manager.all_alerts(limit: limit).to_json
+      end
+
+      # API: Create alert rule
+      post "/api/alerts/rules" do
+        content_type :json
+
+        begin
+          data = JSON.parse(request.body.read, symbolize_names: true)
+
+          # Parse condition
+          condition = parse_alert_condition(data[:condition], data[:condition_type])
+
+          rule = self.class.alert_manager.add_rule(
+            name: data[:name],
+            condition: condition,
+            severity: (data[:severity] || :warning).to_sym,
+            threshold: data[:threshold],
+            message: data[:message],
+            cooldown: data[:cooldown] || 300
+          )
+
+          status 201
+          rule.to_json
+        rescue StandardError => e
+          status 400
+          { error: e.message }.to_json
+        end
+      end
+
+      # API: Toggle alert rule
+      put "/api/alerts/rules/:rule_id/toggle" do
+        content_type :json
+
+        begin
+          data = JSON.parse(request.body.read, symbolize_names: true)
+          enabled = data[:enabled] || false
+
+          self.class.alert_manager.toggle_rule(params[:rule_id], enabled)
+
+          { success: true, message: "Rule #{enabled ? 'enabled' : 'disabled'}" }.to_json
+        rescue StandardError => e
+          status 400
+          { error: e.message }.to_json
+        end
+      end
+
+      # API: Acknowledge alert
+      post "/api/alerts/:alert_id/acknowledge" do
+        content_type :json
+
+        begin
+          data = JSON.parse(request.body.read, symbolize_names: true)
+          acknowledged_by = data[:acknowledged_by] || "user"
+
+          self.class.alert_manager.acknowledge_alert(params[:alert_id], acknowledged_by: acknowledged_by)
+
+          { success: true, message: "Alert acknowledged" }.to_json
+        rescue StandardError => e
+          status 400
+          { error: e.message }.to_json
+        end
+      end
+
+      # API: Resolve alert
+      post "/api/alerts/:alert_id/resolve" do
+        content_type :json
+
+        begin
+          data = JSON.parse(request.body.read, symbolize_names: true)
+          resolved_by = data[:resolved_by] || "user"
+
+          self.class.alert_manager.resolve_alert(params[:alert_id], resolved_by: resolved_by)
+
+          { success: true, message: "Alert resolved" }.to_json
+        rescue StandardError => e
+          status 400
+          { error: e.message }.to_json
+        end
+      end
+
+      # Health check
+      get "/health" do
+        content_type :json
+        {
+          status: "ok",
+          version: DecisionAgent::VERSION,
+          websocket_clients: self.class.websocket_clients.size,
+          metrics_count: self.class.metrics_collector.metrics_count
+        }.to_json
+      end
+
+      # Class method to start the server
+      def self.start!(metrics_collector:, prometheus_exporter:, alert_manager:, port: 4568, host: "0.0.0.0")
+        configure_monitoring(
+          metrics_collector: metrics_collector,
+          prometheus_exporter: prometheus_exporter,
+          alert_manager: alert_manager
+        )
+
+        set :port, port
+        set :bind, host
+        run!
+      end
+
+      private
+
+      def handle_websocket_message(ws, data)
+        message = JSON.parse(data, symbolize_names: true)
+
+        case message[:action]
+        when "subscribe"
+          # Send current stats
+          stats = self.class.metrics_collector.statistics
+          ws.send({ type: "stats", data: stats }.to_json)
+        when "get_alerts"
+          alerts = self.class.alert_manager.active_alerts
+          ws.send({ type: "alerts", data: alerts }.to_json)
+        end
+      rescue StandardError => e
+        ws.send({ type: "error", message: e.message }.to_json)
+      end
+
+      def parse_alert_condition(condition_data, condition_type)
+        case condition_type
+        when "high_error_rate"
+          AlertManager.high_error_rate(threshold: condition_data[:threshold] || 0.1)
+        when "low_confidence"
+          AlertManager.low_confidence(threshold: condition_data[:threshold] || 0.5)
+        when "high_latency"
+          AlertManager.high_latency(threshold_ms: condition_data[:threshold_ms] || 1000)
+        when "error_spike"
+          AlertManager.error_spike(threshold: condition_data[:threshold] || 10)
+        when "custom"
+          condition_data
+        else
+          raise "Unknown condition type: #{condition_type}"
+        end
+      end
+    end
+  end
+end
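
For reference, a sketch of exercising the JSON API defined above once the server is running; the routes and payload fields come from the handlers in this hunk, while the host, port, and concrete values are illustrative:

```ruby
require "net/http"
require "json"

base = URI("http://localhost:4568")

# GET /api/stats with an optional time_range in seconds.
stats = JSON.parse(Net::HTTP.get(base + "/api/stats?time_range=600"))

# POST /api/alerts/rules using one of the built-in condition types
# handled by parse_alert_condition above.
rule = {
  name: "High error rate",
  condition_type: "high_error_rate",
  condition: { threshold: 0.05 },
  severity: "critical",
  threshold: 0.05,
  message: "Error rate above 5%",
  cooldown: 300
}
Net::HTTP.post(base + "/api/alerts/rules", rule.to_json, "Content-Type" => "application/json")
```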

data/lib/decision_agent/monitoring/metrics_collector.rb (new file)
@@ -0,0 +1,278 @@
+require "monitor"
+require "time"
+
+module DecisionAgent
+  module Monitoring
+    # Thread-safe metrics collector for decision analytics
+    class MetricsCollector
+      include MonitorMixin
+
+      attr_reader :metrics, :window_size
+
+      def initialize(window_size: 3600)
+        super()
+        @window_size = window_size # Default: 1 hour window
+        @metrics = {
+          decisions: [],
+          evaluations: [],
+          performance: [],
+          errors: []
+        }
+        @observers = []
+        freeze_config
+      end
+
+      # Record a decision for analytics
+      def record_decision(decision, context, duration_ms: nil)
+        synchronize do
+          metric = {
+            timestamp: Time.now.utc,
+            decision: decision.decision,
+            confidence: decision.confidence,
+            evaluations_count: decision.evaluations.size,
+            context_size: context.to_h.size,
+            duration_ms: duration_ms,
+            evaluator_names: decision.evaluations.map(&:evaluator_name).uniq
+          }
+
+          @metrics[:decisions] << metric
+          cleanup_old_metrics!
+          notify_observers(:decision, metric)
+          metric
+        end
+      end
+
+      # Record individual evaluation metrics
+      def record_evaluation(evaluation)
+        synchronize do
+          metric = {
+            timestamp: Time.now.utc,
+            decision: evaluation.decision,
+            weight: evaluation.weight,
+            evaluator_name: evaluation.evaluator_name
+          }
+
+          @metrics[:evaluations] << metric
+          cleanup_old_metrics!
+          notify_observers(:evaluation, metric)
+          metric
+        end
+      end
+
+      # Record performance metrics
+      def record_performance(operation:, duration_ms:, success: true, metadata: {})
+        synchronize do
+          metric = {
+            timestamp: Time.now.utc,
+            operation: operation,
+            duration_ms: duration_ms,
+            success: success,
+            metadata: metadata
+          }
+
+          @metrics[:performance] << metric
+          cleanup_old_metrics!
+          notify_observers(:performance, metric)
+          metric
+        end
+      end
+
+      # Record error
+      def record_error(error, context: {})
+        synchronize do
+          metric = {
+            timestamp: Time.now.utc,
+            error_class: error.class.name,
+            error_message: error.message,
+            context: context
+          }
+
+          @metrics[:errors] << metric
+          cleanup_old_metrics!
+          notify_observers(:error, metric)
+          metric
+        end
+      end
+
+      # Get aggregated statistics
+      def statistics(time_range: nil)
+        synchronize do
+          range_start = time_range ? Time.now.utc - time_range : nil
+
+          decisions = filter_by_time(@metrics[:decisions], range_start)
+          evaluations = filter_by_time(@metrics[:evaluations], range_start)
+          performance = filter_by_time(@metrics[:performance], range_start)
+          errors = filter_by_time(@metrics[:errors], range_start)
+
+          {
+            summary: {
+              total_decisions: decisions.size,
+              total_evaluations: evaluations.size,
+              total_errors: errors.size,
+              time_range: range_start ? "Last #{time_range}s" : "All time"
+            },
+            decisions: compute_decision_stats(decisions),
+            evaluations: compute_evaluation_stats(evaluations),
+            performance: compute_performance_stats(performance),
+            errors: compute_error_stats(errors),
+            timestamp: Time.now.utc
+          }
+        end
+      end
+
+      # Get time-series data for graphing
+      def time_series(metric_type:, bucket_size: 60, time_range: 3600)
+        synchronize do
+          data = @metrics[metric_type] || []
+          range_start = Time.now.utc - time_range
+
+          buckets = {}
+          data.each do |metric|
+            next if metric[:timestamp] < range_start
+
+            bucket_key = (metric[:timestamp].to_i / bucket_size) * bucket_size
+            buckets[bucket_key] ||= []
+            buckets[bucket_key] << metric
+          end
+
+          buckets.sort.map do |timestamp, metrics|
+            {
+              timestamp: Time.at(timestamp).utc,
+              count: metrics.size,
+              metrics: metrics
+            }
+          end
+        end
+      end
+
+      # Register observer for real-time updates
+      def add_observer(&block)
+        synchronize do
+          @observers << block
+        end
+      end
+
+      # Clear all metrics
+      def clear!
+        synchronize do
+          @metrics.each_value(&:clear)
+        end
+      end
+
+      # Get current metrics count
+      def metrics_count
+        synchronize do
+          @metrics.transform_values(&:size)
+        end
+      end
+
+      private
+
+      def freeze_config
+        @window_size.freeze
+      end
+
+      def cleanup_old_metrics!
+        cutoff_time = Time.now.utc - @window_size
+
+        @metrics.each_value do |data|
+          data.delete_if { |m| m[:timestamp] < cutoff_time }
+        end
+      end
+
+      def filter_by_time(data, start_time)
+        return data unless start_time
+
+        data.select { |m| m[:timestamp] >= start_time }
+      end
+
+      def compute_decision_stats(decisions)
+        return {} if decisions.empty?
+
+        confidences = decisions.map { |d| d[:confidence] }
+        durations = decisions.map { |d| d[:duration_ms] }.compact
+
+        decision_distribution = decisions.group_by { |d| d[:decision] }
+                                         .transform_values(&:size)
+
+        {
+          total: decisions.size,
+          avg_confidence: (confidences.sum / confidences.size.to_f).round(4),
+          min_confidence: confidences.min.round(4),
+          max_confidence: confidences.max.round(4),
+          decision_distribution: decision_distribution,
+          avg_duration_ms: durations.empty? ? nil : (durations.sum / durations.size.to_f).round(2),
+          evaluators_used: decisions.flat_map { |d| d[:evaluator_names] }.uniq
+        }
+      end
+
+      def compute_evaluation_stats(evaluations)
+        return {} if evaluations.empty?
+
+        weights = evaluations.map { |e| e[:weight] }
+        evaluator_distribution = evaluations.group_by { |e| e[:evaluator_name] }
+                                            .transform_values(&:size)
+
+        {
+          total: evaluations.size,
+          avg_weight: (weights.sum / weights.size.to_f).round(4),
+          evaluator_distribution: evaluator_distribution,
+          decision_distribution: evaluations.group_by { |e| e[:decision] }
+                                            .transform_values(&:size)
+        }
+      end
+
+      def compute_performance_stats(performance)
+        return {} if performance.empty?
+
+        durations = performance.map { |p| p[:duration_ms] }
+        successes = performance.count { |p| p[:success] }
+
+        {
+          total_operations: performance.size,
+          successful: successes,
+          failed: performance.size - successes,
+          success_rate: (successes / performance.size.to_f).round(4),
+          avg_duration_ms: (durations.sum / durations.size.to_f).round(2),
+          min_duration_ms: durations.min.round(2),
+          max_duration_ms: durations.max.round(2),
+          p95_duration_ms: percentile(durations, 0.95).round(2),
+          p99_duration_ms: percentile(durations, 0.99).round(2)
+        }
+      end
+
+      def compute_error_stats(errors)
+        return {} if errors.empty?
+
+        {
+          total: errors.size,
+          by_type: errors.group_by { |e| e[:error_class] }.transform_values(&:size),
+          recent_errors: errors.last(10).map do |e|
+            {
+              timestamp: e[:timestamp],
+              error: e[:error_class],
+              message: e[:error_message]
+            }
+          end
+        }
+      end
+
+      def percentile(array, percentile)
+        return 0 if array.empty?
+
+        sorted = array.sort
+        index = (percentile * sorted.length).ceil - 1
+        sorted[[index, 0].max]
+      end
+
+      def notify_observers(event_type, metric)
+        @observers.each do |observer|
+          observer.call(event_type, metric)
+        rescue StandardError => e
+          # Silently fail observer notifications to prevent disruption
+          warn "Observer notification failed: #{e.message}"
+        end
+      end
+    end
+  end
+end
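
A small usage sketch for the collector above, using only methods defined in this file; the metric values are illustrative:

```ruby
collector = DecisionAgent::Monitoring::MetricsCollector.new(window_size: 1800)

# Observers are invoked synchronously from inside the record_* calls (see notify_observers).
collector.add_observer { |event_type, metric| puts "#{event_type} at #{metric[:timestamp]}" }

collector.record_performance(operation: "decide", duration_ms: 12.4, success: true)
collector.record_error(StandardError.new("boom"), context: { rule: "example" })

collector.statistics(time_range: 600)   # aggregates for the last 10 minutes
collector.time_series(metric_type: :performance, bucket_size: 60, time_range: 1800)
collector.metrics_count                 # => { decisions: 0, evaluations: 0, performance: 1, errors: 1 }
```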

data/lib/decision_agent/monitoring/monitored_agent.rb (new file)
@@ -0,0 +1,71 @@
+module DecisionAgent
+  module Monitoring
+    # Wrapper around Agent that automatically records metrics
+    class MonitoredAgent
+      attr_reader :agent, :metrics_collector
+
+      def initialize(agent:, metrics_collector:)
+        @agent = agent
+        @metrics_collector = metrics_collector
+      end
+
+      # Make a decision and automatically record metrics
+      def decide(context:, feedback: {})
+        ctx = context.is_a?(Context) ? context : Context.new(context)
+
+        start_time = Time.now
+
+        begin
+          result = @agent.decide(context: ctx, feedback: feedback)
+          duration_ms = (Time.now - start_time) * 1000
+
+          # Record decision metrics
+          @metrics_collector.record_decision(result, ctx, duration_ms: duration_ms)
+
+          # Record each evaluation
+          result.evaluations.each do |evaluation|
+            @metrics_collector.record_evaluation(evaluation)
+          end
+
+          # Record successful performance
+          @metrics_collector.record_performance(
+            operation: "decide",
+            duration_ms: duration_ms,
+            success: true,
+            metadata: {
+              evaluators_count: result.evaluations.size,
+              decision: result.decision,
+              confidence: result.confidence
+            }
+          )
+
+          result
+        rescue StandardError => e
+          duration_ms = (Time.now - start_time) * 1000
+
+          # Record error
+          @metrics_collector.record_error(e, context: ctx.to_h)
+
+          # Record failed performance
+          @metrics_collector.record_performance(
+            operation: "decide",
+            duration_ms: duration_ms,
+            success: false,
+            metadata: { error_class: e.class.name }
+          )
+
+          raise
+        end
+      end
+
+      # Delegate other methods to the wrapped agent
+      def method_missing(method, ...)
+        @agent.send(method, ...)
+      end
+
+      def respond_to_missing?(method, include_private = false)
+        @agent.respond_to?(method, include_private) || super
+      end
+    end
+  end
+end
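
Finally, a sketch of wrapping an agent with the class above. The `decide(context:)` call and the hash-to-`Context` coercion are taken from this hunk; the `Agent` construction itself is hypothetical, since its constructor is not part of this diff:

```ruby
collector = DecisionAgent::Monitoring::MetricsCollector.new

agent = DecisionAgent::Agent.new(evaluators: [my_evaluator]) # hypothetical setup, not shown in this diff
monitored = DecisionAgent::Monitoring::MonitoredAgent.new(agent: agent, metrics_collector: collector)

# A plain Hash is accepted and coerced to a Context; decision, evaluation,
# performance, and error metrics are recorded around the call.
decision = monitored.decide(context: { amount: 1200, customer_tier: "gold" })
collector.statistics[:summary]
```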