decision_agent 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +212 -35
- data/bin/decision_agent +3 -8
- data/lib/decision_agent/agent.rb +19 -26
- data/lib/decision_agent/audit/null_adapter.rb +1 -2
- data/lib/decision_agent/decision.rb +3 -1
- data/lib/decision_agent/dsl/condition_evaluator.rb +4 -3
- data/lib/decision_agent/dsl/rule_parser.rb +4 -6
- data/lib/decision_agent/dsl/schema_validator.rb +27 -31
- data/lib/decision_agent/errors.rb +11 -8
- data/lib/decision_agent/evaluation.rb +3 -1
- data/lib/decision_agent/evaluation_validator.rb +78 -0
- data/lib/decision_agent/evaluators/json_rule_evaluator.rb +26 -0
- data/lib/decision_agent/evaluators/static_evaluator.rb +2 -6
- data/lib/decision_agent/monitoring/alert_manager.rb +282 -0
- data/lib/decision_agent/monitoring/dashboard/public/dashboard.css +381 -0
- data/lib/decision_agent/monitoring/dashboard/public/dashboard.js +471 -0
- data/lib/decision_agent/monitoring/dashboard/public/index.html +161 -0
- data/lib/decision_agent/monitoring/dashboard_server.rb +340 -0
- data/lib/decision_agent/monitoring/metrics_collector.rb +278 -0
- data/lib/decision_agent/monitoring/monitored_agent.rb +71 -0
- data/lib/decision_agent/monitoring/prometheus_exporter.rb +247 -0
- data/lib/decision_agent/replay/replay.rb +12 -22
- data/lib/decision_agent/scoring/base.rb +1 -1
- data/lib/decision_agent/scoring/consensus.rb +5 -5
- data/lib/decision_agent/scoring/weighted_average.rb +1 -1
- data/lib/decision_agent/version.rb +1 -1
- data/lib/decision_agent/versioning/activerecord_adapter.rb +69 -33
- data/lib/decision_agent/versioning/adapter.rb +1 -3
- data/lib/decision_agent/versioning/file_storage_adapter.rb +143 -35
- data/lib/decision_agent/versioning/version_manager.rb +4 -12
- data/lib/decision_agent/web/public/index.html +1 -1
- data/lib/decision_agent/web/server.rb +19 -24
- data/lib/decision_agent.rb +7 -0
- data/lib/generators/decision_agent/install/install_generator.rb +5 -5
- data/lib/generators/decision_agent/install/templates/migration.rb +17 -6
- data/lib/generators/decision_agent/install/templates/rule.rb +3 -3
- data/lib/generators/decision_agent/install/templates/rule_version.rb +13 -7
- data/spec/activerecord_thread_safety_spec.rb +553 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/api_contract_spec.rb +16 -16
- data/spec/audit_adapters_spec.rb +3 -3
- data/spec/comprehensive_edge_cases_spec.rb +86 -86
- data/spec/dsl_validation_spec.rb +83 -83
- data/spec/edge_cases_spec.rb +23 -23
- data/spec/examples/feedback_aware_evaluator_spec.rb +7 -7
- data/spec/examples.txt +548 -0
- data/spec/issue_verification_spec.rb +685 -0
- data/spec/json_rule_evaluator_spec.rb +15 -15
- data/spec/monitoring/alert_manager_spec.rb +378 -0
- data/spec/monitoring/metrics_collector_spec.rb +281 -0
- data/spec/monitoring/monitored_agent_spec.rb +222 -0
- data/spec/monitoring/prometheus_exporter_spec.rb +242 -0
- data/spec/replay_edge_cases_spec.rb +58 -58
- data/spec/replay_spec.rb +11 -11
- data/spec/rfc8785_canonicalization_spec.rb +215 -0
- data/spec/scoring_spec.rb +1 -1
- data/spec/spec_helper.rb +9 -0
- data/spec/thread_safety_spec.rb +482 -0
- data/spec/thread_safety_spec.rb.broken +878 -0
- data/spec/versioning_spec.rb +141 -37
- data/spec/web_ui_rack_spec.rb +135 -0
- metadata +69 -6
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
require "decision_agent/monitoring/metrics_collector"
|
|
3
|
+
require "decision_agent/monitoring/prometheus_exporter"
|
|
4
|
+
|
|
5
|
+
RSpec.describe DecisionAgent::Monitoring::PrometheusExporter do
|
|
6
|
+
let(:collector) { DecisionAgent::Monitoring::MetricsCollector.new }
|
|
7
|
+
let(:exporter) { described_class.new(metrics_collector: collector, namespace: "test") }
|
|
8
|
+
|
|
9
|
+
let(:decision) do
|
|
10
|
+
double(
|
|
11
|
+
"Decision",
|
|
12
|
+
decision: "approve",
|
|
13
|
+
confidence: 0.85,
|
|
14
|
+
evaluations: [double("Evaluation", evaluator_name: "test_evaluator")]
|
|
15
|
+
)
|
|
16
|
+
end
|
|
17
|
+
let(:context) { double("Context", to_h: { user: "test" }) }
|
|
18
|
+
|
|
19
|
+
describe "#initialize" do
|
|
20
|
+
it "initializes with metrics collector" do
|
|
21
|
+
expect(exporter).to be_a(described_class)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
it "uses default namespace" do
|
|
25
|
+
exporter = described_class.new(metrics_collector: collector)
|
|
26
|
+
output = exporter.export
|
|
27
|
+
expect(output).to include("decision_agent_")
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
it "uses custom namespace" do
|
|
31
|
+
output = exporter.export
|
|
32
|
+
expect(output).to include("test_")
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
describe "#export" do
|
|
37
|
+
before do
|
|
38
|
+
# Record some metrics
|
|
39
|
+
3.times { collector.record_decision(decision, context, duration_ms: 10.0) }
|
|
40
|
+
collector.record_performance(operation: "decide", duration_ms: 15.0, success: true)
|
|
41
|
+
collector.record_error(StandardError.new("Test error"))
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it "exports in Prometheus text format" do
|
|
45
|
+
output = exporter.export
|
|
46
|
+
|
|
47
|
+
expect(output).to be_a(String)
|
|
48
|
+
expect(output).to include("# DecisionAgent Metrics Export")
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it "includes decision metrics" do
|
|
52
|
+
output = exporter.export
|
|
53
|
+
|
|
54
|
+
expect(output).to include("# HELP test_decisions_total")
|
|
55
|
+
expect(output).to include("# TYPE test_decisions_total counter")
|
|
56
|
+
expect(output).to include("test_decisions_total 3")
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
it "includes confidence metrics" do
|
|
60
|
+
output = exporter.export
|
|
61
|
+
|
|
62
|
+
expect(output).to include("# HELP test_decision_confidence_avg")
|
|
63
|
+
expect(output).to include("# TYPE test_decision_confidence_avg gauge")
|
|
64
|
+
expect(output).to include("test_decision_confidence_avg 0.85")
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "includes performance metrics" do
|
|
68
|
+
output = exporter.export
|
|
69
|
+
|
|
70
|
+
expect(output).to include("# HELP test_success_rate")
|
|
71
|
+
expect(output).to include("# TYPE test_success_rate gauge")
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
it "includes error metrics" do
|
|
75
|
+
output = exporter.export
|
|
76
|
+
|
|
77
|
+
expect(output).to include("# HELP test_errors_total")
|
|
78
|
+
expect(output).to include("# TYPE test_errors_total counter")
|
|
79
|
+
expect(output).to include("test_errors_total 1")
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
it "includes system info" do
|
|
83
|
+
output = exporter.export
|
|
84
|
+
|
|
85
|
+
expect(output).to include("# HELP test_info")
|
|
86
|
+
expect(output).to include("# TYPE test_info gauge")
|
|
87
|
+
expect(output).to include("version=\"#{DecisionAgent::VERSION}\"")
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
it "includes decision distribution" do
|
|
91
|
+
output = exporter.export
|
|
92
|
+
|
|
93
|
+
expect(output).to include("# HELP test_decisions_by_type")
|
|
94
|
+
expect(output).to include("test_decisions_by_type{decision=\"approve\"} 3")
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
it "includes error distribution by type" do
|
|
98
|
+
output = exporter.export
|
|
99
|
+
|
|
100
|
+
expect(output).to include("# HELP test_errors_by_type")
|
|
101
|
+
expect(output).to include("test_errors_by_type{error=\"StandardError\"} 1")
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
it "includes metrics count" do
|
|
105
|
+
output = exporter.export
|
|
106
|
+
|
|
107
|
+
expect(output).to include("# HELP test_metrics_stored")
|
|
108
|
+
expect(output).to include("test_metrics_stored{type=\"decisions\"} 3")
|
|
109
|
+
expect(output).to include("test_metrics_stored{type=\"errors\"} 1")
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
describe "#register_kpi" do
|
|
114
|
+
it "registers a custom KPI" do
|
|
115
|
+
exporter.register_kpi(
|
|
116
|
+
name: "custom_metric",
|
|
117
|
+
value: 42.5,
|
|
118
|
+
help: "A custom metric"
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
output = exporter.export
|
|
122
|
+
expect(output).to include("# HELP test_custom_metric A custom metric")
|
|
123
|
+
expect(output).to include("# TYPE test_custom_metric gauge")
|
|
124
|
+
expect(output).to include("test_custom_metric 42.5")
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
it "registers KPI with labels" do
|
|
128
|
+
exporter.register_kpi(
|
|
129
|
+
name: "requests",
|
|
130
|
+
value: 100,
|
|
131
|
+
labels: { endpoint: "/api/v1", method: "GET" }
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
output = exporter.export
|
|
135
|
+
expect(output).to include("test_requests{endpoint=\"/api/v1\",method=\"GET\"} 100")
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
it "sanitizes metric names" do
|
|
139
|
+
exporter.register_kpi(name: "my-custom.metric!", value: 10)
|
|
140
|
+
|
|
141
|
+
output = exporter.export
|
|
142
|
+
expect(output).to include("test_my_custom_metric_")
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
it "escapes label values" do
|
|
146
|
+
exporter.register_kpi(
|
|
147
|
+
name: "metric",
|
|
148
|
+
value: 1,
|
|
149
|
+
labels: { message: 'Contains "quotes"' }
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
output = exporter.export
|
|
153
|
+
expect(output).to include('message="Contains \"quotes\""')
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
describe "#metrics_hash" do
|
|
158
|
+
before do
|
|
159
|
+
collector.record_decision(decision, context, duration_ms: 10.0)
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
it "returns metrics as hash" do
|
|
163
|
+
metrics = exporter.metrics_hash
|
|
164
|
+
|
|
165
|
+
expect(metrics).to be_a(Hash)
|
|
166
|
+
expect(metrics).to have_key(:decisions)
|
|
167
|
+
expect(metrics).to have_key(:performance)
|
|
168
|
+
expect(metrics).to have_key(:errors)
|
|
169
|
+
expect(metrics).to have_key(:system)
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it "includes metric types" do
|
|
173
|
+
metrics = exporter.metrics_hash
|
|
174
|
+
|
|
175
|
+
expect(metrics[:decisions][:total][:type]).to eq("counter")
|
|
176
|
+
expect(metrics[:decisions][:avg_confidence][:type]).to eq("gauge")
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
it "includes metric values" do
|
|
180
|
+
metrics = exporter.metrics_hash
|
|
181
|
+
|
|
182
|
+
expect(metrics[:decisions][:total][:value]).to eq(1)
|
|
183
|
+
expect(metrics[:decisions][:avg_confidence][:value]).to eq(0.85)
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
describe "thread safety" do
|
|
188
|
+
it "handles concurrent KPI registration" do
|
|
189
|
+
threads = 10.times.map do |i|
|
|
190
|
+
Thread.new do
|
|
191
|
+
10.times do |j|
|
|
192
|
+
exporter.register_kpi(
|
|
193
|
+
name: "metric_#{i}_#{j}",
|
|
194
|
+
value: (i * 10) + j
|
|
195
|
+
)
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
expect { threads.each(&:join) }.not_to raise_error
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
it "handles concurrent exports" do
|
|
204
|
+
threads = 5.times.map do
|
|
205
|
+
Thread.new do
|
|
206
|
+
10.times { exporter.export }
|
|
207
|
+
end
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
expect { threads.each(&:join) }.not_to raise_error
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
describe "performance metrics export" do
|
|
215
|
+
before do
|
|
216
|
+
5.times do |i|
|
|
217
|
+
collector.record_performance(
|
|
218
|
+
operation: "decide",
|
|
219
|
+
duration_ms: (i + 1) * 10.0,
|
|
220
|
+
success: true
|
|
221
|
+
)
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
it "exports summary metrics" do
|
|
226
|
+
output = exporter.export
|
|
227
|
+
|
|
228
|
+
expect(output).to include("# TYPE test_operation_duration_ms summary")
|
|
229
|
+
expect(output).to include("test_operation_duration_ms{quantile=\"0.5\"}")
|
|
230
|
+
expect(output).to include("test_operation_duration_ms{quantile=\"0.95\"}")
|
|
231
|
+
expect(output).to include("test_operation_duration_ms{quantile=\"0.99\"}")
|
|
232
|
+
expect(output).to include("test_operation_duration_ms_sum")
|
|
233
|
+
expect(output).to include("test_operation_duration_ms_count")
|
|
234
|
+
end
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
describe "content type" do
|
|
238
|
+
it "defines Prometheus content type" do
|
|
239
|
+
expect(described_class::CONTENT_TYPE).to eq("text/plain; version=0.0.4")
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
end
|
|
@@ -23,7 +23,7 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
23
23
|
rules: [
|
|
24
24
|
{
|
|
25
25
|
id: "auto_approve",
|
|
26
|
-
if: { field: "score", op: "gte", value: 90 },
|
|
26
|
+
if: { field: "score", op: "gte", value: 90 }, # Changed threshold
|
|
27
27
|
then: { decision: "approve", weight: 0.9, reason: "Very high score" }
|
|
28
28
|
}
|
|
29
29
|
]
|
|
@@ -36,9 +36,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
36
36
|
|
|
37
37
|
original_result = agent.decide(context: { score: 85 })
|
|
38
38
|
|
|
39
|
-
expect {
|
|
39
|
+
expect do
|
|
40
40
|
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
41
|
-
}.not_to raise_error
|
|
41
|
+
end.not_to raise_error
|
|
42
42
|
end
|
|
43
43
|
|
|
44
44
|
it "detects differences in strict mode when rules have changed" do
|
|
@@ -52,9 +52,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
52
52
|
|
|
53
53
|
# Replay uses the stored evaluations (not re-evaluating rules)
|
|
54
54
|
# So it should succeed because replay uses static evaluators from the audit payload
|
|
55
|
-
expect {
|
|
55
|
+
expect do
|
|
56
56
|
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
57
|
-
}.not_to raise_error
|
|
57
|
+
end.not_to raise_error
|
|
58
58
|
|
|
59
59
|
# The replayed result should match the original
|
|
60
60
|
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
@@ -69,9 +69,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
69
69
|
original_result = agent.decide(context: { score: 85 })
|
|
70
70
|
|
|
71
71
|
# In non-strict mode, differences are logged but don't raise errors
|
|
72
|
-
expect {
|
|
72
|
+
expect do
|
|
73
73
|
DecisionAgent::Replay.run(original_result.audit_payload, strict: false)
|
|
74
|
-
}.not_to raise_error
|
|
74
|
+
end.not_to raise_error
|
|
75
75
|
end
|
|
76
76
|
end
|
|
77
77
|
|
|
@@ -124,9 +124,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
124
124
|
# StaticEvaluator adds type: "static" by default
|
|
125
125
|
expect(original_result.evaluations.first.metadata).to eq({ type: "static" })
|
|
126
126
|
|
|
127
|
-
expect {
|
|
127
|
+
expect do
|
|
128
128
|
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
129
|
-
}.not_to raise_error
|
|
129
|
+
end.not_to raise_error
|
|
130
130
|
|
|
131
131
|
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
132
132
|
expect(replayed_result.evaluations.first.metadata).to eq({ type: "static" })
|
|
@@ -140,7 +140,7 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
140
140
|
evaluator_name: "CustomEvaluator",
|
|
141
141
|
metadata: {
|
|
142
142
|
user: { id: 123, role: "admin" },
|
|
143
|
-
tags: ["urgent", "important"],
|
|
143
|
+
tags: %w[urgent important],
|
|
144
144
|
history: [
|
|
145
145
|
{ action: "created", timestamp: "2025-01-01" },
|
|
146
146
|
{ action: "updated", timestamp: "2025-01-02" }
|
|
@@ -183,21 +183,21 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
183
183
|
decision: "allow",
|
|
184
184
|
weight: 0.9,
|
|
185
185
|
reason: "User authenticated successfully",
|
|
186
|
-
evaluator_name: "DeletedCustomAuthEvaluator",
|
|
186
|
+
evaluator_name: "DeletedCustomAuthEvaluator", # This evaluator no longer exists
|
|
187
187
|
metadata: { auth_method: "oauth", provider: "google" }
|
|
188
188
|
}
|
|
189
189
|
],
|
|
190
190
|
decision: "allow",
|
|
191
|
-
confidence: 1.0,
|
|
191
|
+
confidence: 1.0, # WeightedAverage normalizes single eval to 1.0
|
|
192
192
|
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
193
193
|
agent_version: "0.1.0",
|
|
194
194
|
deterministic_hash: "abc123"
|
|
195
195
|
}
|
|
196
196
|
|
|
197
197
|
# Replay should work because it uses StaticEvaluator, not the original evaluator
|
|
198
|
-
expect {
|
|
198
|
+
expect do
|
|
199
199
|
DecisionAgent::Replay.run(audit_payload, strict: true)
|
|
200
|
-
}.not_to raise_error
|
|
200
|
+
end.not_to raise_error
|
|
201
201
|
|
|
202
202
|
replayed_result = DecisionAgent::Replay.run(audit_payload, strict: true)
|
|
203
203
|
|
|
@@ -224,12 +224,12 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
224
224
|
decision: "approve",
|
|
225
225
|
weight: 0.7,
|
|
226
226
|
reason: "ML model prediction",
|
|
227
|
-
evaluator_name: "NonExistentMLEvaluator",
|
|
227
|
+
evaluator_name: "NonExistentMLEvaluator", # Missing evaluator
|
|
228
228
|
metadata: { model_version: "v2.1" }
|
|
229
229
|
}
|
|
230
230
|
],
|
|
231
231
|
decision: "approve",
|
|
232
|
-
confidence: 1.0,
|
|
232
|
+
confidence: 1.0, # Both agree, so 100% confidence
|
|
233
233
|
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
234
234
|
agent_version: "0.1.0",
|
|
235
235
|
deterministic_hash: "def456"
|
|
@@ -260,15 +260,15 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
260
260
|
],
|
|
261
261
|
decision: "approve",
|
|
262
262
|
confidence: 0.9,
|
|
263
|
-
scoring_strategy: "DecisionAgent::Scoring::DeprecatedBayesianStrategy",
|
|
263
|
+
scoring_strategy: "DecisionAgent::Scoring::DeprecatedBayesianStrategy", # Doesn't exist
|
|
264
264
|
agent_version: "0.1.0",
|
|
265
265
|
deterministic_hash: "ghi789"
|
|
266
266
|
}
|
|
267
267
|
|
|
268
268
|
# Should fall back to WeightedAverage
|
|
269
|
-
expect {
|
|
269
|
+
expect do
|
|
270
270
|
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
271
|
-
}.not_to raise_error
|
|
271
|
+
end.not_to raise_error
|
|
272
272
|
|
|
273
273
|
replayed_result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
274
274
|
expect(replayed_result.decision).to eq("approve")
|
|
@@ -291,9 +291,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
291
291
|
|
|
292
292
|
# Replay uses the stored scoring strategy from the audit payload
|
|
293
293
|
# So it should replay successfully
|
|
294
|
-
expect {
|
|
294
|
+
expect do
|
|
295
295
|
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
296
|
-
}.not_to raise_error
|
|
296
|
+
end.not_to raise_error
|
|
297
297
|
end
|
|
298
298
|
end
|
|
299
299
|
|
|
@@ -305,9 +305,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
305
305
|
confidence: 0.5
|
|
306
306
|
}
|
|
307
307
|
|
|
308
|
-
expect {
|
|
308
|
+
expect do
|
|
309
309
|
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
310
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: context/)
|
|
310
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: context/)
|
|
311
311
|
end
|
|
312
312
|
|
|
313
313
|
it "requires evaluations field" do
|
|
@@ -317,9 +317,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
317
317
|
confidence: 0.5
|
|
318
318
|
}
|
|
319
319
|
|
|
320
|
-
expect {
|
|
320
|
+
expect do
|
|
321
321
|
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
322
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: evaluations/)
|
|
322
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: evaluations/)
|
|
323
323
|
end
|
|
324
324
|
|
|
325
325
|
it "requires decision field" do
|
|
@@ -329,9 +329,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
329
329
|
confidence: 0.5
|
|
330
330
|
}
|
|
331
331
|
|
|
332
|
-
expect {
|
|
332
|
+
expect do
|
|
333
333
|
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
334
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: decision/)
|
|
334
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: decision/)
|
|
335
335
|
end
|
|
336
336
|
|
|
337
337
|
it "requires confidence field" do
|
|
@@ -341,9 +341,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
341
341
|
decision: "test"
|
|
342
342
|
}
|
|
343
343
|
|
|
344
|
-
expect {
|
|
344
|
+
expect do
|
|
345
345
|
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
346
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: confidence/)
|
|
346
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: confidence/)
|
|
347
347
|
end
|
|
348
348
|
|
|
349
349
|
it "accepts both symbol and string keys" do
|
|
@@ -366,9 +366,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
366
366
|
"scoring_strategy" => "DecisionAgent::Scoring::MaxWeight"
|
|
367
367
|
}
|
|
368
368
|
|
|
369
|
-
expect {
|
|
369
|
+
expect do
|
|
370
370
|
DecisionAgent::Replay.run(payload_with_strings, strict: true)
|
|
371
|
-
}.not_to raise_error
|
|
371
|
+
end.not_to raise_error
|
|
372
372
|
end
|
|
373
373
|
end
|
|
374
374
|
|
|
@@ -460,12 +460,12 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
460
460
|
|
|
461
461
|
# Modify agent_version
|
|
462
462
|
modified_payload = original_result.audit_payload.dup
|
|
463
|
-
modified_payload[:agent_version] = "99.0.0"
|
|
463
|
+
modified_payload[:agent_version] = "99.0.0" # Different version
|
|
464
464
|
|
|
465
465
|
# Non-strict mode should log but not raise
|
|
466
|
-
expect {
|
|
466
|
+
expect do
|
|
467
467
|
DecisionAgent::Replay.run(modified_payload, strict: false)
|
|
468
|
-
}.not_to raise_error
|
|
468
|
+
end.not_to raise_error
|
|
469
469
|
|
|
470
470
|
# Should successfully replay despite version difference
|
|
471
471
|
replayed_result = DecisionAgent::Replay.run(modified_payload, strict: false)
|
|
@@ -489,7 +489,7 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
489
489
|
decision: "approve",
|
|
490
490
|
confidence: 1.0,
|
|
491
491
|
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
492
|
-
agent_version: "0.0.1",
|
|
492
|
+
agent_version: "0.0.1", # Old version
|
|
493
493
|
deterministic_hash: "old_hash"
|
|
494
494
|
}
|
|
495
495
|
|
|
@@ -514,9 +514,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
514
514
|
|
|
515
515
|
# Strict mode should still work because version is not part of deterministic comparison
|
|
516
516
|
# (only decision and confidence are compared in strict mode)
|
|
517
|
-
expect {
|
|
517
|
+
expect do
|
|
518
518
|
DecisionAgent::Replay.run(modified_payload, strict: true)
|
|
519
|
-
}.not_to raise_error
|
|
519
|
+
end.not_to raise_error
|
|
520
520
|
end
|
|
521
521
|
end
|
|
522
522
|
|
|
@@ -543,9 +543,9 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
543
543
|
}
|
|
544
544
|
|
|
545
545
|
# Should not raise error, just creates new hash during replay
|
|
546
|
-
expect {
|
|
546
|
+
expect do
|
|
547
547
|
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
548
|
-
}.not_to raise_error
|
|
548
|
+
end.not_to raise_error
|
|
549
549
|
|
|
550
550
|
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
551
551
|
expect(result.decision).to eq("approve")
|
|
@@ -582,24 +582,24 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
582
582
|
|
|
583
583
|
it "validates required fields before replay" do
|
|
584
584
|
# Missing context
|
|
585
|
-
expect {
|
|
585
|
+
expect do
|
|
586
586
|
DecisionAgent::Replay.run({ decision: "test", confidence: 0.5, evaluations: [] }, strict: true)
|
|
587
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /context/)
|
|
587
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /context/)
|
|
588
588
|
|
|
589
589
|
# Missing evaluations
|
|
590
|
-
expect {
|
|
590
|
+
expect do
|
|
591
591
|
DecisionAgent::Replay.run({ context: {}, decision: "test", confidence: 0.5 }, strict: true)
|
|
592
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /evaluations/)
|
|
592
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /evaluations/)
|
|
593
593
|
|
|
594
594
|
# Missing decision
|
|
595
|
-
expect {
|
|
595
|
+
expect do
|
|
596
596
|
DecisionAgent::Replay.run({ context: {}, evaluations: [], confidence: 0.5 }, strict: true)
|
|
597
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /decision/)
|
|
597
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /decision/)
|
|
598
598
|
|
|
599
599
|
# Missing confidence
|
|
600
|
-
expect {
|
|
600
|
+
expect do
|
|
601
601
|
DecisionAgent::Replay.run({ context: {}, evaluations: [], decision: "test" }, strict: true)
|
|
602
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /confidence/)
|
|
602
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /confidence/)
|
|
603
603
|
end
|
|
604
604
|
|
|
605
605
|
it "handles evaluation with invalid weight" do
|
|
@@ -610,7 +610,7 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
610
610
|
evaluations: [
|
|
611
611
|
{
|
|
612
612
|
decision: "approve",
|
|
613
|
-
weight: 2.5,
|
|
613
|
+
weight: 2.5, # Weight > 1.0, invalid
|
|
614
614
|
reason: "Test",
|
|
615
615
|
evaluator_name: "TestEvaluator",
|
|
616
616
|
metadata: {}
|
|
@@ -622,21 +622,21 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
622
622
|
}
|
|
623
623
|
|
|
624
624
|
# Invalid weight (> 1.0) should raise error when creating Evaluation
|
|
625
|
-
expect {
|
|
625
|
+
expect do
|
|
626
626
|
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
627
|
-
}.to raise_error(DecisionAgent::InvalidWeightError)
|
|
627
|
+
end.to raise_error(DecisionAgent::InvalidWeightError)
|
|
628
628
|
end
|
|
629
629
|
|
|
630
630
|
it "handles completely empty audit payload" do
|
|
631
|
-
expect {
|
|
631
|
+
expect do
|
|
632
632
|
DecisionAgent::Replay.run({}, strict: false)
|
|
633
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError)
|
|
633
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError)
|
|
634
634
|
end
|
|
635
635
|
|
|
636
636
|
it "handles nil audit payload" do
|
|
637
|
-
expect {
|
|
637
|
+
expect do
|
|
638
638
|
DecisionAgent::Replay.run(nil, strict: false)
|
|
639
|
-
}.to raise_error
|
|
639
|
+
end.to raise_error
|
|
640
640
|
end
|
|
641
641
|
end
|
|
642
642
|
|
|
@@ -657,14 +657,14 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
657
657
|
],
|
|
658
658
|
decision: "approve",
|
|
659
659
|
confidence: 0.9,
|
|
660
|
-
scoring_strategy: "DecisionAgent::Scoring::OldStrategyName",
|
|
660
|
+
scoring_strategy: "DecisionAgent::Scoring::OldStrategyName", # Renamed or deleted
|
|
661
661
|
agent_version: "0.1.0"
|
|
662
662
|
}
|
|
663
663
|
|
|
664
664
|
# Should fall back to default strategy (WeightedAverage)
|
|
665
|
-
expect {
|
|
665
|
+
expect do
|
|
666
666
|
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
667
|
-
}.not_to raise_error
|
|
667
|
+
end.not_to raise_error
|
|
668
668
|
|
|
669
669
|
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
670
670
|
expect(result.decision).to eq("approve")
|
|
@@ -686,7 +686,7 @@ RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
|
686
686
|
],
|
|
687
687
|
decision: "approve",
|
|
688
688
|
confidence: 0.85,
|
|
689
|
-
scoring_strategy: "MyCompany::CustomMLBasedScoringStrategy",
|
|
689
|
+
scoring_strategy: "MyCompany::CustomMLBasedScoringStrategy", # Custom strategy
|
|
690
690
|
agent_version: "0.1.0"
|
|
691
691
|
}
|
|
692
692
|
|
data/spec/replay_spec.rb
CHANGED
|
@@ -34,9 +34,9 @@ RSpec.describe DecisionAgent::Replay do
|
|
|
34
34
|
modified_payload = original_result.audit_payload.dup
|
|
35
35
|
modified_payload[:decision] = "reject"
|
|
36
36
|
|
|
37
|
-
expect {
|
|
37
|
+
expect do
|
|
38
38
|
DecisionAgent::Replay.run(modified_payload, strict: true)
|
|
39
|
-
}.to raise_error(DecisionAgent::ReplayMismatchError) do |error|
|
|
39
|
+
end.to raise_error(DecisionAgent::ReplayMismatchError) do |error|
|
|
40
40
|
expect(error.differences).to include(/decision mismatch/)
|
|
41
41
|
expect(error.expected[:decision]).to eq("reject")
|
|
42
42
|
expect(error.actual[:decision]).to eq("approve")
|
|
@@ -50,9 +50,9 @@ RSpec.describe DecisionAgent::Replay do
|
|
|
50
50
|
modified_payload = original_result.audit_payload.dup
|
|
51
51
|
modified_payload[:confidence] = 0.5
|
|
52
52
|
|
|
53
|
-
expect {
|
|
53
|
+
expect do
|
|
54
54
|
DecisionAgent::Replay.run(modified_payload, strict: true)
|
|
55
|
-
}.to raise_error(DecisionAgent::ReplayMismatchError) do |error|
|
|
55
|
+
end.to raise_error(DecisionAgent::ReplayMismatchError) do |error|
|
|
56
56
|
expect(error.differences).to include(/confidence mismatch/)
|
|
57
57
|
end
|
|
58
58
|
end
|
|
@@ -64,9 +64,9 @@ RSpec.describe DecisionAgent::Replay do
|
|
|
64
64
|
modified_payload = original_result.audit_payload.dup
|
|
65
65
|
modified_payload[:decision] = "reject"
|
|
66
66
|
|
|
67
|
-
expect {
|
|
67
|
+
expect do
|
|
68
68
|
DecisionAgent::Replay.run(modified_payload, strict: false)
|
|
69
|
-
}.not_to raise_error
|
|
69
|
+
end.not_to raise_error
|
|
70
70
|
end
|
|
71
71
|
|
|
72
72
|
it "logs differences in non-strict mode" do
|
|
@@ -76,17 +76,17 @@ RSpec.describe DecisionAgent::Replay do
|
|
|
76
76
|
modified_payload = original_result.audit_payload.dup
|
|
77
77
|
modified_payload[:decision] = "reject"
|
|
78
78
|
|
|
79
|
-
expect {
|
|
79
|
+
expect do
|
|
80
80
|
DecisionAgent::Replay.run(modified_payload, strict: false)
|
|
81
|
-
}.to output(/Decision changed/).to_stderr
|
|
81
|
+
end.to output(/Decision changed/).to_stderr
|
|
82
82
|
end
|
|
83
83
|
|
|
84
84
|
it "validates required fields in audit payload" do
|
|
85
85
|
invalid_payload = { context: {} }
|
|
86
86
|
|
|
87
|
-
expect {
|
|
87
|
+
expect do
|
|
88
88
|
DecisionAgent::Replay.run(invalid_payload, strict: true)
|
|
89
|
-
}.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key/)
|
|
89
|
+
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key/)
|
|
90
90
|
end
|
|
91
91
|
|
|
92
92
|
it "reconstructs evaluations from audit payload" do
|
|
@@ -112,7 +112,7 @@ RSpec.describe DecisionAgent::Replay do
|
|
|
112
112
|
)
|
|
113
113
|
|
|
114
114
|
expect(replayed_result.evaluations.size).to eq(2)
|
|
115
|
-
expect(replayed_result.evaluations.map(&:evaluator_name)).to match_array(["Evaluator1", "Evaluator2"])
|
|
115
|
+
expect(replayed_result.evaluations.map(&:evaluator_name)).to match_array(%w[Evaluator1 Evaluator2])
|
|
116
116
|
end
|
|
117
117
|
|
|
118
118
|
it "uses correct scoring strategy from audit payload" do
|