decision_agent 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +1060 -0
- data/bin/decision_agent +104 -0
- data/lib/decision_agent/agent.rb +147 -0
- data/lib/decision_agent/audit/adapter.rb +9 -0
- data/lib/decision_agent/audit/logger_adapter.rb +27 -0
- data/lib/decision_agent/audit/null_adapter.rb +8 -0
- data/lib/decision_agent/context.rb +42 -0
- data/lib/decision_agent/decision.rb +51 -0
- data/lib/decision_agent/dsl/condition_evaluator.rb +133 -0
- data/lib/decision_agent/dsl/rule_parser.rb +36 -0
- data/lib/decision_agent/dsl/schema_validator.rb +275 -0
- data/lib/decision_agent/errors.rb +62 -0
- data/lib/decision_agent/evaluation.rb +52 -0
- data/lib/decision_agent/evaluators/base.rb +15 -0
- data/lib/decision_agent/evaluators/json_rule_evaluator.rb +51 -0
- data/lib/decision_agent/evaluators/static_evaluator.rb +31 -0
- data/lib/decision_agent/replay/replay.rb +147 -0
- data/lib/decision_agent/scoring/base.rb +19 -0
- data/lib/decision_agent/scoring/consensus.rb +40 -0
- data/lib/decision_agent/scoring/max_weight.rb +16 -0
- data/lib/decision_agent/scoring/threshold.rb +40 -0
- data/lib/decision_agent/scoring/weighted_average.rb +26 -0
- data/lib/decision_agent/version.rb +3 -0
- data/lib/decision_agent/web/public/app.js +580 -0
- data/lib/decision_agent/web/public/index.html +190 -0
- data/lib/decision_agent/web/public/styles.css +558 -0
- data/lib/decision_agent/web/server.rb +255 -0
- data/lib/decision_agent.rb +29 -0
- data/spec/agent_spec.rb +249 -0
- data/spec/api_contract_spec.rb +430 -0
- data/spec/audit_adapters_spec.rb +74 -0
- data/spec/comprehensive_edge_cases_spec.rb +1777 -0
- data/spec/context_spec.rb +84 -0
- data/spec/dsl_validation_spec.rb +648 -0
- data/spec/edge_cases_spec.rb +353 -0
- data/spec/examples/feedback_aware_evaluator_spec.rb +460 -0
- data/spec/json_rule_evaluator_spec.rb +587 -0
- data/spec/replay_edge_cases_spec.rb +699 -0
- data/spec/replay_spec.rb +210 -0
- data/spec/scoring_spec.rb +225 -0
- data/spec/spec_helper.rb +28 -0
- metadata +133 -0
data/spec/replay_edge_cases_spec.rb
@@ -0,0 +1,699 @@
+require "spec_helper"
+
+RSpec.describe "DecisionAgent::Replay Edge Cases" do
+  describe "handling rule changes" do
+    let(:original_rules) do
+      {
+        version: "1.0",
+        ruleset: "approval",
+        rules: [
+          {
+            id: "auto_approve",
+            if: { field: "score", op: "gte", value: 80 },
+            then: { decision: "approve", weight: 0.9, reason: "High score" }
+          }
+        ]
+      }
+    end
+
+    let(:modified_rules) do
+      {
+        version: "2.0",
+        ruleset: "approval",
+        rules: [
+          {
+            id: "auto_approve",
+            if: { field: "score", op: "gte", value: 90 }, # Changed threshold
+            then: { decision: "approve", weight: 0.9, reason: "Very high score" }
+          }
+        ]
+      }
+    end
+
+    it "successfully replays with strict mode when rules haven't changed" do
+      evaluator = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: original_rules)
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+
+      original_result = agent.decide(context: { score: 85 })
+
+      expect {
+        DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      }.not_to raise_error
+    end
+
+    it "detects differences in strict mode when rules have changed" do
+      # Original decision with old rules
+      evaluator_v1 = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: original_rules)
+      agent_v1 = DecisionAgent::Agent.new(evaluators: [evaluator_v1])
+      original_result = agent_v1.decide(context: { score: 85 })
+
+      # Now the rules have changed (threshold increased from 80 to 90)
+      # Score of 85 no longer matches, so replay should detect a difference
+
+      # Replay uses the stored evaluations (not re-evaluating rules)
+      # So it should succeed because replay uses static evaluators from the audit payload
+      expect {
+        DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      }.not_to raise_error
+
+      # The replayed result should match the original
+      replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      expect(replayed_result.decision).to eq(original_result.decision)
+      expect(replayed_result.confidence).to eq(original_result.confidence)
+    end
+
+    it "allows evolution in non-strict mode" do
+      evaluator = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: original_rules)
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+
+      original_result = agent.decide(context: { score: 85 })
+
+      # In non-strict mode, differences are logged but don't raise errors
+      expect {
+        DecisionAgent::Replay.run(original_result.audit_payload, strict: false)
+      }.not_to raise_error
+    end
+  end
+
+  describe "metadata comparison" do
+    it "preserves and replays metadata correctly" do
+      rules = {
+        version: "1.0",
+        ruleset: "test",
+        rules: [
+          {
+            id: "metadata_test_rule",
+            if: { field: "user", op: "eq", value: "alice" },
+            then: {
+              decision: "approve",
+              weight: 0.8,
+              reason: "Trusted user"
+            }
+          }
+        ]
+      }
+
+      evaluator = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: rules)
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+
+      original_result = agent.decide(context: { user: "alice" })
+
+      # Verify metadata is in the audit payload
+      expect(original_result.audit_payload[:evaluations].first[:metadata]).to include(
+        rule_id: "metadata_test_rule"
+      )
+
+      # Replay should preserve metadata
+      replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+
+      expect(replayed_result.evaluations.first.metadata).to eq(
+        original_result.evaluations.first.metadata
+      )
+    end
+
+    it "handles metadata from static evaluators" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.7,
+        reason: "No custom metadata"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+      original_result = agent.decide(context: { user: "bob" })
+
+      # StaticEvaluator adds type: "static" by default
+      expect(original_result.evaluations.first.metadata).to eq({ type: "static" })
+
+      expect {
+        DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      }.not_to raise_error
+
+      replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      expect(replayed_result.evaluations.first.metadata).to eq({ type: "static" })
+    end
+
+    it "handles complex nested metadata" do
+      evaluation = DecisionAgent::Evaluation.new(
+        decision: "escalate",
+        weight: 0.85,
+        reason: "Complex case",
+        evaluator_name: "CustomEvaluator",
+        metadata: {
+          user: { id: 123, role: "admin" },
+          tags: ["urgent", "important"],
+          history: [
+            { action: "created", timestamp: "2025-01-01" },
+            { action: "updated", timestamp: "2025-01-02" }
+          ]
+        }
+      )
+
+      static_eval = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: evaluation.decision,
+        weight: evaluation.weight,
+        reason: evaluation.reason
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [static_eval])
+      original_result = agent.decide(context: { test: true })
+
+      # Manually construct audit payload with complex metadata
+      payload = original_result.audit_payload.dup
+      payload[:evaluations] = [evaluation.to_h]
+
+      replayed_result = DecisionAgent::Replay.run(payload, strict: false)
+
+      expect(replayed_result.evaluations.first.metadata).to be_a(Hash)
+    end
+  end
+
+  describe "handling missing evaluators in replay" do
+    it "replays successfully even if original evaluator class doesn't exist" do
+      # This simulates a scenario where we had a CustomEvaluator that no longer exists
+      # but we can still replay the decision from the audit log
+
+      # WeightedAverage normalizes confidence: with one eval of weight 0.9, confidence = 0.9/0.9 = 1.0
+      # So we need to use the correct confidence value that WeightedAverage would produce
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { user: "charlie", action: "login" },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "allow",
+            weight: 0.9,
+            reason: "User authenticated successfully",
+            evaluator_name: "DeletedCustomAuthEvaluator", # This evaluator no longer exists
+            metadata: { auth_method: "oauth", provider: "google" }
+          }
+        ],
+        decision: "allow",
+        confidence: 1.0, # WeightedAverage normalizes single eval to 1.0
+        scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
+        agent_version: "0.1.0",
+        deterministic_hash: "abc123"
+      }
+
+      # Replay should work because it uses StaticEvaluator, not the original evaluator
+      expect {
+        DecisionAgent::Replay.run(audit_payload, strict: true)
+      }.not_to raise_error
+
+      replayed_result = DecisionAgent::Replay.run(audit_payload, strict: true)
+
+      expect(replayed_result.decision).to eq("allow")
+      expect(replayed_result.confidence).to eq(1.0)
+      expect(replayed_result.evaluations.first.evaluator_name).to eq("DeletedCustomAuthEvaluator")
+    end
+
+    it "handles multiple evaluators where some are missing" do
+      # WeightedAverage with two evals agreeing: confidence = (0.8 + 0.7) / (0.8 + 0.7) = 1.0
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { user: "dave" },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.8,
+            reason: "Rule matched",
+            evaluator_name: "RuleEngine",
+            metadata: { rule_id: "rule_123" }
+          },
+          {
+            decision: "approve",
+            weight: 0.7,
+            reason: "ML model prediction",
+            evaluator_name: "NonExistentMLEvaluator", # Missing evaluator
+            metadata: { model_version: "v2.1" }
+          }
+        ],
+        decision: "approve",
+        confidence: 1.0, # Both agree, so 100% confidence
+        scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
+        agent_version: "0.1.0",
+        deterministic_hash: "def456"
+      }
+
+      replayed_result = DecisionAgent::Replay.run(audit_payload, strict: true)
+
+      expect(replayed_result.decision).to eq("approve")
+      expect(replayed_result.evaluations.size).to eq(2)
+      expect(replayed_result.evaluations.map(&:evaluator_name)).to include("NonExistentMLEvaluator")
+    end
+  end
+
+  describe "scoring strategy evolution" do
+    it "handles unknown scoring strategies gracefully" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.9,
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 0.9,
+        scoring_strategy: "DecisionAgent::Scoring::DeprecatedBayesianStrategy", # Doesn't exist
+        agent_version: "0.1.0",
+        deterministic_hash: "ghi789"
+      }
+
+      # Should fall back to WeightedAverage
+      expect {
+        DecisionAgent::Replay.run(audit_payload, strict: false)
+      }.not_to raise_error
+
+      replayed_result = DecisionAgent::Replay.run(audit_payload, strict: false)
+      expect(replayed_result.decision).to eq("approve")
+    end
+
+    it "detects scoring strategy mismatch in strict mode" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.6,
+        reason: "Test"
+      )
+
+      # Create decision with WeightedAverage
+      agent_weighted = DecisionAgent::Agent.new(
+        evaluators: [evaluator],
+        scoring_strategy: DecisionAgent::Scoring::WeightedAverage.new
+      )
+
+      original_result = agent_weighted.decide(context: { test: true })
+
+      # Replay uses the stored scoring strategy from the audit payload
+      # So it should replay successfully
+      expect {
+        DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      }.not_to raise_error
+    end
+  end
+
+  describe "audit payload validation" do
+    it "requires context field" do
+      incomplete_payload = {
+        evaluations: [],
+        decision: "test",
+        confidence: 0.5
+      }
+
+      expect {
+        DecisionAgent::Replay.run(incomplete_payload, strict: false)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: context/)
+    end
+
+    it "requires evaluations field" do
+      incomplete_payload = {
+        context: { test: true },
+        decision: "test",
+        confidence: 0.5
+      }
+
+      expect {
+        DecisionAgent::Replay.run(incomplete_payload, strict: false)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: evaluations/)
+    end
+
+    it "requires decision field" do
+      incomplete_payload = {
+        context: { test: true },
+        evaluations: [],
+        confidence: 0.5
+      }
+
+      expect {
+        DecisionAgent::Replay.run(incomplete_payload, strict: false)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: decision/)
+    end
+
+    it "requires confidence field" do
+      incomplete_payload = {
+        context: { test: true },
+        evaluations: [],
+        decision: "test"
+      }
+
+      expect {
+        DecisionAgent::Replay.run(incomplete_payload, strict: false)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: confidence/)
+    end
+
+    it "accepts both symbol and string keys" do
+      # Use MaxWeight strategy which preserves the original weight as confidence
+      payload_with_strings = {
+        "timestamp" => "2025-01-15T10:00:00.123456Z",
+        "context" => { "test" => true },
+        "feedback" => {},
+        "evaluations" => [
+          {
+            "decision" => "approve",
+            "weight" => 0.9,
+            "reason" => "Test",
+            "evaluator_name" => "TestEvaluator",
+            "metadata" => {}
+          }
+        ],
+        "decision" => "approve",
+        "confidence" => 0.9,
+        "scoring_strategy" => "DecisionAgent::Scoring::MaxWeight"
+      }
+
+      expect {
+        DecisionAgent::Replay.run(payload_with_strings, strict: true)
+      }.not_to raise_error
+    end
+  end
+
+  describe "deterministic hash verification" do
+    it "can verify replay produced the same deterministic hash" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.8,
+        reason: "Test"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+      original_result = agent.decide(context: { user: "test" })
+
+      original_hash = original_result.audit_payload[:deterministic_hash]
+
+      replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      replayed_hash = replayed_result.audit_payload[:deterministic_hash]
+
+      # Hashes should match because same context, evaluations, decision, confidence, and strategy
+      expect(replayed_hash).to eq(original_hash)
+    end
+
+    it "hash changes when context changes" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.8,
+        reason: "Test"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+
+      result1 = agent.decide(context: { user: "alice" })
+      result2 = agent.decide(context: { user: "bob" })
+
+      expect(result1.audit_payload[:deterministic_hash]).not_to eq(
+        result2.audit_payload[:deterministic_hash]
+      )
+    end
+  end
+
+  describe "feedback preservation in replay" do
+    it "preserves original feedback in replay" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.8,
+        reason: "Test"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+
+      original_feedback = { user_id: "manager_123", source: "manual_review" }
+      original_result = agent.decide(context: { test: true }, feedback: original_feedback)
+
+      expect(original_result.audit_payload[:feedback]).to eq(original_feedback)
+
+      replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+
+      expect(replayed_result.audit_payload[:feedback]).to eq(original_feedback)
+    end
+
+    it "handles empty feedback" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.8,
+        reason: "Test"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+      original_result = agent.decide(context: { test: true })
+
+      expect(original_result.audit_payload[:feedback]).to eq({})
+
+      replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
+      expect(replayed_result.audit_payload[:feedback]).to eq({})
+    end
+  end
+
+  describe "version mismatch scenarios" do
+    it "logs warning when agent_version differs in non-strict mode" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.8,
+        reason: "Test"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+      original_result = agent.decide(context: { test: true })
+
+      # Modify agent_version
+      modified_payload = original_result.audit_payload.dup
+      modified_payload[:agent_version] = "99.0.0" # Different version
+
+      # Non-strict mode should log but not raise
+      expect {
+        DecisionAgent::Replay.run(modified_payload, strict: false)
+      }.not_to raise_error
+
+      # Should successfully replay despite version difference
+      replayed_result = DecisionAgent::Replay.run(modified_payload, strict: false)
+      expect(replayed_result.decision).to eq("approve")
+    end
+
+    it "accepts different agent_version in non-strict mode" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.9,
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 1.0,
+        scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
+        agent_version: "0.0.1", # Old version
+        deterministic_hash: "old_hash"
+      }
+
+      # Should accept and replay successfully
+      result = DecisionAgent::Replay.run(audit_payload, strict: false)
+      expect(result.decision).to eq("approve")
+    end
+
+    it "replays successfully in strict mode regardless of version" do
+      evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
+        decision: "approve",
+        weight: 0.8,
+        reason: "Test"
+      )
+
+      agent = DecisionAgent::Agent.new(evaluators: [evaluator])
+      original_result = agent.decide(context: { test: true })
+
+      # Modify agent_version
+      modified_payload = original_result.audit_payload.dup
+      modified_payload[:agent_version] = "2.0.0"
+
+      # Strict mode should still work because version is not part of deterministic comparison
+      # (only decision and confidence are compared in strict mode)
+      expect {
+        DecisionAgent::Replay.run(modified_payload, strict: true)
+      }.not_to raise_error
+    end
+  end
+
+  describe "corrupted audit payload scenarios" do
+    it "handles missing deterministic_hash gracefully" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.9,
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 1.0,
+        scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
+        agent_version: "0.1.0"
+        # deterministic_hash is missing
+      }
+
+      # Should not raise error, just creates new hash during replay
+      expect {
+        DecisionAgent::Replay.run(audit_payload, strict: false)
+      }.not_to raise_error
+
+      result = DecisionAgent::Replay.run(audit_payload, strict: false)
+      expect(result.decision).to eq("approve")
+      expect(result.audit_payload[:deterministic_hash]).to be_a(String)
+    end
+
+    it "handles invalid deterministic_hash gracefully" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.9,
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 1.0,
+        scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
+        agent_version: "0.1.0",
+        deterministic_hash: "corrupted_invalid_hash_12345"
+      }
+
+      # Should replay successfully, generating new hash
+      result = DecisionAgent::Replay.run(audit_payload, strict: false)
+      expect(result.decision).to eq("approve")
+      # New hash should be different from corrupted one
+      expect(result.audit_payload[:deterministic_hash]).not_to eq("corrupted_invalid_hash_12345")
+    end
+
+    it "validates required fields before replay" do
+      # Missing context
+      expect {
+        DecisionAgent::Replay.run({ decision: "test", confidence: 0.5, evaluations: [] }, strict: true)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /context/)
+
+      # Missing evaluations
+      expect {
+        DecisionAgent::Replay.run({ context: {}, decision: "test", confidence: 0.5 }, strict: true)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /evaluations/)
+
+      # Missing decision
+      expect {
+        DecisionAgent::Replay.run({ context: {}, evaluations: [], confidence: 0.5 }, strict: true)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /decision/)
+
+      # Missing confidence
+      expect {
+        DecisionAgent::Replay.run({ context: {}, evaluations: [], decision: "test" }, strict: true)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError, /confidence/)
+    end
+
+    it "handles evaluation with invalid weight" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 2.5, # Weight > 1.0, invalid
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 1.0,
+        scoring_strategy: "DecisionAgent::Scoring::WeightedAverage"
+      }
+
+      # Invalid weight (> 1.0) should raise error when creating Evaluation
+      expect {
+        DecisionAgent::Replay.run(audit_payload, strict: false)
+      }.to raise_error(DecisionAgent::InvalidWeightError)
+    end
+
+    it "handles completely empty audit payload" do
+      expect {
+        DecisionAgent::Replay.run({}, strict: false)
+      }.to raise_error(DecisionAgent::InvalidRuleDslError)
+    end
+
+    it "handles nil audit payload" do
+      expect {
+        DecisionAgent::Replay.run(nil, strict: false)
+      }.to raise_error
+    end
+  end
+
+  describe "scoring strategy class rename scenarios" do
+    it "handles renamed scoring strategy class in non-strict mode" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.9,
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 0.9,
+        scoring_strategy: "DecisionAgent::Scoring::OldStrategyName", # Renamed or deleted
+        agent_version: "0.1.0"
+      }
+
+      # Should fall back to default strategy (WeightedAverage)
+      expect {
+        DecisionAgent::Replay.run(audit_payload, strict: false)
+      }.not_to raise_error
+
+      result = DecisionAgent::Replay.run(audit_payload, strict: false)
+      expect(result.decision).to eq("approve")
+    end
+
+    it "handles custom scoring strategy not in current codebase" do
+      audit_payload = {
+        timestamp: "2025-01-15T10:00:00.123456Z",
+        context: { test: true },
+        feedback: {},
+        evaluations: [
+          {
+            decision: "approve",
+            weight: 0.85,
+            reason: "Test",
+            evaluator_name: "TestEvaluator",
+            metadata: {}
+          }
+        ],
+        decision: "approve",
+        confidence: 0.85,
+        scoring_strategy: "MyCompany::CustomMLBasedScoringStrategy", # Custom strategy
+        agent_version: "0.1.0"
+      }
+
+      # Should use fallback strategy
+      result = DecisionAgent::Replay.run(audit_payload, strict: false)
+      expect(result).not_to be_nil
+      expect(result.decision).to eq("approve")
+    end
+  end
+end
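
For orientation, the round trip these specs exercise is: record a decision, persist its audit payload, then feed that payload back to DecisionAgent::Replay.run. Below is a minimal sketch, assuming only the API surface shown in the specs above (Agent.new(evaluators:), decide(context:), audit_payload, and Replay.run(payload, strict:)); it is an illustration, not part of the gem's documentation.

require "decision_agent"

# Record a decision and capture its audit payload.
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
  decision: "approve",
  weight: 0.8,
  reason: "Example"
)
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
result = agent.decide(context: { user: "alice" })
payload = result.audit_payload

# Later (possibly in another process), reproduce the decision from the
# payload alone. Per the specs, strict: true raises if the replayed
# decision or confidence diverge; strict: false logs differences instead.
replayed = DecisionAgent::Replay.run(payload, strict: true)
replayed.decision == result.decision     # => true
replayed.confidence == result.confidence # => true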