decision_agent 0.3.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +272 -7
- data/lib/decision_agent/agent.rb +72 -1
- data/lib/decision_agent/context.rb +1 -0
- data/lib/decision_agent/data_enrichment/cache/memory_adapter.rb +86 -0
- data/lib/decision_agent/data_enrichment/cache_adapter.rb +49 -0
- data/lib/decision_agent/data_enrichment/circuit_breaker.rb +135 -0
- data/lib/decision_agent/data_enrichment/client.rb +220 -0
- data/lib/decision_agent/data_enrichment/config.rb +78 -0
- data/lib/decision_agent/data_enrichment/errors.rb +36 -0
- data/lib/decision_agent/decision.rb +102 -2
- data/lib/decision_agent/dmn/feel/evaluator.rb +28 -6
- data/lib/decision_agent/dsl/condition_evaluator.rb +982 -839
- data/lib/decision_agent/dsl/schema_validator.rb +51 -13
- data/lib/decision_agent/evaluators/dmn_evaluator.rb +106 -19
- data/lib/decision_agent/evaluators/json_rule_evaluator.rb +69 -9
- data/lib/decision_agent/explainability/condition_trace.rb +83 -0
- data/lib/decision_agent/explainability/explainability_result.rb +52 -0
- data/lib/decision_agent/explainability/rule_trace.rb +39 -0
- data/lib/decision_agent/explainability/trace_collector.rb +24 -0
- data/lib/decision_agent/monitoring/alert_manager.rb +5 -1
- data/lib/decision_agent/simulation/errors.rb +18 -0
- data/lib/decision_agent/simulation/impact_analyzer.rb +498 -0
- data/lib/decision_agent/simulation/monte_carlo_simulator.rb +635 -0
- data/lib/decision_agent/simulation/replay_engine.rb +486 -0
- data/lib/decision_agent/simulation/scenario_engine.rb +318 -0
- data/lib/decision_agent/simulation/scenario_library.rb +163 -0
- data/lib/decision_agent/simulation/shadow_test_engine.rb +287 -0
- data/lib/decision_agent/simulation/what_if_analyzer.rb +1002 -0
- data/lib/decision_agent/simulation.rb +17 -0
- data/lib/decision_agent/version.rb +1 -1
- data/lib/decision_agent/versioning/activerecord_adapter.rb +23 -8
- data/lib/decision_agent/web/public/app.js +119 -0
- data/lib/decision_agent/web/public/index.html +49 -0
- data/lib/decision_agent/web/public/simulation.html +130 -0
- data/lib/decision_agent/web/public/simulation_impact.html +478 -0
- data/lib/decision_agent/web/public/simulation_replay.html +551 -0
- data/lib/decision_agent/web/public/simulation_shadow.html +546 -0
- data/lib/decision_agent/web/public/simulation_whatif.html +532 -0
- data/lib/decision_agent/web/public/styles.css +65 -0
- data/lib/decision_agent/web/server.rb +594 -23
- data/lib/decision_agent.rb +60 -2
- metadata +53 -73
- data/spec/ab_testing/ab_test_assignment_spec.rb +0 -253
- data/spec/ab_testing/ab_test_manager_spec.rb +0 -612
- data/spec/ab_testing/ab_test_spec.rb +0 -270
- data/spec/ab_testing/ab_testing_agent_spec.rb +0 -655
- data/spec/ab_testing/storage/adapter_spec.rb +0 -64
- data/spec/ab_testing/storage/memory_adapter_spec.rb +0 -485
- data/spec/activerecord_thread_safety_spec.rb +0 -553
- data/spec/advanced_operators_spec.rb +0 -3150
- data/spec/agent_spec.rb +0 -289
- data/spec/api_contract_spec.rb +0 -430
- data/spec/audit_adapters_spec.rb +0 -92
- data/spec/auth/access_audit_logger_spec.rb +0 -394
- data/spec/auth/authenticator_spec.rb +0 -112
- data/spec/auth/password_reset_spec.rb +0 -294
- data/spec/auth/permission_checker_spec.rb +0 -207
- data/spec/auth/permission_spec.rb +0 -73
- data/spec/auth/rbac_adapter_spec.rb +0 -778
- data/spec/auth/rbac_config_spec.rb +0 -82
- data/spec/auth/role_spec.rb +0 -51
- data/spec/auth/session_manager_spec.rb +0 -172
- data/spec/auth/session_spec.rb +0 -112
- data/spec/auth/user_spec.rb +0 -130
- data/spec/comprehensive_edge_cases_spec.rb +0 -1777
- data/spec/context_spec.rb +0 -127
- data/spec/decision_agent_spec.rb +0 -96
- data/spec/decision_spec.rb +0 -423
- data/spec/dmn/decision_graph_spec.rb +0 -282
- data/spec/dmn/decision_tree_spec.rb +0 -203
- data/spec/dmn/feel/errors_spec.rb +0 -18
- data/spec/dmn/feel/functions_spec.rb +0 -400
- data/spec/dmn/feel/simple_parser_spec.rb +0 -274
- data/spec/dmn/feel/types_spec.rb +0 -176
- data/spec/dmn/feel_parser_spec.rb +0 -489
- data/spec/dmn/hit_policy_spec.rb +0 -202
- data/spec/dmn/integration_spec.rb +0 -226
- data/spec/dsl/condition_evaluator_spec.rb +0 -774
- data/spec/dsl_validation_spec.rb +0 -648
- data/spec/edge_cases_spec.rb +0 -353
- data/spec/evaluation_spec.rb +0 -364
- data/spec/evaluation_validator_spec.rb +0 -165
- data/spec/examples/feedback_aware_evaluator_spec.rb +0 -460
- data/spec/examples.txt +0 -1909
- data/spec/fixtures/dmn/complex_decision.dmn +0 -81
- data/spec/fixtures/dmn/invalid_structure.dmn +0 -31
- data/spec/fixtures/dmn/simple_decision.dmn +0 -40
- data/spec/issue_verification_spec.rb +0 -759
- data/spec/json_rule_evaluator_spec.rb +0 -587
- data/spec/monitoring/alert_manager_spec.rb +0 -378
- data/spec/monitoring/metrics_collector_spec.rb +0 -501
- data/spec/monitoring/monitored_agent_spec.rb +0 -225
- data/spec/monitoring/prometheus_exporter_spec.rb +0 -242
- data/spec/monitoring/storage/activerecord_adapter_spec.rb +0 -498
- data/spec/monitoring/storage/base_adapter_spec.rb +0 -61
- data/spec/monitoring/storage/memory_adapter_spec.rb +0 -247
- data/spec/performance_optimizations_spec.rb +0 -493
- data/spec/replay_edge_cases_spec.rb +0 -699
- data/spec/replay_spec.rb +0 -210
- data/spec/rfc8785_canonicalization_spec.rb +0 -215
- data/spec/scoring_spec.rb +0 -225
- data/spec/spec_helper.rb +0 -60
- data/spec/testing/batch_test_importer_spec.rb +0 -693
- data/spec/testing/batch_test_runner_spec.rb +0 -307
- data/spec/testing/test_coverage_analyzer_spec.rb +0 -292
- data/spec/testing/test_result_comparator_spec.rb +0 -392
- data/spec/testing/test_scenario_spec.rb +0 -113
- data/spec/thread_safety_spec.rb +0 -490
- data/spec/thread_safety_spec.rb.broken +0 -878
- data/spec/versioning/adapter_spec.rb +0 -156
- data/spec/versioning_spec.rb +0 -1030
- data/spec/web/middleware/auth_middleware_spec.rb +0 -133
- data/spec/web/middleware/permission_middleware_spec.rb +0 -247
- data/spec/web_ui_rack_spec.rb +0 -2134
|
@@ -1,699 +0,0 @@
|
|
|
1
|
-
require "spec_helper"
|
|
2
|
-
|
|
3
|
-
RSpec.describe "DecisionAgent::Replay Edge Cases" do
|
|
4
|
-
describe "handling rule changes" do
|
|
5
|
-
let(:original_rules) do
|
|
6
|
-
{
|
|
7
|
-
version: "1.0",
|
|
8
|
-
ruleset: "approval",
|
|
9
|
-
rules: [
|
|
10
|
-
{
|
|
11
|
-
id: "auto_approve",
|
|
12
|
-
if: { field: "score", op: "gte", value: 80 },
|
|
13
|
-
then: { decision: "approve", weight: 0.9, reason: "High score" }
|
|
14
|
-
}
|
|
15
|
-
]
|
|
16
|
-
}
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
let(:modified_rules) do
|
|
20
|
-
{
|
|
21
|
-
version: "2.0",
|
|
22
|
-
ruleset: "approval",
|
|
23
|
-
rules: [
|
|
24
|
-
{
|
|
25
|
-
id: "auto_approve",
|
|
26
|
-
if: { field: "score", op: "gte", value: 90 }, # Changed threshold
|
|
27
|
-
then: { decision: "approve", weight: 0.9, reason: "Very high score" }
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
it "successfully replays with strict mode when rules haven't changed" do
|
|
34
|
-
evaluator = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: original_rules)
|
|
35
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
36
|
-
|
|
37
|
-
original_result = agent.decide(context: { score: 85 })
|
|
38
|
-
|
|
39
|
-
expect do
|
|
40
|
-
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
41
|
-
end.not_to raise_error
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
it "detects differences in strict mode when rules have changed" do
|
|
45
|
-
# Original decision with old rules
|
|
46
|
-
evaluator_v1 = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: original_rules)
|
|
47
|
-
agent_v1 = DecisionAgent::Agent.new(evaluators: [evaluator_v1])
|
|
48
|
-
original_result = agent_v1.decide(context: { score: 85 })
|
|
49
|
-
|
|
50
|
-
# Now the rules have changed (threshold increased from 80 to 90)
|
|
51
|
-
# Score of 85 no longer matches, so replay should detect a difference
|
|
52
|
-
|
|
53
|
-
# Replay uses the stored evaluations (not re-evaluating rules)
|
|
54
|
-
# So it should succeed because replay uses static evaluators from the audit payload
|
|
55
|
-
expect do
|
|
56
|
-
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
57
|
-
end.not_to raise_error
|
|
58
|
-
|
|
59
|
-
# The replayed result should match the original
|
|
60
|
-
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
61
|
-
expect(replayed_result.decision).to eq(original_result.decision)
|
|
62
|
-
expect(replayed_result.confidence).to eq(original_result.confidence)
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
it "allows evolution in non-strict mode" do
|
|
66
|
-
evaluator = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: original_rules)
|
|
67
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
68
|
-
|
|
69
|
-
original_result = agent.decide(context: { score: 85 })
|
|
70
|
-
|
|
71
|
-
# In non-strict mode, differences are logged but don't raise errors
|
|
72
|
-
expect do
|
|
73
|
-
DecisionAgent::Replay.run(original_result.audit_payload, strict: false)
|
|
74
|
-
end.not_to raise_error
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
describe "metadata comparison" do
|
|
79
|
-
it "preserves and replays metadata correctly" do
|
|
80
|
-
rules = {
|
|
81
|
-
version: "1.0",
|
|
82
|
-
ruleset: "test",
|
|
83
|
-
rules: [
|
|
84
|
-
{
|
|
85
|
-
id: "metadata_test_rule",
|
|
86
|
-
if: { field: "user", op: "eq", value: "alice" },
|
|
87
|
-
then: {
|
|
88
|
-
decision: "approve",
|
|
89
|
-
weight: 0.8,
|
|
90
|
-
reason: "Trusted user"
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
]
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
evaluator = DecisionAgent::Evaluators::JsonRuleEvaluator.new(rules_json: rules)
|
|
97
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
98
|
-
|
|
99
|
-
original_result = agent.decide(context: { user: "alice" })
|
|
100
|
-
|
|
101
|
-
# Verify metadata is in the audit payload
|
|
102
|
-
expect(original_result.audit_payload[:evaluations].first[:metadata]).to include(
|
|
103
|
-
rule_id: "metadata_test_rule"
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
# Replay should preserve metadata
|
|
107
|
-
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
108
|
-
|
|
109
|
-
expect(replayed_result.evaluations.first.metadata).to eq(
|
|
110
|
-
original_result.evaluations.first.metadata
|
|
111
|
-
)
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
it "handles metadata from static evaluators" do
|
|
115
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
116
|
-
decision: "approve",
|
|
117
|
-
weight: 0.7,
|
|
118
|
-
reason: "No custom metadata"
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
122
|
-
original_result = agent.decide(context: { user: "bob" })
|
|
123
|
-
|
|
124
|
-
# StaticEvaluator adds type: "static" by default
|
|
125
|
-
expect(original_result.evaluations.first.metadata).to eq({ type: "static" })
|
|
126
|
-
|
|
127
|
-
expect do
|
|
128
|
-
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
129
|
-
end.not_to raise_error
|
|
130
|
-
|
|
131
|
-
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
132
|
-
expect(replayed_result.evaluations.first.metadata).to eq({ type: "static" })
|
|
133
|
-
end
|
|
134
|
-
|
|
135
|
-
it "handles complex nested metadata" do
|
|
136
|
-
evaluation = DecisionAgent::Evaluation.new(
|
|
137
|
-
decision: "escalate",
|
|
138
|
-
weight: 0.85,
|
|
139
|
-
reason: "Complex case",
|
|
140
|
-
evaluator_name: "CustomEvaluator",
|
|
141
|
-
metadata: {
|
|
142
|
-
user: { id: 123, role: "admin" },
|
|
143
|
-
tags: %w[urgent important],
|
|
144
|
-
history: [
|
|
145
|
-
{ action: "created", timestamp: "2025-01-01" },
|
|
146
|
-
{ action: "updated", timestamp: "2025-01-02" }
|
|
147
|
-
]
|
|
148
|
-
}
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
static_eval = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
152
|
-
decision: evaluation.decision,
|
|
153
|
-
weight: evaluation.weight,
|
|
154
|
-
reason: evaluation.reason
|
|
155
|
-
)
|
|
156
|
-
|
|
157
|
-
agent = DecisionAgent::Agent.new(evaluators: [static_eval])
|
|
158
|
-
original_result = agent.decide(context: { test: true })
|
|
159
|
-
|
|
160
|
-
# Manually construct audit payload with complex metadata
|
|
161
|
-
payload = original_result.audit_payload.dup
|
|
162
|
-
payload[:evaluations] = [evaluation.to_h]
|
|
163
|
-
|
|
164
|
-
replayed_result = DecisionAgent::Replay.run(payload, strict: false)
|
|
165
|
-
|
|
166
|
-
expect(replayed_result.evaluations.first.metadata).to be_a(Hash)
|
|
167
|
-
end
|
|
168
|
-
end
|
|
169
|
-
|
|
170
|
-
describe "handling missing evaluators in replay" do
|
|
171
|
-
it "replays successfully even if original evaluator class doesn't exist" do
|
|
172
|
-
# This simulates a scenario where we had a CustomEvaluator that no longer exists
|
|
173
|
-
# but we can still replay the decision from the audit log
|
|
174
|
-
|
|
175
|
-
# WeightedAverage normalizes confidence: with one eval of weight 0.9, confidence = 0.9/0.9 = 1.0
|
|
176
|
-
# So we need to use the correct confidence value that WeightedAverage would produce
|
|
177
|
-
audit_payload = {
|
|
178
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
179
|
-
context: { user: "charlie", action: "login" },
|
|
180
|
-
feedback: {},
|
|
181
|
-
evaluations: [
|
|
182
|
-
{
|
|
183
|
-
decision: "allow",
|
|
184
|
-
weight: 0.9,
|
|
185
|
-
reason: "User authenticated successfully",
|
|
186
|
-
evaluator_name: "DeletedCustomAuthEvaluator", # This evaluator no longer exists
|
|
187
|
-
metadata: { auth_method: "oauth", provider: "google" }
|
|
188
|
-
}
|
|
189
|
-
],
|
|
190
|
-
decision: "allow",
|
|
191
|
-
confidence: 1.0, # WeightedAverage normalizes single eval to 1.0
|
|
192
|
-
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
193
|
-
agent_version: "0.1.0",
|
|
194
|
-
deterministic_hash: "abc123"
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
# Replay should work because it uses StaticEvaluator, not the original evaluator
|
|
198
|
-
expect do
|
|
199
|
-
DecisionAgent::Replay.run(audit_payload, strict: true)
|
|
200
|
-
end.not_to raise_error
|
|
201
|
-
|
|
202
|
-
replayed_result = DecisionAgent::Replay.run(audit_payload, strict: true)
|
|
203
|
-
|
|
204
|
-
expect(replayed_result.decision).to eq("allow")
|
|
205
|
-
expect(replayed_result.confidence).to eq(1.0)
|
|
206
|
-
expect(replayed_result.evaluations.first.evaluator_name).to eq("DeletedCustomAuthEvaluator")
|
|
207
|
-
end
|
|
208
|
-
|
|
209
|
-
it "handles multiple evaluators where some are missing" do
|
|
210
|
-
# WeightedAverage with two evals agreeing: confidence = (0.8 + 0.7) / (0.8 + 0.7) = 1.0
|
|
211
|
-
audit_payload = {
|
|
212
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
213
|
-
context: { user: "dave" },
|
|
214
|
-
feedback: {},
|
|
215
|
-
evaluations: [
|
|
216
|
-
{
|
|
217
|
-
decision: "approve",
|
|
218
|
-
weight: 0.8,
|
|
219
|
-
reason: "Rule matched",
|
|
220
|
-
evaluator_name: "RuleEngine",
|
|
221
|
-
metadata: { rule_id: "rule_123" }
|
|
222
|
-
},
|
|
223
|
-
{
|
|
224
|
-
decision: "approve",
|
|
225
|
-
weight: 0.7,
|
|
226
|
-
reason: "ML model prediction",
|
|
227
|
-
evaluator_name: "NonExistentMLEvaluator", # Missing evaluator
|
|
228
|
-
metadata: { model_version: "v2.1" }
|
|
229
|
-
}
|
|
230
|
-
],
|
|
231
|
-
decision: "approve",
|
|
232
|
-
confidence: 1.0, # Both agree, so 100% confidence
|
|
233
|
-
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
234
|
-
agent_version: "0.1.0",
|
|
235
|
-
deterministic_hash: "def456"
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
replayed_result = DecisionAgent::Replay.run(audit_payload, strict: true)
|
|
239
|
-
|
|
240
|
-
expect(replayed_result.decision).to eq("approve")
|
|
241
|
-
expect(replayed_result.evaluations.size).to eq(2)
|
|
242
|
-
expect(replayed_result.evaluations.map(&:evaluator_name)).to include("NonExistentMLEvaluator")
|
|
243
|
-
end
|
|
244
|
-
end
|
|
245
|
-
|
|
246
|
-
describe "scoring strategy evolution" do
|
|
247
|
-
it "handles unknown scoring strategies gracefully" do
|
|
248
|
-
audit_payload = {
|
|
249
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
250
|
-
context: { test: true },
|
|
251
|
-
feedback: {},
|
|
252
|
-
evaluations: [
|
|
253
|
-
{
|
|
254
|
-
decision: "approve",
|
|
255
|
-
weight: 0.9,
|
|
256
|
-
reason: "Test",
|
|
257
|
-
evaluator_name: "TestEvaluator",
|
|
258
|
-
metadata: {}
|
|
259
|
-
}
|
|
260
|
-
],
|
|
261
|
-
decision: "approve",
|
|
262
|
-
confidence: 0.9,
|
|
263
|
-
scoring_strategy: "DecisionAgent::Scoring::DeprecatedBayesianStrategy", # Doesn't exist
|
|
264
|
-
agent_version: "0.1.0",
|
|
265
|
-
deterministic_hash: "ghi789"
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
# Should fall back to WeightedAverage
|
|
269
|
-
expect do
|
|
270
|
-
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
271
|
-
end.not_to raise_error
|
|
272
|
-
|
|
273
|
-
replayed_result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
274
|
-
expect(replayed_result.decision).to eq("approve")
|
|
275
|
-
end
|
|
276
|
-
|
|
277
|
-
it "detects scoring strategy mismatch in strict mode" do
|
|
278
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
279
|
-
decision: "approve",
|
|
280
|
-
weight: 0.6,
|
|
281
|
-
reason: "Test"
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
# Create decision with WeightedAverage
|
|
285
|
-
agent_weighted = DecisionAgent::Agent.new(
|
|
286
|
-
evaluators: [evaluator],
|
|
287
|
-
scoring_strategy: DecisionAgent::Scoring::WeightedAverage.new
|
|
288
|
-
)
|
|
289
|
-
|
|
290
|
-
original_result = agent_weighted.decide(context: { test: true })
|
|
291
|
-
|
|
292
|
-
# Replay uses the stored scoring strategy from the audit payload
|
|
293
|
-
# So it should replay successfully
|
|
294
|
-
expect do
|
|
295
|
-
DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
296
|
-
end.not_to raise_error
|
|
297
|
-
end
|
|
298
|
-
end
|
|
299
|
-
|
|
300
|
-
describe "audit payload validation" do
|
|
301
|
-
it "requires context field" do
|
|
302
|
-
incomplete_payload = {
|
|
303
|
-
evaluations: [],
|
|
304
|
-
decision: "test",
|
|
305
|
-
confidence: 0.5
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
expect do
|
|
309
|
-
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
310
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: context/)
|
|
311
|
-
end
|
|
312
|
-
|
|
313
|
-
it "requires evaluations field" do
|
|
314
|
-
incomplete_payload = {
|
|
315
|
-
context: { test: true },
|
|
316
|
-
decision: "test",
|
|
317
|
-
confidence: 0.5
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
expect do
|
|
321
|
-
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
322
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: evaluations/)
|
|
323
|
-
end
|
|
324
|
-
|
|
325
|
-
it "requires decision field" do
|
|
326
|
-
incomplete_payload = {
|
|
327
|
-
context: { test: true },
|
|
328
|
-
evaluations: [],
|
|
329
|
-
confidence: 0.5
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
expect do
|
|
333
|
-
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
334
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: decision/)
|
|
335
|
-
end
|
|
336
|
-
|
|
337
|
-
it "requires confidence field" do
|
|
338
|
-
incomplete_payload = {
|
|
339
|
-
context: { test: true },
|
|
340
|
-
evaluations: [],
|
|
341
|
-
decision: "test"
|
|
342
|
-
}
|
|
343
|
-
|
|
344
|
-
expect do
|
|
345
|
-
DecisionAgent::Replay.run(incomplete_payload, strict: false)
|
|
346
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /missing required key: confidence/)
|
|
347
|
-
end
|
|
348
|
-
|
|
349
|
-
it "accepts both symbol and string keys" do
|
|
350
|
-
# Use MaxWeight strategy which preserves the original weight as confidence
|
|
351
|
-
payload_with_strings = {
|
|
352
|
-
"timestamp" => "2025-01-15T10:00:00.123456Z",
|
|
353
|
-
"context" => { "test" => true },
|
|
354
|
-
"feedback" => {},
|
|
355
|
-
"evaluations" => [
|
|
356
|
-
{
|
|
357
|
-
"decision" => "approve",
|
|
358
|
-
"weight" => 0.9,
|
|
359
|
-
"reason" => "Test",
|
|
360
|
-
"evaluator_name" => "TestEvaluator",
|
|
361
|
-
"metadata" => {}
|
|
362
|
-
}
|
|
363
|
-
],
|
|
364
|
-
"decision" => "approve",
|
|
365
|
-
"confidence" => 0.9,
|
|
366
|
-
"scoring_strategy" => "DecisionAgent::Scoring::MaxWeight"
|
|
367
|
-
}
|
|
368
|
-
|
|
369
|
-
expect do
|
|
370
|
-
DecisionAgent::Replay.run(payload_with_strings, strict: true)
|
|
371
|
-
end.not_to raise_error
|
|
372
|
-
end
|
|
373
|
-
end
|
|
374
|
-
|
|
375
|
-
describe "deterministic hash verification" do
|
|
376
|
-
it "can verify replay produced the same deterministic hash" do
|
|
377
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
378
|
-
decision: "approve",
|
|
379
|
-
weight: 0.8,
|
|
380
|
-
reason: "Test"
|
|
381
|
-
)
|
|
382
|
-
|
|
383
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
384
|
-
original_result = agent.decide(context: { user: "test" })
|
|
385
|
-
|
|
386
|
-
original_hash = original_result.audit_payload[:deterministic_hash]
|
|
387
|
-
|
|
388
|
-
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
389
|
-
replayed_hash = replayed_result.audit_payload[:deterministic_hash]
|
|
390
|
-
|
|
391
|
-
# Hashes should match because same context, evaluations, decision, confidence, and strategy
|
|
392
|
-
expect(replayed_hash).to eq(original_hash)
|
|
393
|
-
end
|
|
394
|
-
|
|
395
|
-
it "hash changes when context changes" do
|
|
396
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
397
|
-
decision: "approve",
|
|
398
|
-
weight: 0.8,
|
|
399
|
-
reason: "Test"
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
403
|
-
|
|
404
|
-
result1 = agent.decide(context: { user: "alice" })
|
|
405
|
-
result2 = agent.decide(context: { user: "bob" })
|
|
406
|
-
|
|
407
|
-
expect(result1.audit_payload[:deterministic_hash]).not_to eq(
|
|
408
|
-
result2.audit_payload[:deterministic_hash]
|
|
409
|
-
)
|
|
410
|
-
end
|
|
411
|
-
end
|
|
412
|
-
|
|
413
|
-
describe "feedback preservation in replay" do
|
|
414
|
-
it "preserves original feedback in replay" do
|
|
415
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
416
|
-
decision: "approve",
|
|
417
|
-
weight: 0.8,
|
|
418
|
-
reason: "Test"
|
|
419
|
-
)
|
|
420
|
-
|
|
421
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
422
|
-
|
|
423
|
-
original_feedback = { user_id: "manager_123", source: "manual_review" }
|
|
424
|
-
original_result = agent.decide(context: { test: true }, feedback: original_feedback)
|
|
425
|
-
|
|
426
|
-
expect(original_result.audit_payload[:feedback]).to eq(original_feedback)
|
|
427
|
-
|
|
428
|
-
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
429
|
-
|
|
430
|
-
expect(replayed_result.audit_payload[:feedback]).to eq(original_feedback)
|
|
431
|
-
end
|
|
432
|
-
|
|
433
|
-
it "handles empty feedback" do
|
|
434
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
435
|
-
decision: "approve",
|
|
436
|
-
weight: 0.8,
|
|
437
|
-
reason: "Test"
|
|
438
|
-
)
|
|
439
|
-
|
|
440
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
441
|
-
original_result = agent.decide(context: { test: true })
|
|
442
|
-
|
|
443
|
-
expect(original_result.audit_payload[:feedback]).to eq({})
|
|
444
|
-
|
|
445
|
-
replayed_result = DecisionAgent::Replay.run(original_result.audit_payload, strict: true)
|
|
446
|
-
expect(replayed_result.audit_payload[:feedback]).to eq({})
|
|
447
|
-
end
|
|
448
|
-
end
|
|
449
|
-
|
|
450
|
-
describe "version mismatch scenarios" do
|
|
451
|
-
it "logs warning when agent_version differs in non-strict mode" do
|
|
452
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
453
|
-
decision: "approve",
|
|
454
|
-
weight: 0.8,
|
|
455
|
-
reason: "Test"
|
|
456
|
-
)
|
|
457
|
-
|
|
458
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
459
|
-
original_result = agent.decide(context: { test: true })
|
|
460
|
-
|
|
461
|
-
# Modify agent_version
|
|
462
|
-
modified_payload = original_result.audit_payload.dup
|
|
463
|
-
modified_payload[:agent_version] = "99.0.0" # Different version
|
|
464
|
-
|
|
465
|
-
# Non-strict mode should log but not raise
|
|
466
|
-
expect do
|
|
467
|
-
DecisionAgent::Replay.run(modified_payload, strict: false)
|
|
468
|
-
end.not_to raise_error
|
|
469
|
-
|
|
470
|
-
# Should successfully replay despite version difference
|
|
471
|
-
replayed_result = DecisionAgent::Replay.run(modified_payload, strict: false)
|
|
472
|
-
expect(replayed_result.decision).to eq("approve")
|
|
473
|
-
end
|
|
474
|
-
|
|
475
|
-
it "accepts different agent_version in non-strict mode" do
|
|
476
|
-
audit_payload = {
|
|
477
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
478
|
-
context: { test: true },
|
|
479
|
-
feedback: {},
|
|
480
|
-
evaluations: [
|
|
481
|
-
{
|
|
482
|
-
decision: "approve",
|
|
483
|
-
weight: 0.9,
|
|
484
|
-
reason: "Test",
|
|
485
|
-
evaluator_name: "TestEvaluator",
|
|
486
|
-
metadata: {}
|
|
487
|
-
}
|
|
488
|
-
],
|
|
489
|
-
decision: "approve",
|
|
490
|
-
confidence: 1.0,
|
|
491
|
-
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
492
|
-
agent_version: "0.0.1", # Old version
|
|
493
|
-
deterministic_hash: "old_hash"
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
# Should accept and replay successfully
|
|
497
|
-
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
498
|
-
expect(result.decision).to eq("approve")
|
|
499
|
-
end
|
|
500
|
-
|
|
501
|
-
it "replays successfully in strict mode regardless of version" do
|
|
502
|
-
evaluator = DecisionAgent::Evaluators::StaticEvaluator.new(
|
|
503
|
-
decision: "approve",
|
|
504
|
-
weight: 0.8,
|
|
505
|
-
reason: "Test"
|
|
506
|
-
)
|
|
507
|
-
|
|
508
|
-
agent = DecisionAgent::Agent.new(evaluators: [evaluator])
|
|
509
|
-
original_result = agent.decide(context: { test: true })
|
|
510
|
-
|
|
511
|
-
# Modify agent_version
|
|
512
|
-
modified_payload = original_result.audit_payload.dup
|
|
513
|
-
modified_payload[:agent_version] = "2.0.0"
|
|
514
|
-
|
|
515
|
-
# Strict mode should still work because version is not part of deterministic comparison
|
|
516
|
-
# (only decision and confidence are compared in strict mode)
|
|
517
|
-
expect do
|
|
518
|
-
DecisionAgent::Replay.run(modified_payload, strict: true)
|
|
519
|
-
end.not_to raise_error
|
|
520
|
-
end
|
|
521
|
-
end
|
|
522
|
-
|
|
523
|
-
describe "corrupted audit payload scenarios" do
|
|
524
|
-
it "handles missing deterministic_hash gracefully" do
|
|
525
|
-
audit_payload = {
|
|
526
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
527
|
-
context: { test: true },
|
|
528
|
-
feedback: {},
|
|
529
|
-
evaluations: [
|
|
530
|
-
{
|
|
531
|
-
decision: "approve",
|
|
532
|
-
weight: 0.9,
|
|
533
|
-
reason: "Test",
|
|
534
|
-
evaluator_name: "TestEvaluator",
|
|
535
|
-
metadata: {}
|
|
536
|
-
}
|
|
537
|
-
],
|
|
538
|
-
decision: "approve",
|
|
539
|
-
confidence: 1.0,
|
|
540
|
-
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
541
|
-
agent_version: "0.1.0"
|
|
542
|
-
# deterministic_hash is missing
|
|
543
|
-
}
|
|
544
|
-
|
|
545
|
-
# Should not raise error, just creates new hash during replay
|
|
546
|
-
expect do
|
|
547
|
-
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
548
|
-
end.not_to raise_error
|
|
549
|
-
|
|
550
|
-
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
551
|
-
expect(result.decision).to eq("approve")
|
|
552
|
-
expect(result.audit_payload[:deterministic_hash]).to be_a(String)
|
|
553
|
-
end
|
|
554
|
-
|
|
555
|
-
it "handles invalid deterministic_hash gracefully" do
|
|
556
|
-
audit_payload = {
|
|
557
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
558
|
-
context: { test: true },
|
|
559
|
-
feedback: {},
|
|
560
|
-
evaluations: [
|
|
561
|
-
{
|
|
562
|
-
decision: "approve",
|
|
563
|
-
weight: 0.9,
|
|
564
|
-
reason: "Test",
|
|
565
|
-
evaluator_name: "TestEvaluator",
|
|
566
|
-
metadata: {}
|
|
567
|
-
}
|
|
568
|
-
],
|
|
569
|
-
decision: "approve",
|
|
570
|
-
confidence: 1.0,
|
|
571
|
-
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage",
|
|
572
|
-
agent_version: "0.1.0",
|
|
573
|
-
deterministic_hash: "corrupted_invalid_hash_12345"
|
|
574
|
-
}
|
|
575
|
-
|
|
576
|
-
# Should replay successfully, generating new hash
|
|
577
|
-
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
578
|
-
expect(result.decision).to eq("approve")
|
|
579
|
-
# New hash should be different from corrupted one
|
|
580
|
-
expect(result.audit_payload[:deterministic_hash]).not_to eq("corrupted_invalid_hash_12345")
|
|
581
|
-
end
|
|
582
|
-
|
|
583
|
-
it "validates required fields before replay" do
|
|
584
|
-
# Missing context
|
|
585
|
-
expect do
|
|
586
|
-
DecisionAgent::Replay.run({ decision: "test", confidence: 0.5, evaluations: [] }, strict: true)
|
|
587
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /context/)
|
|
588
|
-
|
|
589
|
-
# Missing evaluations
|
|
590
|
-
expect do
|
|
591
|
-
DecisionAgent::Replay.run({ context: {}, decision: "test", confidence: 0.5 }, strict: true)
|
|
592
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /evaluations/)
|
|
593
|
-
|
|
594
|
-
# Missing decision
|
|
595
|
-
expect do
|
|
596
|
-
DecisionAgent::Replay.run({ context: {}, evaluations: [], confidence: 0.5 }, strict: true)
|
|
597
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /decision/)
|
|
598
|
-
|
|
599
|
-
# Missing confidence
|
|
600
|
-
expect do
|
|
601
|
-
DecisionAgent::Replay.run({ context: {}, evaluations: [], decision: "test" }, strict: true)
|
|
602
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError, /confidence/)
|
|
603
|
-
end
|
|
604
|
-
|
|
605
|
-
it "handles evaluation with invalid weight" do
|
|
606
|
-
audit_payload = {
|
|
607
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
608
|
-
context: { test: true },
|
|
609
|
-
feedback: {},
|
|
610
|
-
evaluations: [
|
|
611
|
-
{
|
|
612
|
-
decision: "approve",
|
|
613
|
-
weight: 2.5, # Weight > 1.0, invalid
|
|
614
|
-
reason: "Test",
|
|
615
|
-
evaluator_name: "TestEvaluator",
|
|
616
|
-
metadata: {}
|
|
617
|
-
}
|
|
618
|
-
],
|
|
619
|
-
decision: "approve",
|
|
620
|
-
confidence: 1.0,
|
|
621
|
-
scoring_strategy: "DecisionAgent::Scoring::WeightedAverage"
|
|
622
|
-
}
|
|
623
|
-
|
|
624
|
-
# Invalid weight (> 1.0) should raise error when creating Evaluation
|
|
625
|
-
expect do
|
|
626
|
-
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
627
|
-
end.to raise_error(DecisionAgent::InvalidWeightError)
|
|
628
|
-
end
|
|
629
|
-
|
|
630
|
-
it "handles completely empty audit payload" do
|
|
631
|
-
expect do
|
|
632
|
-
DecisionAgent::Replay.run({}, strict: false)
|
|
633
|
-
end.to raise_error(DecisionAgent::InvalidRuleDslError)
|
|
634
|
-
end
|
|
635
|
-
|
|
636
|
-
it "handles nil audit payload" do
|
|
637
|
-
expect do
|
|
638
|
-
DecisionAgent::Replay.run(nil, strict: false)
|
|
639
|
-
end.to raise_error
|
|
640
|
-
end
|
|
641
|
-
end
|
|
642
|
-
|
|
643
|
-
describe "scoring strategy class rename scenarios" do
|
|
644
|
-
it "handles renamed scoring strategy class in non-strict mode" do
|
|
645
|
-
audit_payload = {
|
|
646
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
647
|
-
context: { test: true },
|
|
648
|
-
feedback: {},
|
|
649
|
-
evaluations: [
|
|
650
|
-
{
|
|
651
|
-
decision: "approve",
|
|
652
|
-
weight: 0.9,
|
|
653
|
-
reason: "Test",
|
|
654
|
-
evaluator_name: "TestEvaluator",
|
|
655
|
-
metadata: {}
|
|
656
|
-
}
|
|
657
|
-
],
|
|
658
|
-
decision: "approve",
|
|
659
|
-
confidence: 0.9,
|
|
660
|
-
scoring_strategy: "DecisionAgent::Scoring::OldStrategyName", # Renamed or deleted
|
|
661
|
-
agent_version: "0.1.0"
|
|
662
|
-
}
|
|
663
|
-
|
|
664
|
-
# Should fall back to default strategy (WeightedAverage)
|
|
665
|
-
expect do
|
|
666
|
-
DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
667
|
-
end.not_to raise_error
|
|
668
|
-
|
|
669
|
-
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
670
|
-
expect(result.decision).to eq("approve")
|
|
671
|
-
end
|
|
672
|
-
|
|
673
|
-
it "handles custom scoring strategy not in current codebase" do
|
|
674
|
-
audit_payload = {
|
|
675
|
-
timestamp: "2025-01-15T10:00:00.123456Z",
|
|
676
|
-
context: { test: true },
|
|
677
|
-
feedback: {},
|
|
678
|
-
evaluations: [
|
|
679
|
-
{
|
|
680
|
-
decision: "approve",
|
|
681
|
-
weight: 0.85,
|
|
682
|
-
reason: "Test",
|
|
683
|
-
evaluator_name: "TestEvaluator",
|
|
684
|
-
metadata: {}
|
|
685
|
-
}
|
|
686
|
-
],
|
|
687
|
-
decision: "approve",
|
|
688
|
-
confidence: 0.85,
|
|
689
|
-
scoring_strategy: "MyCompany::CustomMLBasedScoringStrategy", # Custom strategy
|
|
690
|
-
agent_version: "0.1.0"
|
|
691
|
-
}
|
|
692
|
-
|
|
693
|
-
# Should use fallback strategy
|
|
694
|
-
result = DecisionAgent::Replay.run(audit_payload, strict: false)
|
|
695
|
-
expect(result).not_to be_nil
|
|
696
|
-
expect(result.decision).to eq("approve")
|
|
697
|
-
end
|
|
698
|
-
end
|
|
699
|
-
end
|