decision_agent 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. checksums.yaml +4 -4
  2. data/README.md +84 -233
  3. data/lib/decision_agent/ab_testing/ab_test.rb +197 -0
  4. data/lib/decision_agent/ab_testing/ab_test_assignment.rb +76 -0
  5. data/lib/decision_agent/ab_testing/ab_test_manager.rb +317 -0
  6. data/lib/decision_agent/ab_testing/ab_testing_agent.rb +188 -0
  7. data/lib/decision_agent/ab_testing/storage/activerecord_adapter.rb +155 -0
  8. data/lib/decision_agent/ab_testing/storage/adapter.rb +67 -0
  9. data/lib/decision_agent/ab_testing/storage/memory_adapter.rb +116 -0
  10. data/lib/decision_agent/agent.rb +5 -3
  11. data/lib/decision_agent/auth/access_audit_logger.rb +122 -0
  12. data/lib/decision_agent/auth/authenticator.rb +127 -0
  13. data/lib/decision_agent/auth/password_reset_manager.rb +57 -0
  14. data/lib/decision_agent/auth/password_reset_token.rb +33 -0
  15. data/lib/decision_agent/auth/permission.rb +29 -0
  16. data/lib/decision_agent/auth/permission_checker.rb +43 -0
  17. data/lib/decision_agent/auth/rbac_adapter.rb +278 -0
  18. data/lib/decision_agent/auth/rbac_config.rb +51 -0
  19. data/lib/decision_agent/auth/role.rb +56 -0
  20. data/lib/decision_agent/auth/session.rb +33 -0
  21. data/lib/decision_agent/auth/session_manager.rb +57 -0
  22. data/lib/decision_agent/auth/user.rb +70 -0
  23. data/lib/decision_agent/context.rb +24 -4
  24. data/lib/decision_agent/decision.rb +10 -3
  25. data/lib/decision_agent/dsl/condition_evaluator.rb +378 -1
  26. data/lib/decision_agent/dsl/schema_validator.rb +8 -1
  27. data/lib/decision_agent/errors.rb +38 -0
  28. data/lib/decision_agent/evaluation.rb +10 -3
  29. data/lib/decision_agent/evaluation_validator.rb +8 -13
  30. data/lib/decision_agent/monitoring/dashboard_server.rb +1 -0
  31. data/lib/decision_agent/monitoring/metrics_collector.rb +164 -7
  32. data/lib/decision_agent/monitoring/storage/activerecord_adapter.rb +253 -0
  33. data/lib/decision_agent/monitoring/storage/base_adapter.rb +90 -0
  34. data/lib/decision_agent/monitoring/storage/memory_adapter.rb +222 -0
  35. data/lib/decision_agent/testing/batch_test_importer.rb +373 -0
  36. data/lib/decision_agent/testing/batch_test_runner.rb +244 -0
  37. data/lib/decision_agent/testing/test_coverage_analyzer.rb +191 -0
  38. data/lib/decision_agent/testing/test_result_comparator.rb +235 -0
  39. data/lib/decision_agent/testing/test_scenario.rb +42 -0
  40. data/lib/decision_agent/version.rb +10 -1
  41. data/lib/decision_agent/versioning/activerecord_adapter.rb +1 -1
  42. data/lib/decision_agent/versioning/file_storage_adapter.rb +96 -28
  43. data/lib/decision_agent/web/middleware/auth_middleware.rb +45 -0
  44. data/lib/decision_agent/web/middleware/permission_middleware.rb +94 -0
  45. data/lib/decision_agent/web/public/app.js +184 -29
  46. data/lib/decision_agent/web/public/batch_testing.html +640 -0
  47. data/lib/decision_agent/web/public/index.html +37 -9
  48. data/lib/decision_agent/web/public/login.html +298 -0
  49. data/lib/decision_agent/web/public/users.html +679 -0
  50. data/lib/decision_agent/web/server.rb +873 -7
  51. data/lib/decision_agent.rb +59 -0
  52. data/lib/generators/decision_agent/install/install_generator.rb +37 -0
  53. data/lib/generators/decision_agent/install/templates/ab_test_assignment_model.rb +45 -0
  54. data/lib/generators/decision_agent/install/templates/ab_test_model.rb +54 -0
  55. data/lib/generators/decision_agent/install/templates/ab_testing_migration.rb +43 -0
  56. data/lib/generators/decision_agent/install/templates/ab_testing_tasks.rake +189 -0
  57. data/lib/generators/decision_agent/install/templates/decision_agent_tasks.rake +114 -0
  58. data/lib/generators/decision_agent/install/templates/decision_log.rb +57 -0
  59. data/lib/generators/decision_agent/install/templates/error_metric.rb +53 -0
  60. data/lib/generators/decision_agent/install/templates/evaluation_metric.rb +43 -0
  61. data/lib/generators/decision_agent/install/templates/monitoring_migration.rb +109 -0
  62. data/lib/generators/decision_agent/install/templates/performance_metric.rb +76 -0
  63. data/lib/generators/decision_agent/install/templates/rule_version.rb +1 -1
  64. data/spec/ab_testing/ab_test_assignment_spec.rb +253 -0
  65. data/spec/ab_testing/ab_test_manager_spec.rb +612 -0
  66. data/spec/ab_testing/ab_test_spec.rb +270 -0
  67. data/spec/ab_testing/ab_testing_agent_spec.rb +481 -0
  68. data/spec/ab_testing/storage/adapter_spec.rb +64 -0
  69. data/spec/ab_testing/storage/memory_adapter_spec.rb +485 -0
  70. data/spec/advanced_operators_spec.rb +1003 -0
  71. data/spec/agent_spec.rb +40 -0
  72. data/spec/audit_adapters_spec.rb +18 -0
  73. data/spec/auth/access_audit_logger_spec.rb +394 -0
  74. data/spec/auth/authenticator_spec.rb +112 -0
  75. data/spec/auth/password_reset_spec.rb +294 -0
  76. data/spec/auth/permission_checker_spec.rb +207 -0
  77. data/spec/auth/permission_spec.rb +73 -0
  78. data/spec/auth/rbac_adapter_spec.rb +550 -0
  79. data/spec/auth/rbac_config_spec.rb +82 -0
  80. data/spec/auth/role_spec.rb +51 -0
  81. data/spec/auth/session_manager_spec.rb +172 -0
  82. data/spec/auth/session_spec.rb +112 -0
  83. data/spec/auth/user_spec.rb +130 -0
  84. data/spec/context_spec.rb +43 -0
  85. data/spec/decision_agent_spec.rb +96 -0
  86. data/spec/decision_spec.rb +423 -0
  87. data/spec/dsl/condition_evaluator_spec.rb +774 -0
  88. data/spec/evaluation_spec.rb +364 -0
  89. data/spec/evaluation_validator_spec.rb +165 -0
  90. data/spec/examples.txt +1542 -548
  91. data/spec/issue_verification_spec.rb +95 -21
  92. data/spec/monitoring/metrics_collector_spec.rb +221 -3
  93. data/spec/monitoring/monitored_agent_spec.rb +1 -1
  94. data/spec/monitoring/prometheus_exporter_spec.rb +1 -1
  95. data/spec/monitoring/storage/activerecord_adapter_spec.rb +498 -0
  96. data/spec/monitoring/storage/base_adapter_spec.rb +61 -0
  97. data/spec/monitoring/storage/memory_adapter_spec.rb +247 -0
  98. data/spec/performance_optimizations_spec.rb +486 -0
  99. data/spec/spec_helper.rb +23 -0
  100. data/spec/testing/batch_test_importer_spec.rb +693 -0
  101. data/spec/testing/batch_test_runner_spec.rb +307 -0
  102. data/spec/testing/test_coverage_analyzer_spec.rb +292 -0
  103. data/spec/testing/test_result_comparator_spec.rb +392 -0
  104. data/spec/testing/test_scenario_spec.rb +113 -0
  105. data/spec/versioning/adapter_spec.rb +156 -0
  106. data/spec/versioning_spec.rb +253 -0
  107. data/spec/web/middleware/auth_middleware_spec.rb +133 -0
  108. data/spec/web/middleware/permission_middleware_spec.rb +247 -0
  109. data/spec/web_ui_rack_spec.rb +1705 -0
  110. metadata +123 -6
data/lib/decision_agent/testing/batch_test_runner.rb
@@ -0,0 +1,244 @@
+ require "json"
+
+ module DecisionAgent
+   module Testing
+     # Result of a single test scenario execution
+     class TestResult
+       attr_reader :scenario_id, :decision, :confidence, :execution_time_ms, :error, :evaluations
+
+       def initialize(scenario_id:, decision: nil, confidence: nil, execution_time_ms: 0, error: nil, evaluations: [])
+         @scenario_id = scenario_id.to_s.freeze
+         @decision = decision&.to_s&.freeze
+         @confidence = confidence&.to_f
+         @execution_time_ms = execution_time_ms.to_f
+         @error = error
+         @evaluations = evaluations.freeze
+
+         freeze
+       end
+
+       def success?
+         @error.nil?
+       end
+
+       def to_h
+         {
+           scenario_id: @scenario_id,
+           decision: @decision,
+           confidence: @confidence,
+           execution_time_ms: @execution_time_ms,
+           error: @error&.message,
+           success: success?,
+           evaluations: @evaluations.map { |e| e.respond_to?(:to_h) ? e.to_h : e }
+         }
+       end
+     end
+
+     # Executes batch tests against an agent
+     class BatchTestRunner
+       attr_reader :agent, :results
+
+       def initialize(agent)
+         @agent = agent
+         @results = []
+         @checkpoint_file = nil
+       end
+
+       # Run batch tests against scenarios
+       # @param scenarios [Array<TestScenario>] Test scenarios to execute
+       # @param options [Hash] Execution options
+       #   - :parallel [Boolean] Use parallel execution (default: true)
+       #   - :thread_count [Integer] Number of threads for parallel execution (default: 4)
+       #   - :progress_callback [Proc] Callback for progress updates (called with { completed: N, total: M, percentage: X })
+       #   - :feedback [Hash] Optional feedback to pass to agent
+       #   - :checkpoint_file [String] Path to checkpoint file for resume capability (optional)
+       # @return [Array<TestResult>] Array of test results
+       # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+       def run(scenarios, options = {})
+         @results = []
+         @checkpoint_file = options[:checkpoint_file]
+         options = {
+           parallel: true,
+           thread_count: 4,
+           progress_callback: nil,
+           feedback: {},
+           checkpoint_file: nil
+         }.merge(options)
+
+         total = scenarios.size
+         completed = 0
+         mutex = Mutex.new
+
+         # Load checkpoint if exists
+         completed_scenario_ids = load_checkpoint if @checkpoint_file && File.exist?(@checkpoint_file)
+
+         # Filter out already completed scenarios
+         remaining_scenarios = if completed_scenario_ids&.any?
+                                 scenarios.reject { |s| completed_scenario_ids.include?(s.id) }
+                               else
+                                 scenarios
+                               end
+
+         if options[:parallel] && remaining_scenarios.size > 1
+           run_parallel(remaining_scenarios, options, mutex) do |result|
+             completed += 1
+             save_checkpoint(result.scenario_id) if @checkpoint_file
+             options[:progress_callback]&.call(
+               completed: completed + (completed_scenario_ids&.size || 0),
+               total: total,
+               percentage: ((completed + (completed_scenario_ids&.size || 0)).to_f / total * 100).round(2)
+             )
+           end
+         else
+           remaining_scenarios.each_with_index do |scenario, index|
+             result = execute_scenario(scenario, options[:feedback])
+             @results << result
+             save_checkpoint(result.scenario_id) if @checkpoint_file
+             completed = index + 1
+             options[:progress_callback]&.call(
+               completed: completed + (completed_scenario_ids&.size || 0),
+               total: total,
+               percentage: ((completed + (completed_scenario_ids&.size || 0)).to_f / total * 100).round(2)
+             )
+           end
+         end
+
+         # Clean up checkpoint file on successful completion
+         delete_checkpoint if @checkpoint_file && File.exist?(@checkpoint_file)
+
+         @results
+       end
+
+       # Resume batch test execution from a checkpoint
+       # @param scenarios [Array<TestScenario>] All test scenarios (including already completed ones)
+       # @param checkpoint_file [String] Path to checkpoint file
+       # @param options [Hash] Same as run method
+       # @return [Array<TestResult>] Array of test results (only newly executed ones)
+       def resume(scenarios, checkpoint_file, options = {})
+         options[:checkpoint_file] = checkpoint_file
+         run(scenarios, options)
+       end
+
+       # Get execution statistics
+       # @return [Hash] Statistics about the batch test run
+       def statistics
+         return {} if @results.empty?
+
+         successful = @results.count(&:success?)
+         failed = @results.size - successful
+         execution_times = @results.map(&:execution_time_ms).compact
+
+         {
+           total: @results.size,
+           successful: successful,
+           failed: failed,
+           success_rate: successful.to_f / @results.size,
+           avg_execution_time_ms: execution_times.any? ? execution_times.sum / execution_times.size : 0,
+           min_execution_time_ms: execution_times.min || 0,
+           max_execution_time_ms: execution_times.max || 0,
+           total_execution_time_ms: execution_times.sum
+         }
+       end
+       # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+
+       private
+
+       def run_parallel(scenarios, options, mutex)
+         thread_count = [options[:thread_count], scenarios.size].min
+         queue = Queue.new
+         scenarios.each { |s| queue << s }
+
+         threads = Array.new(thread_count) do
+           Thread.new do
+             loop do
+               scenario = begin
+                 queue.pop(true)
+               rescue StandardError
+                 nil
+               end
+               break unless scenario
+
+               result = execute_scenario(scenario, options[:feedback])
+               mutex.synchronize do
+                 @results << result
+                 yield result
+               end
+             end
+           end
+         end
+
+         threads.each(&:join)
+       end
+
+       def execute_scenario(scenario, feedback)
+         start_time = Time.now
+
+         begin
+           decision = @agent.decide(context: scenario.context, feedback: feedback)
+
+           execution_time_ms = ((Time.now - start_time) * 1000).round(2)
+
+           TestResult.new(
+             scenario_id: scenario.id,
+             decision: decision.decision,
+             confidence: decision.confidence,
+             execution_time_ms: execution_time_ms,
+             evaluations: decision.evaluations
+           )
+         rescue StandardError => e
+           execution_time_ms = ((Time.now - start_time) * 1000).round(2)
+
+           TestResult.new(
+             scenario_id: scenario.id,
+             execution_time_ms: execution_time_ms,
+             error: e
+           )
+         end
+       end
+
+       def save_checkpoint(scenario_id)
+         return unless @checkpoint_file
+
+         checkpoint_data = load_checkpoint_data
+         checkpoint_data[:completed_scenario_ids] << scenario_id.to_s unless checkpoint_data[:completed_scenario_ids].include?(scenario_id.to_s)
+         checkpoint_data[:last_updated] = Time.now.to_i
+
+         File.write(@checkpoint_file, JSON.pretty_generate(checkpoint_data))
+       rescue StandardError => e
+         # Silently fail checkpoint saving to not interrupt test execution
+         warn "Failed to save checkpoint: #{e.message}" if $VERBOSE
+       end
+
+       def load_checkpoint
+         return [] unless @checkpoint_file && File.exist?(@checkpoint_file)
+
+         checkpoint_data = load_checkpoint_data
+         checkpoint_data[:completed_scenario_ids] || []
+       rescue StandardError => e
+         warn "Failed to load checkpoint: #{e.message}" if $VERBOSE
+         []
+       end
+
+       def load_checkpoint_data
+         return { completed_scenario_ids: [], last_updated: nil } unless @checkpoint_file && File.exist?(@checkpoint_file)
+
+         content = File.read(@checkpoint_file)
+         data = JSON.parse(content, symbolize_names: true)
+         data[:completed_scenario_ids] ||= []
+         data
+       rescue JSON::ParserError
+         { completed_scenario_ids: [], last_updated: nil }
+       rescue StandardError
+         { completed_scenario_ids: [], last_updated: nil }
+       end
+
+       def delete_checkpoint
+         return unless @checkpoint_file && File.exist?(@checkpoint_file)
+
+         File.delete(@checkpoint_file)
+       rescue StandardError => e
+         warn "Failed to delete checkpoint: #{e.message}" if $VERBOSE
+       end
+     end
+   end
+ end
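
For orientation, a minimal usage sketch of the batch runner added above. This is not part of the diff: the agent construction is omitted, the TestScenario keyword arguments and file paths are assumptions, and only BatchTestRunner#run, #resume, #statistics and the documented option keys come from the code shown.

    require "decision_agent"

    # `agent` is any configured DecisionAgent agent that responds to
    # `decide(context:, feedback:)`, which is what execute_scenario calls.
    # The TestScenario keyword arguments below are an assumption based on the
    # `scenario.id` / `scenario.context` accessors the runner relies on.
    scenarios = [
      DecisionAgent::Testing::TestScenario.new(id: "s1", context: { "amount" => 100 }),
      DecisionAgent::Testing::TestScenario.new(id: "s2", context: { "amount" => 10_000 })
    ]

    runner  = DecisionAgent::Testing::BatchTestRunner.new(agent)
    results = runner.run(
      scenarios,
      parallel: true,
      thread_count: 4,
      checkpoint_file: "tmp/batch_checkpoint.json",   # placeholder path; enables resume(...) after an interruption
      progress_callback: ->(**progress) { puts "#{progress[:percentage]}%" }
    )

    pp runner.statistics   # total / successful / failed / success_rate / timing figures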
data/lib/decision_agent/testing/test_coverage_analyzer.rb
@@ -0,0 +1,191 @@
+ require "set"
+
+ module DecisionAgent
+   module Testing
+     # Coverage report for test scenarios
+     class CoverageReport
+       attr_reader :total_rules, :covered_rules, :untested_rules, :coverage_percentage, :rule_coverage, :condition_coverage
+
+       def initialize(total_rules:, covered_rules:, untested_rules:, coverage_percentage:, rule_coverage:, condition_coverage:)
+         @total_rules = total_rules
+         @covered_rules = covered_rules
+         @untested_rules = untested_rules.freeze
+         @coverage_percentage = coverage_percentage
+         @rule_coverage = rule_coverage.freeze
+         @condition_coverage = condition_coverage.freeze
+
+         freeze
+       end
+
+       def to_h
+         {
+           total_rules: @total_rules,
+           covered_rules: @covered_rules,
+           untested_rules: @untested_rules,
+           coverage_percentage: @coverage_percentage,
+           rule_coverage: @rule_coverage,
+           condition_coverage: @condition_coverage
+         }
+       end
+     end
+
+     # Analyzes test coverage of rules and conditions
+     class TestCoverageAnalyzer
+       def initialize
+         @executed_rules = Set.new
+         @executed_conditions = Set.new
+         @rule_evaluation_count = {}
+         @condition_evaluation_count = {}
+       end
+
+       # Analyze coverage from test results
+       # @param results [Array<TestResult>] Test results from batch execution
+       # @param agent [Agent] The agent used for testing (to get all available rules)
+       # @return [CoverageReport] Coverage report
+       def analyze(results, agent = nil)
+         reset
+
+         # Track which rules and conditions were executed
+         results.each do |result|
+           next unless result.success?
+
+           result.evaluations.each do |evaluation|
+             track_evaluation(evaluation)
+           end
+         end
+
+         # Get all available rules from agent if provided
+         all_rules = agent ? extract_rules_from_agent(agent) : []
+         all_conditions = agent ? extract_conditions_from_agent(agent) : []
+
+         generate_report(all_rules, all_conditions)
+       end
+
+       # Get coverage percentage
+       # @return [Float] Coverage percentage (0.0 to 1.0)
+       def coverage_percentage
+         return 0.0 if @executed_rules.empty?
+
+         total = @rule_evaluation_count.size
+         return 0.0 if total.zero?
+
+         @executed_rules.size.to_f / total
+       end
+
+       private
+
+       def reset
+         @executed_rules = Set.new
+         @executed_conditions = Set.new
+         @rule_evaluation_count = {}
+         @condition_evaluation_count = {}
+       end
+
+       def track_evaluation(evaluation)
+         # Extract rule identifier from evaluation
+         rule_id = extract_rule_id(evaluation)
+         condition_id = extract_condition_id(evaluation)
+
+         if rule_id
+           @executed_rules << rule_id
+           @rule_evaluation_count[rule_id] = (@rule_evaluation_count[rule_id] || 0) + 1
+         end
+
+         return unless condition_id
+
+         @executed_conditions << condition_id
+         @condition_evaluation_count[condition_id] = (@condition_evaluation_count[condition_id] || 0) + 1
+       end
+
+       def extract_rule_id(evaluation)
+         # Try to get rule_id from metadata
+         return evaluation.metadata[:rule_id] if evaluation.respond_to?(:metadata) && evaluation.metadata.is_a?(Hash)
+
+         # Fallback to evaluator_name as rule identifier
+         return evaluation.evaluator_name if evaluation.respond_to?(:evaluator_name)
+
+         nil
+       end
+
+       def extract_condition_id(evaluation)
+         # Try to get condition_id from metadata
+         return evaluation.metadata[:condition_id] if evaluation.respond_to?(:metadata) && evaluation.metadata.is_a?(Hash)
+
+         nil
+       end
+
+       def extract_rules_from_agent(agent)
+         rules = []
+
+         agent.evaluators.each do |evaluator|
+           # Try to extract rule information from evaluator
+           if evaluator.respond_to?(:rules)
+             rules.concat(Array(evaluator.rules))
+           elsif evaluator.respond_to?(:rule_id)
+             rules << evaluator.rule_id
+           else
+             # Use evaluator class name as rule identifier
+             rules << evaluator.class.name
+           end
+         end
+
+         rules.uniq
+       end
+
+       def extract_conditions_from_agent(agent)
+         conditions = []
+
+         agent.evaluators.each do |evaluator|
+           # Try to extract condition information from evaluator
+           if evaluator.respond_to?(:conditions)
+             conditions.concat(Array(evaluator.conditions))
+           elsif evaluator.respond_to?(:condition_id)
+             conditions << evaluator.condition_id
+           end
+         end
+
+         conditions.uniq
+       end
+
+       def generate_report(all_rules, all_conditions)
+         total_rules = all_rules.any? ? all_rules.size : @executed_rules.size
+         covered_rules = @executed_rules.size
+         untested_rules = all_rules.any? ? (all_rules - @executed_rules.to_a) : []
+
+         # Cap coverage at 1.0 (100%)
+         coverage_percentage = if total_rules.positive?
+                                 [(covered_rules.to_f / total_rules), 1.0].min
+                               else
+                                 0.0
+                               end
+
+         # Build rule coverage details
+         rule_coverage = all_rules.map do |rule|
+           {
+             rule_id: rule,
+             covered: @executed_rules.include?(rule),
+             execution_count: @rule_evaluation_count[rule] || 0
+           }
+         end
+
+         # Build condition coverage details
+         condition_coverage = all_conditions.map do |condition|
+           {
+             condition_id: condition,
+             covered: @executed_conditions.include?(condition),
+             execution_count: @condition_evaluation_count[condition] || 0
+           }
+         end
+
+         CoverageReport.new(
+           total_rules: total_rules,
+           covered_rules: covered_rules,
+           untested_rules: untested_rules,
+           coverage_percentage: coverage_percentage,
+           rule_coverage: rule_coverage,
+           condition_coverage: condition_coverage
+         )
+       end
+     end
+   end
+ end
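
The coverage analyzer builds directly on the batch results. A minimal sketch, assuming the `results` and `agent` variables from the runner sketch above; it uses only the analyze, coverage_percentage, and untested_rules APIs shown in the diff.

    analyzer = DecisionAgent::Testing::TestCoverageAnalyzer.new
    report   = analyzer.analyze(results, agent)

    puts format("Rule coverage: %.1f%%", report.coverage_percentage * 100)
    puts "Untested rules: #{report.untested_rules.join(', ')}" if report.untested_rules.any?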
data/lib/decision_agent/testing/test_result_comparator.rb
@@ -0,0 +1,235 @@
+ module DecisionAgent
+   module Testing
+     # Comparison result for a single test scenario
+     class ComparisonResult
+       attr_reader :scenario_id, :match, :decision_match, :confidence_match, :differences, :actual, :expected
+
+       # rubocop:disable Metrics/ParameterLists
+       def initialize(scenario_id:, match:, decision_match:, confidence_match:, differences:, actual:, expected:)
+         @scenario_id = scenario_id.to_s.freeze
+         @match = match
+         @decision_match = decision_match
+         @confidence_match = confidence_match
+         @differences = differences.freeze
+         @actual = actual
+         @expected = expected
+
+         freeze
+       end
+       # rubocop:enable Metrics/ParameterLists
+
+       def to_h
+         {
+           scenario_id: @scenario_id,
+           match: @match,
+           decision_match: @decision_match,
+           confidence_match: @confidence_match,
+           differences: @differences,
+           actual: {
+             decision: @actual[:decision],
+             confidence: @actual[:confidence]
+           },
+           expected: {
+             decision: @expected[:decision],
+             confidence: @expected[:confidence]
+           }
+         }
+       end
+     end
+
+     # Compares test results with expected outcomes
+     class TestResultComparator
+       attr_reader :comparison_results
+
+       def initialize(options = {})
+         @options = {
+           confidence_tolerance: 0.01, # 1% tolerance for confidence comparison
+           fuzzy_match: false # Whether to do fuzzy matching on decisions
+         }.merge(options)
+         @comparison_results = []
+       end
+
+       # Compare test results with expected results from scenarios
+       # @param results [Array<TestResult>] Actual test results
+       # @param scenarios [Array<TestScenario>] Test scenarios with expected results
+       # @return [Hash] Comparison summary with accuracy metrics
+       def compare(results, scenarios)
+         @comparison_results = []
+
+         # Create a map of scenario_id -> scenario for quick lookup
+         scenarios.each_with_object({}) do |scenario, map|
+           map[scenario.id] = scenario
+         end
+
+         # Create a map of scenario_id -> result for quick lookup
+         result_map = results.each_with_object({}) do |result, map|
+           map[result.scenario_id] = result
+         end
+
+         # Compare each scenario with its result
+         scenarios.each do |scenario|
+           next unless scenario.expected_result?
+
+           result = result_map[scenario.id]
+           # Only compare if we have a result (skip if result is missing)
+           next unless result
+
+           comparison = compare_single(scenario, result)
+           @comparison_results << comparison
+         end
+
+         generate_summary
+       end
+
+       # Generate a summary report
+       # @return [Hash] Summary with accuracy metrics and mismatches
+       def generate_summary
+         return empty_summary if @comparison_results.empty?
+
+         total = @comparison_results.size
+         matches = @comparison_results.count(&:match)
+         mismatches = total - matches
+
+         {
+           total: total,
+           matches: matches,
+           mismatches: mismatches,
+           accuracy_rate: matches.to_f / total,
+           decision_accuracy: @comparison_results.count(&:decision_match).to_f / total,
+           confidence_accuracy: @comparison_results.count(&:confidence_match).to_f / total,
+           mismatches_detail: @comparison_results.reject(&:match).map(&:to_h)
+         }
+       end
+
+       # Export comparison results to CSV
+       # @param file_path [String] Path to output CSV file
+       def export_csv(file_path)
+         require "csv"
+
+         CSV.open(file_path, "w") do |csv|
+           csv << %w[scenario_id match decision_match confidence_match expected_decision actual_decision expected_confidence
+                     actual_confidence differences]
+           @comparison_results.each do |result|
+             csv << [
+               result.scenario_id,
+               result.match,
+               result.decision_match,
+               result.confidence_match,
+               result.expected[:decision],
+               result.actual[:decision],
+               result.expected[:confidence],
+               result.actual[:confidence],
+               result.differences.join("; ")
+             ]
+           end
+         end
+       end
+
+       # Export comparison results to JSON
+       # @param file_path [String] Path to output JSON file
+       def export_json(file_path)
+         require "json"
+
+         File.write(file_path, JSON.pretty_generate({
+           summary: generate_summary,
+           results: @comparison_results.map(&:to_h)
+         }))
+       end
+
+       private
+
+       # rubocop:disable Metrics/MethodLength, Metrics/PerceivedComplexity
+       def compare_single(scenario, result)
+         differences = []
+         confidence_match = false
+
+         if result.nil? || !result.success?
+           differences << "Test execution failed: #{result&.error&.message || 'No result'}"
+           return ComparisonResult.new(
+             scenario_id: scenario.id,
+             match: false,
+             decision_match: false,
+             confidence_match: false,
+             differences: differences,
+             actual: { decision: nil, confidence: nil },
+             expected: {
+               decision: scenario.expected_decision,
+               confidence: scenario.expected_confidence
+             }
+           )
+         end
+
+         # Compare decision
+         expected_decision = scenario.expected_decision&.to_s
+         actual_decision = result.decision&.to_s
+
+         decision_match = if expected_decision.nil?
+                            true # No expectation, so it matches
+                          elsif @options[:fuzzy_match]
+                            fuzzy_decision_match?(expected_decision, actual_decision)
+                          else
+                            expected_decision == actual_decision
+                          end
+
+         differences << "Decision mismatch: expected '#{expected_decision}', got '#{actual_decision}'" unless decision_match
+
+         # Compare confidence
+         expected_confidence = scenario.expected_confidence
+         actual_confidence = result.confidence
+
+         if expected_confidence.nil?
+           confidence_match = true # No expectation, so it matches
+         elsif actual_confidence.nil?
+           confidence_match = false
+           differences << "Confidence missing in actual result"
+         else
+           tolerance = @options[:confidence_tolerance]
+           confidence_match = (expected_confidence - actual_confidence).abs <= tolerance
+           unless confidence_match
+             diff = (expected_confidence - actual_confidence).abs.round(4)
+             differences << "Confidence mismatch: expected #{expected_confidence}, got #{actual_confidence} (diff: #{diff})"
+           end
+         end
+
+         match = decision_match && confidence_match
+
+         ComparisonResult.new(
+           scenario_id: scenario.id,
+           match: match,
+           decision_match: decision_match,
+           confidence_match: confidence_match,
+           differences: differences,
+           actual: {
+             decision: actual_decision,
+             confidence: actual_confidence
+           },
+           expected: {
+             decision: expected_decision,
+             confidence: expected_confidence
+           }
+         )
+       end
+       # rubocop:enable Metrics/MethodLength, Metrics/PerceivedComplexity
+
+       def fuzzy_decision_match?(expected, actual)
+         return true if expected == actual
+         return true if expected&.downcase == actual&.downcase
+         return true if expected&.strip == actual&.strip
+
+         false
+       end
+
+       def empty_summary
+         {
+           total: 0,
+           matches: 0,
+           mismatches: 0,
+           accuracy_rate: 0.0,
+           decision_accuracy: 0.0,
+           confidence_accuracy: 0.0,
+           mismatches_detail: []
+         }
+       end
+     end
+   end
+ end
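
Finally, a minimal sketch of the comparator, assuming the `results` and `scenarios` variables from the runner sketch above and scenarios that carry expected_decision / expected_confidence values; the output paths are placeholders, and only compare, generate_summary's keys, export_json, and export_csv come from the code shown.

    comparator = DecisionAgent::Testing::TestResultComparator.new(confidence_tolerance: 0.05)
    summary    = comparator.compare(results, scenarios)

    puts format("Accuracy: %.1f%% (%d of %d matched)",
                summary[:accuracy_rate] * 100, summary[:matches], summary[:total])
    comparator.export_json("tmp/comparison_report.json")
    comparator.export_csv("tmp/comparison_report.csv")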