ruby-skill-bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +794 -0
  4. data/bin/skill-bench +15 -0
  5. data/docs/architecture.md +200 -0
  6. data/docs/first-eval-guide.md +522 -0
  7. data/docs/testing-guide.md +361 -0
  8. data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
  9. data/lib/skill_bench/agent/react_agent/step.rb +92 -0
  10. data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
  11. data/lib/skill_bench/agent/react_agent.rb +58 -0
  12. data/lib/skill_bench/agent/runner.rb +108 -0
  13. data/lib/skill_bench/agent/summary.rb +39 -0
  14. data/lib/skill_bench/agent.rb +10 -0
  15. data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
  16. data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
  17. data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
  18. data/lib/skill_bench/cli/eval_command.rb +40 -0
  19. data/lib/skill_bench/cli/help_printer.rb +47 -0
  20. data/lib/skill_bench/cli/init_command.rb +69 -0
  21. data/lib/skill_bench/cli/result_printer.rb +20 -0
  22. data/lib/skill_bench/cli/run_command.rb +72 -0
  23. data/lib/skill_bench/cli/skill_command.rb +79 -0
  24. data/lib/skill_bench/cli.rb +51 -0
  25. data/lib/skill_bench/client.rb +23 -0
  26. data/lib/skill_bench/clients/all.rb +19 -0
  27. data/lib/skill_bench/clients/base_client.rb +212 -0
  28. data/lib/skill_bench/clients/provider_config.rb +47 -0
  29. data/lib/skill_bench/clients/provider_registry.rb +56 -0
  30. data/lib/skill_bench/clients/provider_schemas.rb +73 -0
  31. data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
  32. data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
  33. data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
  34. data/lib/skill_bench/clients/providers/gemini.rb +63 -0
  35. data/lib/skill_bench/clients/providers/groq.rb +39 -0
  36. data/lib/skill_bench/clients/providers/null_client.rb +50 -0
  37. data/lib/skill_bench/clients/providers/ollama.rb +63 -0
  38. data/lib/skill_bench/clients/providers/openai.rb +39 -0
  39. data/lib/skill_bench/clients/providers/opencode.rb +56 -0
  40. data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
  41. data/lib/skill_bench/clients/request_builder.rb +43 -0
  42. data/lib/skill_bench/clients/response_error_handler.rb +73 -0
  43. data/lib/skill_bench/clients/response_parser.rb +93 -0
  44. data/lib/skill_bench/clients/retry_handler.rb +78 -0
  45. data/lib/skill_bench/commands/eval_new.rb +89 -0
  46. data/lib/skill_bench/commands/init.rb +39 -0
  47. data/lib/skill_bench/commands/run.rb +21 -0
  48. data/lib/skill_bench/commands/skill_new.rb +115 -0
  49. data/lib/skill_bench/config/applier.rb +67 -0
  50. data/lib/skill_bench/config/defaults.rb +42 -0
  51. data/lib/skill_bench/config/env_overrides.rb +117 -0
  52. data/lib/skill_bench/config/facade_readers.rb +65 -0
  53. data/lib/skill_bench/config/facade_writers.rb +120 -0
  54. data/lib/skill_bench/config/json_loader.rb +84 -0
  55. data/lib/skill_bench/config/store.rb +177 -0
  56. data/lib/skill_bench/config.rb +172 -0
  57. data/lib/skill_bench/criteria.rb +141 -0
  58. data/lib/skill_bench/delta_report.rb +97 -0
  59. data/lib/skill_bench/dimension.rb +69 -0
  60. data/lib/skill_bench/error_logger.rb +35 -0
  61. data/lib/skill_bench/evaluate_command.rb +120 -0
  62. data/lib/skill_bench/evaluation/generator.rb +191 -0
  63. data/lib/skill_bench/evaluation/runner.rb +81 -0
  64. data/lib/skill_bench/evaluation.rb +10 -0
  65. data/lib/skill_bench/execution/context_hydrator.rb +97 -0
  66. data/lib/skill_bench/execution/sandbox.rb +174 -0
  67. data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
  68. data/lib/skill_bench/execution.rb +10 -0
  69. data/lib/skill_bench/history_recorder/history_file.rb +71 -0
  70. data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
  71. data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
  72. data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
  73. data/lib/skill_bench/history_recorder.rb +40 -0
  74. data/lib/skill_bench/interactive.rb +61 -0
  75. data/lib/skill_bench/judge/judge.rb +72 -0
  76. data/lib/skill_bench/judge/prompt.rb +121 -0
  77. data/lib/skill_bench/judge/response.rb +158 -0
  78. data/lib/skill_bench/judge.rb +10 -0
  79. data/lib/skill_bench/migration/provider_migrator.rb +30 -0
  80. data/lib/skill_bench/models/config.rb +61 -0
  81. data/lib/skill_bench/models/criteria_validator.rb +106 -0
  82. data/lib/skill_bench/models/eval.rb +81 -0
  83. data/lib/skill_bench/models/provider.rb +70 -0
  84. data/lib/skill_bench/models/skill.rb +32 -0
  85. data/lib/skill_bench/output_formatter.rb +132 -0
  86. data/lib/skill_bench/package_verifier.rb +80 -0
  87. data/lib/skill_bench/rails/skill_templates.rb +99 -0
  88. data/lib/skill_bench/runner.rb +89 -0
  89. data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
  90. data/lib/skill_bench/services/feedback_generator.rb +122 -0
  91. data/lib/skill_bench/services/formatting_helpers.rb +45 -0
  92. data/lib/skill_bench/services/iteration_formatter.rb +30 -0
  93. data/lib/skill_bench/services/json_formatter.rb +18 -0
  94. data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
  95. data/lib/skill_bench/services/junit_formatter.rb +42 -0
  96. data/lib/skill_bench/services/option_parser_service.rb +63 -0
  97. data/lib/skill_bench/services/output_persistence_service.rb +77 -0
  98. data/lib/skill_bench/services/result_printer_service.rb +126 -0
  99. data/lib/skill_bench/services/runner_service.rb +381 -0
  100. data/lib/skill_bench/services/skill_resolver.rb +78 -0
  101. data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
  102. data/lib/skill_bench/services/template_registry.rb +148 -0
  103. data/lib/skill_bench/task/evaluator.rb +94 -0
  104. data/lib/skill_bench/task/file_reader.rb +69 -0
  105. data/lib/skill_bench/task.rb +10 -0
  106. data/lib/skill_bench/tools/argument_parser.rb +20 -0
  107. data/lib/skill_bench/tools/base.rb +73 -0
  108. data/lib/skill_bench/tools/dispatcher.rb +61 -0
  109. data/lib/skill_bench/tools/read_file.rb +66 -0
  110. data/lib/skill_bench/tools/registry.rb +23 -0
  111. data/lib/skill_bench/tools/run_command.rb +89 -0
  112. data/lib/skill_bench/tools/write_file.rb +78 -0
  113. data/lib/skill_bench/tools.rb +33 -0
  114. data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
  115. data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
  116. data/lib/skill_bench/trend_tracker.rb +66 -0
  117. data/lib/skill_bench/version.rb +6 -0
  118. data/lib/skill_bench.rb +103 -0
  119. metadata +247 -0
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base'
4
+ require_relative '../error_logger'
5
+
6
+ module SkillBench
7
+ module Tools
8
# Tool that returns the contents of a single file addressed by a path
# relative to the working directory.
class ReadFile < Base
  # Describes the tool (name, description and JSON-schema parameters).
  #
  # @return [Hash] The tool definition for the LLM API.
  def self.definition
    {
      type: 'function',
      function: {
        name: 'read_file',
        description: 'Read the contents of a file.',
        parameters: {
          type: 'object',
          properties: {
            path: { type: 'string', description: 'Relative path to the file to read.' }
          },
          required: ['path'],
          additionalProperties: false
        }
      }
    }
  end

  # Reads the contents of a file.
  #
  # Surrounding whitespace is trimmed from +path+ before it is resolved, so
  # the name that is opened is exactly the name that passed validation.
  #
  # @param path [String] The relative path to the file.
  # @param working_dir_path [Pathname] The working directory to resolve the path against.
  # @return [String] The file contents, or an "Error: ..." message when the
  #   path is malformed, the file is missing/unreadable, or reading fails.
  # @raise [ArgumentError] when the path contains '..' or a backslash; an
  #   ArgumentError raised while resolving the path is propagated unchanged.
  def self.call(path, working_dir_path)
    validation_error = validate_read_file_path(path)
    return validation_error if validation_error

    # Validation ran against the trimmed value; resolving the raw string
    # would let leading/trailing whitespace (including newlines) slip past
    # the character allow-list.
    # NOTE(review): secure_path comes from Base (not visible here); it is
    # presumably what confines the result to working_dir_path - confirm.
    target = secure_path(path.strip, working_dir_path)
    return 'Error: File not found' unless target.exist? && target.file?
    return 'Error: File is not readable' unless target.readable?

    target.read
  rescue ArgumentError
    # Deliberately not converted into an error string: callers see the raise.
    raise
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'ReadFile Error')
    "Error reading file: #{e.message}"
  end

  class << self
    private

    # Checks the requested path before it is resolved.
    #
    # @param path [Object] The raw path argument supplied by the caller.
    # @return [String, nil] An error message for malformed input, or nil when
    #   the path is acceptable.
    # @raise [ArgumentError] when the path contains '..' or a backslash.
    def validate_read_file_path(path)
      return 'Error: Invalid path. Path must be a string.' unless path.is_a?(String)

      normalized = path.strip
      return 'Error: Invalid path. Path must not be empty.' if normalized.empty?
      raise ArgumentError, "Path traversal attempt: #{path}" if normalized.include?('..') || normalized.include?('\\')
      return 'Error: Invalid path. Allowed characters are letters, numbers, dot, underscore, hyphen, and slash.' unless normalized.match?(%r{\A[a-zA-Z0-9._\-/]+\z})

      nil
    end
  end
end
65
+ end
66
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'read_file'
4
+ require_relative 'write_file'
5
+ require_relative 'run_command'
6
+
7
+ module SkillBench
8
+ module Tools
9
# Central list of the tools offered to the LLM.
class Registry
  # Collects the definition of every available tool, in the order
  # read_file, write_file, run_command.
  #
  # @return [Array<Hash>] One definition hash per tool, as returned by each
  #   tool class's +definition+ method.
  def self.definitions
    tool_classes = [ReadFile, WriteFile, RunCommand]
    tool_classes.map(&:definition)
  end
end
22
+ end
23
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'timeout'
5
+ require 'shellwords'
6
+ require_relative '../config'
7
+
8
+ module SkillBench
9
+ module Tools
10
# Handles executing a command within the working directory (or inside a
# Docker container when a container id is supplied).
class RunCommand
  # Commands that are always blocked even if listed in allowed_commands,
  # because they can be used to escape the sandbox or execute arbitrary code.
  # Matching is done on the executable's basename, so '/bin/bash' is blocked
  # exactly like 'bash'.
  DANGEROUS_COMMANDS = %w[
    bash sh zsh fish dash ksh csh tcsh
    python python3 python2 ruby perl node
    php lua tcl wish
    curl wget nc ncat socat
    eval exec
    sudo su doas
    chmod chown mount umount
    dd mkfs fdisk parted
    insmod rmmod modprobe
    systemctl service
    passwd useradd userdel groupadd groupdel
  ].freeze

  # @return [Hash] The tool definition for the LLM API.
  def self.definition
    {
      type: 'function',
      function: {
        name: 'run_command',
        description: 'Execute a shell command (e.g., rspec).',
        parameters: {
          type: 'object',
          properties: {
            command: { type: 'string', description: 'The shell command to run.' }
          },
          required: ['command'],
          additionalProperties: false
        }
      }
    }
  end

  # Executes a command within the working directory (host or container).
  #
  # The command string is tokenized with Shellwords and the tokens are
  # passed to the OS as an argument vector rather than as one shell string.
  # The first token must not be a dangerous command and must appear in
  # SkillBench::Config.allowed_commands.
  #
  # When the command exceeds SkillBench::Config.max_execution_time the
  # spawned process is killed and a timeout message is returned.
  #
  # @param command [String] The command to run (e.g. "rspec spec/models").
  # @param working_dir_path [Pathname] The host directory (ignored if container_id present).
  # @param container_id [String, nil] The Docker container ID for isolated execution.
  # @return [String] A formatted string containing the exit status, STDOUT,
  #   and STDERR, or an "Error: ..." message.
  # @raise [ArgumentError] when the command has unbalanced quotes (raised by
  #   String#shellsplit).
  def self.call(command, working_dir_path, container_id = nil)
    argv = command.shellsplit
    return 'Error: Empty command.' if argv.empty?

    base_cmd = argv.first
    return "Error: Command '#{base_cmd}' is blocked for security reasons." if dangerous?(base_cmd)

    allowed = SkillBench::Config.allowed_commands
    return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
    return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)

    max_time = SkillBench::Config.max_execution_time
    stdout_str, stderr_str, status =
      if container_id
        capture_with_timeout(['docker', 'exec', '-w', '/sandbox', container_id] + argv, max_time)
      else
        capture_with_timeout(argv, max_time, chdir: working_dir_path.to_s)
      end

    <<~RESULT
      Exit Status: #{status.exitstatus}
      STDOUT:
      #{stdout_str}
      STDERR:
      #{stderr_str}
    RESULT
  rescue Timeout::Error
    "Error: Command execution timed out after #{max_time} seconds."
  end

  class << self
    private

    # Whether the executable is on the always-blocked list. Compares the
    # basename so a path-qualified name cannot bypass the list.
    #
    # @param executable [String] First token of the command.
    # @return [Boolean]
    def dangerous?(executable)
      DANGEROUS_COMMANDS.include?(File.basename(executable))
    end

    # Runs argv and collects its output, killing the child on timeout.
    #
    # Open3.capture3 wrapped in Timeout.timeout does not signal the child,
    # and popen3 waits for the child while unwinding, so a silent
    # long-running command would outlive (and block past) the timeout.
    #
    # NOTE(review): for `docker exec` only the local docker client is killed;
    # whether the process inside the container also stops is not established
    # here - confirm.
    #
    # @param argv [Array<String>] Program and arguments.
    # @param max_time [Numeric, nil] Seconds to wait (nil waits indefinitely).
    # @param spawn_opts [Hash] Options forwarded to Open3.popen3 (e.g. chdir).
    # @return [Array(String, String, Process::Status)] stdout, stderr, status.
    # @raise [Timeout::Error] when the command does not finish in time.
    def capture_with_timeout(argv, max_time, **spawn_opts)
      Open3.popen3(*argv, **spawn_opts) do |stdin, stdout, stderr, wait_thr|
        stdin.close
        # Drain both pipes concurrently so a chatty child cannot block on a full pipe.
        readers = [stdout, stderr].map { |io| Thread.new { io.read } }
        begin
          Timeout.timeout(max_time) { [readers[0].value, readers[1].value, wait_thr.value] }
        rescue Timeout::Error
          # Stop the readers before popen3 closes the pipes underneath them.
          readers.each(&:kill)
          kill_child(wait_thr)
          raise
        end
      end
    end

    # Best-effort SIGKILL of the spawned process.
    #
    # @param wait_thr [Process::Waiter] The wait thread returned by popen3.
    # @return [void]
    def kill_child(wait_thr)
      Process.kill('KILL', wait_thr.pid) if wait_thr.alive?
    rescue Errno::ESRCH
      nil # the child exited between the liveness check and the signal
    end
  end
end
88
+ end
89
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base'
4
+
5
+ # Top-level namespace for Ruby Skill Bench.
6
+ module SkillBench
7
+ # Contains tool implementations for the evaluator.
8
+ module Tools
9
# Tool that writes content to a file addressed by a path relative to the
# working directory.
class WriteFile < Base
  # @return [Hash] The tool definition for the LLM API.
  def self.definition
    {
      type: 'function',
      function: {
        name: 'write_file',
        description: 'Write content to a file. Overwrites the file if it exists.',
        parameters: {
          type: 'object',
          properties: {
            path: { type: 'string', description: 'Relative path to the file to write.' },
            content: { type: 'string', description: 'The content to write into the file.' }
          },
          required: %w[path content],
          additionalProperties: false
        }
      }
    }
  end

  # Writes content to a file. Creates missing parent directories and
  # truncates the file if it already exists.
  #
  # Surrounding whitespace is trimmed from +path+ before it is resolved, so
  # the file that is written is exactly the name that passed validation.
  #
  # @param path [String] The relative path to the file.
  # @param content [String] The content to write.
  # @param working_dir_path [Pathname] The working directory to resolve the path against.
  # @return [String] A success message naming the path that was written.
  # @raise [ArgumentError] if the path is not a string, is empty, contains
  #   '..', a backslash or characters outside the allow-list.
  def self.call(path, content, working_dir_path)
    # Use the validated (trimmed) value from here on; resolving the raw
    # string would let whitespace or newlines bypass the allow-list.
    relative_path = validate_write_path!(path)

    target = secure_path(relative_path, working_dir_path)
    target.dirname.mkpath
    # Re-verify path after mkpath to mitigate TOCTOU vulnerabilities
    target = secure_path(relative_path, working_dir_path)

    File.open(target, File::WRONLY | File::CREAT | File::TRUNC, 0o644) do |f|
      f.write(content)
    end
    "Successfully wrote to #{relative_path}"
  end

  class << self
    private

    # Validates the path against strict rules to prevent traversal.
    # Forward slashes are allowed (nested directories); '..', backslashes
    # and any character outside letters, digits, '.', '_', '-' and '/' are
    # rejected.
    #
    # @param path [String] The relative path to validate.
    # @raise [ArgumentError] if the path is invalid, empty, or attempts traversal.
    # @return [String] The whitespace-trimmed path that passed validation.
    def validate_write_path!(path)
      raise ArgumentError, 'Path must be a string' unless path.is_a?(String)

      normalized = path.strip
      raise ArgumentError, 'Path cannot be empty' if normalized.empty?

      # Allow forward slashes for nested directories, but reject '..'
      raise ArgumentError, "Path traversal attempt: #{path}" if normalized.include?('..')

      raise ArgumentError, "Backslashes are not allowed in path: #{path}" if normalized.include?('\\')

      raise ArgumentError, "Invalid characters in path: #{path}" unless normalized.match?(%r{\A[a-zA-Z0-9._\-/]+\z})

      normalized
    end
  end
end
77
+ end
78
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'tools/base'
4
+ require_relative 'tools/argument_parser'
5
+ require_relative 'tools/read_file'
6
+ require_relative 'tools/write_file'
7
+ require_relative 'tools/run_command'
8
+ require_relative 'tools/dispatcher'
9
+ require_relative 'tools/registry'
10
+
11
+ module SkillBench
12
# Facade over the tools available to the ReAct agent: exposes their
# definitions and runs a named tool. Both operations are delegated to
# Registry and Dispatcher respectively.
module Tools
  class << self
    # Lists the tool definitions in the format expected by the LLM API.
    #
    # @return [Array<Hash>] Whatever Registry.definitions returns.
    def definitions
      Registry.definitions
    end

    # Runs the named tool by handing all arguments to Dispatcher.call.
    #
    # @param name [String] The name of the tool to execute (e.g., 'read_file').
    # @param arguments [String] A JSON string containing the arguments for the tool.
    # @param working_dir [String] The base directory in which the tool should operate.
    # @param container_id [String, nil] The Docker container ID for isolated execution.
    # @return [String] The result of the tool execution, or an error message.
    def execute(name, arguments, working_dir, container_id = nil)
      Dispatcher.call(name, arguments, working_dir, container_id)
    end
  end
end
33
+ end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'pathname'
5
+
6
+ module SkillBench
7
+ class TrendTracker
8
# Reads and writes the trend history JSON file, keeping a ".bak" copy that
# is used when the main file cannot be parsed.
class Persistence
  # @param history_file [String] Path to the history JSON file
  def initialize(history_file)
    @history_file = File.expand_path(history_file)
  end

  # Loads history from file with corruption recovery.
  #
  # A file that does not parse as JSON, or that parses to something other
  # than an array, is treated as corrupted: the backup is returned when it
  # is usable, otherwise the problem is logged and an empty list returned.
  #
  # @return [Array<Hash>] List of historical entries (symbol keys)
  def load
    return [] unless File.exist?(history_file)

    entries = JSON.parse(File.read(history_file), symbolize_names: true)
    return entries if entries.is_a?(Array)

    # Valid JSON of the wrong shape (e.g. {} or null) would break callers
    # that append to the history, so route it through the recovery path.
    raise JSON::ParserError, "expected a JSON array but found #{entries.class}"
  rescue JSON::ParserError => e
    backup = read_backup
    return backup if backup

    SkillBench::ErrorLogger.log_error(e, "History file #{history_file} corrupted")
    []
  end

  # Writes history to file with atomic operation and backup.
  # Returns a result hash so callers do not need to rescue SystemCallError.
  #
  # The JSON is written to "<file>.tmp" and renamed over the history file;
  # afterwards the same JSON is copied to "<file>.bak". A failed backup
  # write only emits a warning and does not fail the operation.
  #
  # @param history [Array<Hash>] History entries to write
  # @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
  def write(history)
    json = JSON.pretty_generate(history)
    temp_file = "#{history_file}.tmp"
    File.write(temp_file, json)
    File.rename(temp_file, history_file)

    begin
      File.write("#{history_file}.bak", json)
    rescue SystemCallError => e
      warn "Backup write failed for #{history_file}: #{e.message}"
    end

    { success: true }
  rescue SystemCallError => e
    { success: false, error: { message: e.message } }
  end

  private

  attr_reader :history_file

  # Reads the backup file if it exists and holds a JSON array.
  #
  # @return [Array<Hash>, nil] Backup data or nil if unavailable or unusable
  def read_backup
    backup_path = "#{history_file}.bak"
    return nil unless File.exist?(backup_path)

    entries = JSON.parse(File.read(backup_path), symbolize_names: true)
    entries.is_a?(Array) ? entries : nil
  rescue JSON::ParserError
    nil
  end
end
68
+ end
69
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SkillBench
4
+ class TrendTracker
5
# Calculates performance trends between evaluation results
class TrendCalculator
  # Computes trend comparison between current result and historical entries.
  #
  # The current entry is compared against the last history entry that has
  # the same eval name and the same set of skill names.
  #
  # @param entries [Array<Hash>] Historical entries
  # @param current_entry [Hash] Current evaluation entry
  # @return [Hash, nil] Trend data (:baseline_trend, :context_trend,
  #   :baseline_delta, :context_delta, :previous_run), or nil if no matching
  #   history exists or any of the four totals is missing
  def self.compute_trend(entries, current_entry)
    previous = filter_matching_entries(entries, current_entry).last
    return nil unless previous

    current_baseline = current_entry[:baseline_total]
    current_context = current_entry[:context_total]
    previous_baseline = previous[:baseline_total]
    previous_context = previous[:context_total]
    return nil unless current_baseline && current_context && previous_baseline && previous_context

    {
      baseline_trend: trend_direction(current_baseline, previous_baseline),
      context_trend: trend_direction(current_context, previous_context),
      baseline_delta: current_baseline - previous_baseline,
      context_delta: current_context - previous_context,
      previous_run: previous[:timestamp]
    }
  end

  class << self
    private

    # Filters historical entries to match current evaluation configuration
    #
    # @param entries [Array<Hash>] Historical entries
    # @param current_entry [Hash] Current evaluation entry
    # @return [Array<Hash>] Matching entries, in their original order
    def filter_matching_entries(entries, current_entry)
      wanted_eval = current_entry[:eval_name]
      # Normalised once up front instead of once per history entry.
      wanted_skills = normalized_skill_names(current_entry[:skill_names])

      entries.select do |entry|
        entry[:eval_name] == wanted_eval &&
          normalized_skill_names(entry[:skill_names]) == wanted_skills
      end
    end

    # Order-independent, comparable form of a skill-name list.
    # A missing list counts as empty, and names are compared as strings
    # because symbols come back from a JSON history file as strings.
    #
    # @param names [Array<String, Symbol>, nil] Skill names of one entry
    # @return [Array<String>] Sorted names
    def normalized_skill_names(names)
      Array(names).map(&:to_s).sort
    end

    # Determines trend direction between two values
    #
    # @param current [Numeric] Current value
    # @param previous [Numeric] Previous value
    # @return [Symbol] :improved, :regressed, or :unchanged
    def trend_direction(current, previous)
      return :unchanged if current == previous

      current > previous ? :improved : :regressed
    end
  end
end
59
+ end
60
+ end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'time'
4
+ require_relative 'trend_tracker/persistence'
5
+ require_relative 'trend_tracker/trend_calculator'
6
+
7
+ module SkillBench
8
# Keeps a history of evaluation results and derives trend information
# from it. Storage is handled by Persistence, comparison by TrendCalculator.
class TrendTracker
  DEFAULT_HISTORY_FILE = '.skill-bench-trends.json'

  # @param history_file [String] Path to the history JSON file.
  def initialize(history_file: DEFAULT_HISTORY_FILE)
    @persistence = Persistence.new(history_file)
  end

  # Appends one evaluation result to the stored history.
  #
  # @param result [Hash] The evaluation result from EvaluationRunner.
  # @return [Hash] Service response: { success: true, response: { recorded: true } }
  #   or { success: false, response: { error: ... } }.
  def record(result)
    entries = @persistence.load
    entries.push(extract_entry(result))
    outcome = @persistence.write(entries)

    if outcome[:success]
      { success: true, response: { recorded: true } }
    else
      { success: false, response: { error: outcome[:error] } }
    end
  rescue StandardError => e
    SkillBench::ErrorLogger.log_error(e, 'TrendTracker Error')
    { success: false, response: { error: { message: e.message } } }
  end

  # Returns everything recorded so far.
  #
  # @return [Array<Hash>] List of historical entries.
  def history
    @persistence.load
  end

  # Compares the given result with the stored history.
  #
  # @param result [Hash] The current evaluation result.
  # @return [Hash, nil] Trend data or nil if no matching history exists.
  def trend_for(result)
    TrendCalculator.compute_trend(@persistence.load, extract_entry(result))
  end

  private

  # Builds the history entry for a result; report fields are nil when the
  # result carries no report under response/report.
  #
  # @param result [Hash] The evaluation result.
  # @return [Hash] Entry stamped with the current time in ISO 8601 form.
  def extract_entry(result)
    report = result.dig(:response, :report)
    entry = {
      timestamp: Time.now.iso8601,
      eval_name: result[:eval_name],
      skill_names: result[:skill_names]
    }
    entry[:verdict] = report&.verdict
    entry[:baseline_total] = report&.baseline_total
    entry[:context_total] = report&.context_total
    entry[:deltas] = report&.deltas
    entry
  end
end
66
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  # Version string of the ruby-skill-bench gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Ruby Skill Bench - AI Agent Skills Evaluation Engine
4
+ #
5
+ # @example Basic usage
6
+ # require 'skill_bench'
7
+ # SkillBench::CLI.call(ARGV)
8
+
9
+ # Core modules
10
+ require_relative 'skill_bench/version'
11
+ require_relative 'skill_bench/dimension'
12
+ require_relative 'skill_bench/criteria'
13
+ require_relative 'skill_bench/delta_report'
14
+ require_relative 'skill_bench/cli'
15
+ require_relative 'skill_bench/config'
16
+ require_relative 'skill_bench/output_formatter'
17
+ require_relative 'skill_bench/client'
18
+
19
+ # Judge subsystem
20
+ require_relative 'skill_bench/judge'
21
+ require_relative 'skill_bench/judge/judge'
22
+ require_relative 'skill_bench/judge/prompt'
23
+ require_relative 'skill_bench/judge/response'
24
+
25
+ # Agent subsystem
26
+ require_relative 'skill_bench/agent'
27
+ require_relative 'skill_bench/agent/runner'
28
+ require_relative 'skill_bench/agent/summary'
29
+ require_relative 'skill_bench/agent/react_agent'
30
+
31
+ # Task subsystem
32
+ require_relative 'skill_bench/task'
33
+ require_relative 'skill_bench/task/evaluator'
34
+ require_relative 'skill_bench/task/file_reader'
35
+
36
+ # Evaluation orchestration
37
+ require_relative 'skill_bench/evaluation'
38
+ require_relative 'skill_bench/evaluation/runner'
39
+ require_relative 'skill_bench/evaluation/generator'
40
+
41
+ # Execution environment
42
+ require_relative 'skill_bench/execution'
43
+ require_relative 'skill_bench/execution/context_hydrator'
44
+ require_relative 'skill_bench/execution/sandbox'
45
+ require_relative 'skill_bench/execution/source_path_resolver'
46
+
47
+ # Clients
48
+ require_relative 'skill_bench/clients/all'
49
+ require_relative 'skill_bench/clients/provider_schemas'
50
+
51
+ # Config management
52
+ require_relative 'skill_bench/config/store'
53
+ require_relative 'skill_bench/config/defaults'
54
+ require_relative 'skill_bench/config/applier'
55
+ require_relative 'skill_bench/config/env_overrides'
56
+ require_relative 'skill_bench/config/json_loader'
57
+ require_relative 'skill_bench/config/facade_readers'
58
+ require_relative 'skill_bench/config/facade_writers'
59
+
60
+ # Models
61
+ require_relative 'skill_bench/models/config'
62
+ require_relative 'skill_bench/models/criteria_validator'
63
+ require_relative 'skill_bench/models/eval'
64
+ require_relative 'skill_bench/models/skill'
65
+ require_relative 'skill_bench/models/provider'
66
+
67
+ # Commands
68
+ require_relative 'skill_bench/commands/init'
69
+ require_relative 'skill_bench/commands/run'
70
+ require_relative 'skill_bench/commands/skill_new'
71
+ require_relative 'skill_bench/commands/eval_new'
72
+
73
+ # Services
74
+ require_relative 'skill_bench/services/runner_service'
75
+ require_relative 'skill_bench/services/template_registry'
76
+
77
+ # Tools
78
+ require_relative 'skill_bench/tools'
79
+
80
+ # History recording
81
+ require_relative 'skill_bench/history_recorder'
82
+ require_relative 'skill_bench/history_recorder/persistence_service'
83
+ require_relative 'skill_bench/history_recorder/summary_service'
84
+
85
+ # Trend tracking
86
+ require_relative 'skill_bench/trend_tracker'
87
+ require_relative 'skill_bench/trend_tracker/persistence'
88
+ require_relative 'skill_bench/trend_tracker/trend_calculator'
89
+
90
+ # Rails integrations
91
+ require_relative 'skill_bench/rails/skill_templates'
92
+
93
+ # Migration utilities
94
+ require_relative 'skill_bench/migration/provider_migrator'
95
+
96
+ # Interactive mode
97
+ require_relative 'skill_bench/interactive'
98
+
99
+ # Package verification
100
+ require_relative 'skill_bench/package_verifier'
101
+
102
+ # Utility modules
103
+ require_relative 'skill_bench/error_logger'