ruby-skill-bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +794 -0
- data/bin/skill-bench +15 -0
- data/docs/architecture.md +200 -0
- data/docs/first-eval-guide.md +522 -0
- data/docs/testing-guide.md +361 -0
- data/lib/skill_bench/agent/react_agent/loop_runner.rb +69 -0
- data/lib/skill_bench/agent/react_agent/step.rb +92 -0
- data/lib/skill_bench/agent/react_agent/tool_executor.rb +88 -0
- data/lib/skill_bench/agent/react_agent.rb +58 -0
- data/lib/skill_bench/agent/runner.rb +108 -0
- data/lib/skill_bench/agent/summary.rb +39 -0
- data/lib/skill_bench/agent.rb +10 -0
- data/lib/skill_bench/cli/eval/eval_command_registry.rb +35 -0
- data/lib/skill_bench/cli/eval/eval_commands.rb +112 -0
- data/lib/skill_bench/cli/eval/eval_options.rb +75 -0
- data/lib/skill_bench/cli/eval_command.rb +40 -0
- data/lib/skill_bench/cli/help_printer.rb +47 -0
- data/lib/skill_bench/cli/init_command.rb +69 -0
- data/lib/skill_bench/cli/result_printer.rb +20 -0
- data/lib/skill_bench/cli/run_command.rb +72 -0
- data/lib/skill_bench/cli/skill_command.rb +79 -0
- data/lib/skill_bench/cli.rb +51 -0
- data/lib/skill_bench/client.rb +23 -0
- data/lib/skill_bench/clients/all.rb +19 -0
- data/lib/skill_bench/clients/base_client.rb +212 -0
- data/lib/skill_bench/clients/provider_config.rb +47 -0
- data/lib/skill_bench/clients/provider_registry.rb +56 -0
- data/lib/skill_bench/clients/provider_schemas.rb +73 -0
- data/lib/skill_bench/clients/providers/anthropic.rb +219 -0
- data/lib/skill_bench/clients/providers/azure_openai.rb +69 -0
- data/lib/skill_bench/clients/providers/deepseek.rb +39 -0
- data/lib/skill_bench/clients/providers/gemini.rb +63 -0
- data/lib/skill_bench/clients/providers/groq.rb +39 -0
- data/lib/skill_bench/clients/providers/null_client.rb +50 -0
- data/lib/skill_bench/clients/providers/ollama.rb +63 -0
- data/lib/skill_bench/clients/providers/openai.rb +39 -0
- data/lib/skill_bench/clients/providers/opencode.rb +56 -0
- data/lib/skill_bench/clients/providers/openrouter.rb +40 -0
- data/lib/skill_bench/clients/request_builder.rb +43 -0
- data/lib/skill_bench/clients/response_error_handler.rb +73 -0
- data/lib/skill_bench/clients/response_parser.rb +93 -0
- data/lib/skill_bench/clients/retry_handler.rb +78 -0
- data/lib/skill_bench/commands/eval_new.rb +89 -0
- data/lib/skill_bench/commands/init.rb +39 -0
- data/lib/skill_bench/commands/run.rb +21 -0
- data/lib/skill_bench/commands/skill_new.rb +115 -0
- data/lib/skill_bench/config/applier.rb +67 -0
- data/lib/skill_bench/config/defaults.rb +42 -0
- data/lib/skill_bench/config/env_overrides.rb +117 -0
- data/lib/skill_bench/config/facade_readers.rb +65 -0
- data/lib/skill_bench/config/facade_writers.rb +120 -0
- data/lib/skill_bench/config/json_loader.rb +84 -0
- data/lib/skill_bench/config/store.rb +177 -0
- data/lib/skill_bench/config.rb +172 -0
- data/lib/skill_bench/criteria.rb +141 -0
- data/lib/skill_bench/delta_report.rb +97 -0
- data/lib/skill_bench/dimension.rb +69 -0
- data/lib/skill_bench/error_logger.rb +35 -0
- data/lib/skill_bench/evaluate_command.rb +120 -0
- data/lib/skill_bench/evaluation/generator.rb +191 -0
- data/lib/skill_bench/evaluation/runner.rb +81 -0
- data/lib/skill_bench/evaluation.rb +10 -0
- data/lib/skill_bench/execution/context_hydrator.rb +97 -0
- data/lib/skill_bench/execution/sandbox.rb +174 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +60 -0
- data/lib/skill_bench/execution.rb +10 -0
- data/lib/skill_bench/history_recorder/history_file.rb +71 -0
- data/lib/skill_bench/history_recorder/history_path_resolver.rb +87 -0
- data/lib/skill_bench/history_recorder/persistence_service.rb +38 -0
- data/lib/skill_bench/history_recorder/summary_service.rb +61 -0
- data/lib/skill_bench/history_recorder.rb +40 -0
- data/lib/skill_bench/interactive.rb +61 -0
- data/lib/skill_bench/judge/judge.rb +72 -0
- data/lib/skill_bench/judge/prompt.rb +121 -0
- data/lib/skill_bench/judge/response.rb +158 -0
- data/lib/skill_bench/judge.rb +10 -0
- data/lib/skill_bench/migration/provider_migrator.rb +30 -0
- data/lib/skill_bench/models/config.rb +61 -0
- data/lib/skill_bench/models/criteria_validator.rb +106 -0
- data/lib/skill_bench/models/eval.rb +81 -0
- data/lib/skill_bench/models/provider.rb +70 -0
- data/lib/skill_bench/models/skill.rb +32 -0
- data/lib/skill_bench/output_formatter.rb +132 -0
- data/lib/skill_bench/package_verifier.rb +80 -0
- data/lib/skill_bench/rails/skill_templates.rb +99 -0
- data/lib/skill_bench/runner.rb +89 -0
- data/lib/skill_bench/services/delta_table_formatter.rb +72 -0
- data/lib/skill_bench/services/feedback_generator.rb +122 -0
- data/lib/skill_bench/services/formatting_helpers.rb +45 -0
- data/lib/skill_bench/services/iteration_formatter.rb +30 -0
- data/lib/skill_bench/services/json_formatter.rb +18 -0
- data/lib/skill_bench/services/judge_score_parser_service.rb +66 -0
- data/lib/skill_bench/services/junit_formatter.rb +42 -0
- data/lib/skill_bench/services/option_parser_service.rb +63 -0
- data/lib/skill_bench/services/output_persistence_service.rb +77 -0
- data/lib/skill_bench/services/result_printer_service.rb +126 -0
- data/lib/skill_bench/services/runner_service.rb +381 -0
- data/lib/skill_bench/services/skill_resolver.rb +78 -0
- data/lib/skill_bench/services/template_registry/category_data.rb +73 -0
- data/lib/skill_bench/services/template_registry.rb +148 -0
- data/lib/skill_bench/task/evaluator.rb +94 -0
- data/lib/skill_bench/task/file_reader.rb +69 -0
- data/lib/skill_bench/task.rb +10 -0
- data/lib/skill_bench/tools/argument_parser.rb +20 -0
- data/lib/skill_bench/tools/base.rb +73 -0
- data/lib/skill_bench/tools/dispatcher.rb +61 -0
- data/lib/skill_bench/tools/read_file.rb +66 -0
- data/lib/skill_bench/tools/registry.rb +23 -0
- data/lib/skill_bench/tools/run_command.rb +89 -0
- data/lib/skill_bench/tools/write_file.rb +78 -0
- data/lib/skill_bench/tools.rb +33 -0
- data/lib/skill_bench/trend_tracker/persistence.rb +69 -0
- data/lib/skill_bench/trend_tracker/trend_calculator.rb +60 -0
- data/lib/skill_bench/trend_tracker.rb +66 -0
- data/lib/skill_bench/version.rb +6 -0
- data/lib/skill_bench.rb +103 -0
- metadata +247 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'base'
|
|
4
|
+
require_relative '../error_logger'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Tools
    # Handles reading the contents of a file within the working directory.
    #
    # A request goes through two checks: a lexical check of the path string
    # (+validate_read_file_path+) and resolution through +secure_path+, which
    # is not defined in this class (presumably provided by Base -- confirm).
    class ReadFile < Base
      # @return [Hash] The tool definition for the LLM API.
      def self.definition
        {
          type: 'function',
          function: {
            name: 'read_file',
            description: 'Read the contents of a file.',
            parameters: {
              type: 'object',
              properties: {
                path: { type: 'string', description: 'Relative path to the file to read.' }
              },
              required: ['path'],
              additionalProperties: false
            }
          }
        }
      end

      # Reads the contents of a file.
      #
      # @param path [String] The relative path to the file.
      # @param working_dir_path [Pathname] The working directory to resolve the path against.
      # @return [String] The file contents, or an error message if the path is
      #   invalid, the file is missing/unreadable, or reading fails.
      # @raise [ArgumentError] Re-raised untouched (e.g. the traversal check below),
      #   so the caller can tell these apart from ordinary read failures.
      def self.call(path, working_dir_path)
        validation_error = validate_read_file_path(path)
        return validation_error if validation_error

        # Resolve exactly the value that was validated. Validation works on the
        # stripped path, so resolving the raw string would let surrounding
        # whitespace reach the filesystem lookup unchecked.
        target = secure_path(path.strip, working_dir_path)
        return 'Error: File not found' unless target.exist? && target.file?
        return 'Error: File is not readable' unless target.readable?

        target.read
      rescue ArgumentError
        raise
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'ReadFile Error')
        "Error reading file: #{e.message}"
      end

      class << self
        private

        # Lexically validates a requested path.
        #
        # @param path [Object] The value supplied as the path.
        # @return [String, nil] An error message for a rejected path, or nil when acceptable.
        # @raise [ArgumentError] if the stripped path contains '..' or a backslash.
        def validate_read_file_path(path)
          return 'Error: Invalid path. Path must be a string.' unless path.is_a?(String)

          normalized = path.strip
          return 'Error: Invalid path. Path must not be empty.' if normalized.empty?
          raise ArgumentError, "Path traversal attempt: #{path}" if normalized.include?('..') || normalized.include?('\\')
          return 'Error: Invalid path. Allowed characters are letters, numbers, dot, underscore, hyphen, and slash.' unless normalized.match?(%r{\A[a-zA-Z0-9._\-/]+\z})

          nil
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'read_file'
|
|
4
|
+
require_relative 'write_file'
|
|
5
|
+
require_relative 'run_command'
|
|
6
|
+
|
|
7
|
+
module SkillBench
  module Tools
    # Catalogue of the tools that are advertised to the LLM.
    class Registry
      class << self
        # Collects the definition of every available tool, in a fixed order
        # (read_file, write_file, run_command).
        #
        # @return [Array<Hash>] One definition hash per tool, as returned by each tool's +definition+.
        def definitions
          [ReadFile, WriteFile, RunCommand].map(&:definition)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
require 'timeout'
|
|
5
|
+
require 'shellwords'
|
|
6
|
+
require_relative '../config'
|
|
7
|
+
|
|
8
|
+
module SkillBench
  module Tools
    # Handles executing a shell command within the working directory.
    class RunCommand
      # Commands that are always blocked even if listed in allowed_commands,
      # because they can be used to escape the sandbox or execute arbitrary code.
      # Matched against the basename of the executable, so '/bin/sh' is blocked
      # just like 'sh'.
      DANGEROUS_COMMANDS = %w[
        bash sh zsh fish dash ksh csh tcsh
        python python3 python2 ruby perl node
        php lua tcl wish
        curl wget nc ncat socat
        eval exec
        sudo su doas
        chmod chown mount umount
        dd mkfs fdisk parted
        insmod rmmod modprobe
        systemctl service
        passwd useradd userdel groupadd groupdel
      ].freeze

      # @return [Hash] The tool definition for the LLM API.
      def self.definition
        {
          type: 'function',
          function: {
            name: 'run_command',
            description: 'Execute a shell command (e.g., rspec).',
            parameters: {
              type: 'object',
              properties: {
                command: { type: 'string', description: 'The shell command to run.' }
              },
              required: ['command'],
              additionalProperties: false
            }
          }
        }
      end

      # Executes a shell command within the working directory (host or container).
      #
      # Tokenizes the command string before execution so that arguments are passed
      # directly to the OS without shell interpretation, preventing shell injection.
      #
      # @param command [String] The command to run (e.g. "rspec spec/models").
      # @param working_dir_path [Pathname] The host directory (ignored if container_id present).
      # @param container_id [String, nil] The Docker container ID for isolated execution.
      # @return [String] A formatted string containing the exit status, STDOUT, and STDERR,
      #   or an "Error: ..." message when the command is rejected or times out.
      # @raise [Timeout::Error] Internally rescued; returns a timeout message string.
      def self.call(command, working_dir_path, container_id = nil)
        return 'Error: Invalid command. Command must be a string.' unless command.is_a?(String)

        argv = tokenize(command)
        return 'Error: Invalid command syntax (unmatched quote).' if argv.nil?
        return 'Error: Empty command.' if argv.empty?

        base_cmd = argv.first
        # Compare the executable's basename so a path-qualified name cannot
        # sidestep the block list.
        return "Error: Command '#{base_cmd}' is blocked for security reasons." if DANGEROUS_COMMANDS.include?(File.basename(base_cmd))

        allowed = SkillBench::Config.allowed_commands
        return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
        return "Error: Command '#{base_cmd}' is not permitted." unless allowed.include?(base_cmd)

        max_time = SkillBench::Config.max_execution_time
        stdout_str, stderr_str, status = if container_id
                                           docker_cmd = ['docker', 'exec', '-w', '/sandbox', container_id] + argv
                                           capture_with_timeout(docker_cmd, max_time)
                                         else
                                           capture_with_timeout(argv, max_time, chdir: working_dir_path.to_s)
                                         end
        <<~RESULT
          Exit Status: #{status.exitstatus}
          STDOUT:
          #{stdout_str}
          STDERR:
          #{stderr_str}
        RESULT
      rescue Timeout::Error
        "Error: Command execution timed out after #{max_time} seconds."
      end

      class << self
        private

        # Splits a command line into an argv array using POSIX shell word rules.
        #
        # @param command [String] The raw command line.
        # @return [Array<String>, nil] The tokens, or nil when the quoting is unbalanced.
        def tokenize(command)
          command.shellsplit
        rescue ArgumentError
          # Shellwords raises ArgumentError for an unmatched quote.
          nil
        end

        # Runs argv, collecting stdout/stderr, and enforces the time limit by
        # killing the spawned process. Wrapping Open3.capture3 in Timeout.timeout
        # is not enough: Open3 waits for the child in an ensure block, so a
        # silent long-running child would keep the caller blocked.
        #
        # @param argv [Array<String>] Program and arguments.
        # @param max_time [Numeric, nil] Seconds to wait; passed straight to Timeout.timeout.
        # @param spawn_opts [Hash] Options forwarded to the spawn call (e.g. chdir).
        # @return [Array(String, String, Process::Status)] stdout, stderr and exit status.
        # @raise [Timeout::Error] if the process does not finish in time.
        def capture_with_timeout(argv, max_time, spawn_opts = {})
          Open3.popen3(*argv, spawn_opts) do |stdin, stdout, stderr, wait_thr|
            stdin.close
            # Drain both pipes concurrently so a chatty child cannot fill one
            # pipe and stall while we are reading the other.
            readers = [stdout, stderr].map do |io|
              Thread.new do
                Thread.current.report_on_exception = false
                io.read
              end
            end

            begin
              Timeout.timeout(max_time) { [*readers.map(&:value), wait_thr.value] }
            rescue Timeout::Error
              # NOTE(review): for container runs this kills the local docker
              # client only; whether the in-container process stops is not
              # established here -- confirm.
              kill_process(wait_thr.pid)
              readers.each(&:kill)
              raise
            end
          end
        end

        # Forcefully terminates a process, ignoring the case where it is already gone.
        #
        # @param pid [Integer] Process id to kill.
        # @return [void]
        def kill_process(pid)
          Process.kill('KILL', pid)
        rescue Errno::ESRCH, Errno::EPERM
          nil
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'base'
|
|
4
|
+
|
|
5
|
+
# Top-level namespace for the Rails Agent Evaluator.
|
|
6
|
+
module SkillBench
  # Contains tool implementations for the evaluator.
  module Tools
    # Handles writing content to a file within the working directory.
    #
    # +secure_path+ is not defined in this class (presumably provided by
    # Base -- confirm).
    class WriteFile < Base
      # @return [Hash] The tool definition for the LLM API.
      def self.definition
        {
          type: 'function',
          function: {
            name: 'write_file',
            description: 'Write content to a file. Overwrites the file if it exists.',
            parameters: {
              type: 'object',
              properties: {
                path: { type: 'string', description: 'Relative path to the file to write.' },
                content: { type: 'string', description: 'The content to write into the file.' }
              },
              required: %w[path content],
              additionalProperties: false
            }
          }
        }
      end

      # Writes content to a file. Creates missing parent directories.
      #
      # @param path [String] The relative path to the file.
      # @param content [String] The content to write.
      # @param working_dir_path [Pathname] The working directory to resolve the path against.
      # @return [String] A success message.
      # @raise [ArgumentError] if the path fails validation.
      def self.call(path, content, working_dir_path)
        validate_write_path!(path)

        # Use exactly the value that passed validation. Validation inspects the
        # stripped path, so writing to the raw string could create a file whose
        # name contains whitespace the allowlist rejects.
        relative = path.strip

        target = secure_path(relative, working_dir_path)
        target.dirname.mkpath
        # Re-verify path after mkpath to mitigate TOCTOU vulnerabilities
        target = secure_path(relative, working_dir_path)

        File.open(target, File::WRONLY | File::CREAT | File::TRUNC, 0o644) do |f|
          f.write(content)
        end
        "Successfully wrote to #{relative}"
      end

      class << self
        private

        # Validates the path against strict lexical rules to prevent traversal.
        # Forward slashes are allowed (for nested directories); '..', backslashes
        # and any character outside letters, digits, '.', '_', '-' and '/' are
        # rejected.
        #
        # @param path [String] The relative path to validate.
        # @raise [ArgumentError] if the path is invalid, empty, or attempts traversal.
        # @return [void]
        def validate_write_path!(path)
          raise ArgumentError, 'Path must be a string' unless path.is_a?(String)

          normalized = path.strip
          raise ArgumentError, 'Path cannot be empty' if normalized.empty?

          # Allow forward slashes for nested directories, but reject '..'
          raise ArgumentError, "Path traversal attempt: #{path}" if normalized.include?('..')

          raise ArgumentError, "Backslashes are not allowed in path: #{path}" if normalized.include?('\\')

          return if normalized.match?(%r{\A[a-zA-Z0-9._\-/]+\z})

          raise ArgumentError, "Invalid characters in path: #{path}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'tools/base'
|
|
4
|
+
require_relative 'tools/argument_parser'
|
|
5
|
+
require_relative 'tools/read_file'
|
|
6
|
+
require_relative 'tools/write_file'
|
|
7
|
+
require_relative 'tools/run_command'
|
|
8
|
+
require_relative 'tools/dispatcher'
|
|
9
|
+
require_relative 'tools/registry'
|
|
10
|
+
|
|
11
|
+
module SkillBench
  # Facade over the tool subsystem used by the ReAct agent: exposes the tool
  # definitions and a single entry point for running a tool by name.
  # Currently supports reading files, writing files, and running shell commands.
  module Tools
    class << self
      # Lists the tool definitions in the format expected by the LLM API.
      # Delegates to Registry.
      #
      # @return [Array<Hash>] The list of available tools with their names, descriptions, and schemas.
      def definitions
        Registry.definitions
      end

      # Runs the named tool with the given arguments inside a working directory.
      # Delegates to Dispatcher.
      #
      # @param name [String] The name of the tool to execute (e.g., 'read_file').
      # @param arguments [String] A JSON string containing the arguments for the tool.
      # @param working_dir [String] The base directory in which the tool should operate.
      # @param container_id [String, nil] The Docker container ID for isolated execution.
      # @return [String] The result of the tool execution, or an error message.
      def execute(name, arguments, working_dir, container_id = nil)
        Dispatcher.call(name, arguments, working_dir, container_id)
      end
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'pathname'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  class TrendTracker
    # Handles history file persistence operations including backup management
    class Persistence
      # @param history_file [String] Path to the history JSON file
      def initialize(history_file)
        @history_file = File.expand_path(history_file)
      end

      # Loads history from file with corruption recovery.
      #
      # A file that is not valid JSON, or whose top-level value is not an
      # array, is treated as corrupted: the ".bak" copy is used when it is
      # usable, otherwise the problem is logged and an empty history is returned.
      #
      # @return [Array<Hash>] List of historical entries
      def load
        return [] unless File.exist?(history_file)

        parse_entries(File.read(history_file))
      rescue JSON::ParserError => e
        backup = read_backup
        return backup if backup

        SkillBench::ErrorLogger.log_error(e, "History file #{history_file} corrupted")
        []
      end

      # Writes history to file with atomic operation and backup.
      # Returns a result hash so callers do not need to rescue SystemCallError.
      #
      # @param history [Array<Hash>] History entries to write
      # @return [Hash] { success: true } on success, { success: false, error: { message: '...' } } on failure
      def write(history)
        json = JSON.pretty_generate(history)
        temp_file = "#{history_file}.tmp"
        # Write to a sibling temp file and rename it over the target so readers
        # never observe a partially written history file.
        File.write(temp_file, json)
        File.rename(temp_file, history_file)

        begin
          File.write("#{history_file}.bak", json)
        rescue SystemCallError => e
          # The backup is best-effort: the primary file is already in place.
          warn "Backup write failed for #{history_file}: #{e.message}"
        end

        { success: true }
      rescue SystemCallError => e
        { success: false, error: { message: e.message } }
      end

      private

      attr_reader :history_file

      # Reads backup file if it exists
      #
      # @return [Array<Hash>, nil] Backup data or nil if unavailable
      def read_backup
        backup_path = "#{history_file}.bak"
        return nil unless File.exist?(backup_path)

        parse_entries(File.read(backup_path))
      rescue JSON::ParserError
        nil
      end

      # Parses a history payload and enforces that it is a list of entries.
      # Callers append to the result, so any other JSON value (object, null,
      # number, ...) is reported the same way as malformed JSON.
      #
      # @param raw [String] JSON text read from disk
      # @return [Array<Hash>] Parsed entries with symbolized keys
      # @raise [JSON::ParserError] if the text is not JSON or not a JSON array
      def parse_entries(raw)
        data = JSON.parse(raw, symbolize_names: true)
        raise JSON::ParserError, "expected a JSON array, got #{data.class}" unless data.is_a?(Array)

        data
      end
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  class TrendTracker
    # Calculates performance trends between evaluation results
    class TrendCalculator
      # Computes trend comparison between current result and historical entries.
      #
      # The comparison is made against the last entry in +entries+ that has the
      # same eval name and the same set of skill names as +current_entry+.
      #
      # @param entries [Array<Hash>] Historical entries
      # @param current_entry [Hash] Current evaluation entry
      # @return [Hash, nil] Trend data, or nil if no matching history exists or
      #   either entry lacks a baseline/context total
      def self.compute_trend(entries, current_entry)
        matching = filter_matching_entries(entries, current_entry)
        return nil if matching.empty?

        previous = matching.last
        current_baseline = current_entry[:baseline_total]
        current_context = current_entry[:context_total]
        previous_baseline = previous[:baseline_total]
        previous_context = previous[:context_total]
        return nil unless current_baseline && current_context && previous_baseline && previous_context

        {
          baseline_trend: trend_direction(current_baseline, previous_baseline),
          context_trend: trend_direction(current_context, previous_context),
          baseline_delta: current_baseline - previous_baseline,
          context_delta: current_context - previous_context,
          previous_run: previous[:timestamp]
        }
      end

      class << self
        private

        # Filters historical entries to match current evaluation configuration
        #
        # @param entries [Array<Hash>] Historical entries
        # @param current_entry [Hash] Current evaluation entry
        # @return [Array<Hash>] Matching entries, in their original order
        def filter_matching_entries(entries, current_entry)
          wanted_eval = current_entry[:eval_name]
          wanted_skills = normalized_skill_names(current_entry[:skill_names])

          entries.select do |entry|
            entry[:eval_name] == wanted_eval &&
              normalized_skill_names(entry[:skill_names]) == wanted_skills
          end
        end

        # Produces an order-independent, comparable form of a skill list.
        # A missing list counts as empty, and names are compared as strings so
        # symbol names match the string names that come back from JSON.
        #
        # @param names [Array<String, Symbol>, nil] Skill names of an entry
        # @return [Array<String>] Sorted skill names
        def normalized_skill_names(names)
          Array(names).map(&:to_s).sort
        end

        # Determines trend direction between two values
        #
        # @param current [Numeric] Current value
        # @param previous [Numeric] Previous value
        # @return [Symbol] :improved, :regressed, or :unchanged
        def trend_direction(current, previous)
          return :unchanged if current == previous

          current > previous ? :improved : :regressed
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'time'
|
|
4
|
+
require_relative 'trend_tracker/persistence'
|
|
5
|
+
require_relative 'trend_tracker/trend_calculator'
|
|
6
|
+
|
|
7
|
+
module SkillBench
  # Keeps a running log of evaluation results and reports how a result
  # compares with the previous matching one.
  class TrendTracker
    DEFAULT_HISTORY_FILE = '.skill-bench-trends.json'

    # @param history_file [String] Path to the history JSON file.
    def initialize(history_file: DEFAULT_HISTORY_FILE)
      @persistence = Persistence.new(history_file)
    end

    # Appends an evaluation result to the stored history.
    #
    # @param result [Hash] The evaluation result from EvaluationRunner.
    # @return [Hash] Service response: +{ success: true, response: { recorded: true } }+
    #   or +{ success: false, response: { error: ... } }+.
    def record(result)
      entries = @persistence.load
      entries << extract_entry(result)
      outcome = @persistence.write(entries)

      if outcome[:success]
        { success: true, response: { recorded: true } }
      else
        { success: false, response: { error: outcome[:error] } }
      end
    rescue StandardError => e
      # Any unexpected failure is logged and reported as an unsuccessful response.
      SkillBench::ErrorLogger.log_error(e, 'TrendTracker Error')
      { success: false, response: { error: { message: e.message } } }
    end

    # Loads the full history.
    #
    # @return [Array<Hash>] List of historical entries.
    def history
      @persistence.load
    end

    # Computes the trend of the given result against the most recent matching history entry.
    #
    # @param result [Hash] The current evaluation result.
    # @return [Hash, nil] Trend data or nil if no matching history exists.
    def trend_for(result)
      stored = @persistence.load
      TrendCalculator.compute_trend(stored, extract_entry(result))
    end

    private

    # Flattens an evaluation result into the hash that is stored in history.
    # Report-derived fields are nil when the result carries no report.
    #
    # @param result [Hash] The evaluation result.
    # @return [Hash] History entry stamped with the current time (ISO 8601).
    def extract_entry(result)
      report = result.dig(:response, :report)
      entry = { timestamp: Time.now.iso8601 }
      entry[:eval_name] = result[:eval_name]
      entry[:skill_names] = result[:skill_names]
      entry[:verdict] = report&.verdict
      entry[:baseline_total] = report&.baseline_total
      entry[:context_total] = report&.context_total
      entry[:deltas] = report&.deltas
      entry
    end
  end
end
|
data/lib/skill_bench.rb
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Ruby Skill Bench - AI Agent Skills Evaluation Engine
|
|
4
|
+
#
|
|
5
|
+
# @example Basic usage
|
|
6
|
+
# require 'skill_bench'
|
|
7
|
+
# SkillBench::CLI.call(ARGV)
|
|
8
|
+
|
|
9
|
+
# Core modules
|
|
10
|
+
require_relative 'skill_bench/version'
|
|
11
|
+
require_relative 'skill_bench/dimension'
|
|
12
|
+
require_relative 'skill_bench/criteria'
|
|
13
|
+
require_relative 'skill_bench/delta_report'
|
|
14
|
+
require_relative 'skill_bench/cli'
|
|
15
|
+
require_relative 'skill_bench/config'
|
|
16
|
+
require_relative 'skill_bench/output_formatter'
|
|
17
|
+
require_relative 'skill_bench/client'
|
|
18
|
+
|
|
19
|
+
# Judge subsystem
|
|
20
|
+
require_relative 'skill_bench/judge'
|
|
21
|
+
require_relative 'skill_bench/judge/judge'
|
|
22
|
+
require_relative 'skill_bench/judge/prompt'
|
|
23
|
+
require_relative 'skill_bench/judge/response'
|
|
24
|
+
|
|
25
|
+
# Agent subsystem
|
|
26
|
+
require_relative 'skill_bench/agent'
|
|
27
|
+
require_relative 'skill_bench/agent/runner'
|
|
28
|
+
require_relative 'skill_bench/agent/summary'
|
|
29
|
+
require_relative 'skill_bench/agent/react_agent'
|
|
30
|
+
|
|
31
|
+
# Task subsystem
|
|
32
|
+
require_relative 'skill_bench/task'
|
|
33
|
+
require_relative 'skill_bench/task/evaluator'
|
|
34
|
+
require_relative 'skill_bench/task/file_reader'
|
|
35
|
+
|
|
36
|
+
# Evaluation orchestration
|
|
37
|
+
require_relative 'skill_bench/evaluation'
|
|
38
|
+
require_relative 'skill_bench/evaluation/runner'
|
|
39
|
+
require_relative 'skill_bench/evaluation/generator'
|
|
40
|
+
|
|
41
|
+
# Execution environment
|
|
42
|
+
require_relative 'skill_bench/execution'
|
|
43
|
+
require_relative 'skill_bench/execution/context_hydrator'
|
|
44
|
+
require_relative 'skill_bench/execution/sandbox'
|
|
45
|
+
require_relative 'skill_bench/execution/source_path_resolver'
|
|
46
|
+
|
|
47
|
+
# Clients
|
|
48
|
+
require_relative 'skill_bench/clients/all'
|
|
49
|
+
require_relative 'skill_bench/clients/provider_schemas'
|
|
50
|
+
|
|
51
|
+
# Config management
|
|
52
|
+
require_relative 'skill_bench/config/store'
|
|
53
|
+
require_relative 'skill_bench/config/defaults'
|
|
54
|
+
require_relative 'skill_bench/config/applier'
|
|
55
|
+
require_relative 'skill_bench/config/env_overrides'
|
|
56
|
+
require_relative 'skill_bench/config/json_loader'
|
|
57
|
+
require_relative 'skill_bench/config/facade_readers'
|
|
58
|
+
require_relative 'skill_bench/config/facade_writers'
|
|
59
|
+
|
|
60
|
+
# Models
|
|
61
|
+
require_relative 'skill_bench/models/config'
|
|
62
|
+
require_relative 'skill_bench/models/criteria_validator'
|
|
63
|
+
require_relative 'skill_bench/models/eval'
|
|
64
|
+
require_relative 'skill_bench/models/skill'
|
|
65
|
+
require_relative 'skill_bench/models/provider'
|
|
66
|
+
|
|
67
|
+
# Commands
|
|
68
|
+
require_relative 'skill_bench/commands/init'
|
|
69
|
+
require_relative 'skill_bench/commands/run'
|
|
70
|
+
require_relative 'skill_bench/commands/skill_new'
|
|
71
|
+
require_relative 'skill_bench/commands/eval_new'
|
|
72
|
+
|
|
73
|
+
# Services
|
|
74
|
+
require_relative 'skill_bench/services/runner_service'
|
|
75
|
+
require_relative 'skill_bench/services/template_registry'
|
|
76
|
+
|
|
77
|
+
# Tools
|
|
78
|
+
require_relative 'skill_bench/tools'
|
|
79
|
+
|
|
80
|
+
# History recording
|
|
81
|
+
require_relative 'skill_bench/history_recorder'
|
|
82
|
+
require_relative 'skill_bench/history_recorder/persistence_service'
|
|
83
|
+
require_relative 'skill_bench/history_recorder/summary_service'
|
|
84
|
+
|
|
85
|
+
# Trend tracking
|
|
86
|
+
require_relative 'skill_bench/trend_tracker'
|
|
87
|
+
require_relative 'skill_bench/trend_tracker/persistence'
|
|
88
|
+
require_relative 'skill_bench/trend_tracker/trend_calculator'
|
|
89
|
+
|
|
90
|
+
# Rails integrations
|
|
91
|
+
require_relative 'skill_bench/rails/skill_templates'
|
|
92
|
+
|
|
93
|
+
# Migration utilities
|
|
94
|
+
require_relative 'skill_bench/migration/provider_migrator'
|
|
95
|
+
|
|
96
|
+
# Interactive mode
|
|
97
|
+
require_relative 'skill_bench/interactive'
|
|
98
|
+
|
|
99
|
+
# Package verification
|
|
100
|
+
require_relative 'skill_bench/package_verifier'
|
|
101
|
+
|
|
102
|
+
# Utility modules
|
|
103
|
+
require_relative 'skill_bench/error_logger'
|