ruby-skill-bench 0.1.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +86 -0
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +1 -0
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/version.rb +1 -1
- metadata +23 -2
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Turns a resolved provider (plus optional explicit config) into the
# parameter hash consumed by the judge.
class JudgeParamsBuilder
  # Class-level shortcut: instantiate the builder and run it.
  #
  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  # @return [Hash] Judge parameters with api_key, model, and provider
  def self.call(provider, config)
    builder = new(provider, config)
    builder.call
  end

  # @param provider [Object] The resolved provider
  # @param config [Hash, nil] Provider config
  def initialize(provider, config)
    @provider = provider
    @config = config
  end

  # Produces the judge parameters.
  #
  # A provider named 'mock' short-circuits to `{ provider: :mock }`.
  # Otherwise the explicit config is used, falling back to the provider's
  # merged config; when neither yields a config the result is empty.
  #
  # @return [Hash] Judge parameters with api_key, model, and provider
  def call
    if @provider.name == 'mock'
      { provider: :mock }
    else
      settings = @config || safe_merged_config
      settings ? params_from(settings) : {}
    end
  rescue KeyError, NoMethodError
    # Missing keys or a nil provider/config are treated as "no parameters".
    {}
  end

  private

  # Assembles the parameter hash from a non-nil config.
  #
  # @param settings [Hash] The config to read api_key and model from
  # @return [Hash] Judge parameters with api_key, model, and provider
  def params_from(settings)
    {
      api_key: settings[:api_key],
      model: settings[:model] || @provider.llm,
      provider: @provider.runtime.to_sym
    }
  end

  # Asks the provider for its merged config, swallowing any StandardError.
  #
  # @return [Hash, nil] The merged config or nil
  def safe_merged_config
    @provider.merged_config
  rescue StandardError
    nil
  end
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Finds the registry manifest file path.
class ManifestFinder
  # Default manifest location, resolved relative to the current working directory.
  DEFAULT_PATH = '../agent-mcp-runtime/registry.json'

  # Finds the registry manifest file.
  #
  # @param path [String, nil] Optional custom path to the manifest
  # @return [String] Absolute path to the registry manifest
  # @raise [ArgumentError] when the manifest file is not found
  def self.call(path: nil)
    new(path: path).call
  end

  # @param path [String, nil] Optional custom path to the manifest
  def initialize(path: nil)
    @path = path
  end

  # Finds the registry manifest file.
  #
  # Both a caller-supplied path and the default are expanded against the
  # current working directory, so the returned value is always absolute
  # (an already-absolute path is returned normalized).
  #
  # @return [String] Absolute path to the registry manifest
  # @raise [ArgumentError] when the manifest file is not found
  def call
    manifest_path = File.expand_path(@path || DEFAULT_PATH, Dir.pwd)
    # File.file? (rather than File.exist?) so a directory is not accepted
    # as a manifest; this also rejects an empty path, which expands to the
    # working directory itself.
    raise ArgumentError, "Registry manifest not found: #{manifest_path}" unless File.file?(manifest_path)

    manifest_path
  end
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
|
|
4
|
+
module Services
|
|
5
|
+
# Converts an agent result into the text handed to evaluation.
class OutputFormatter
  # Class-level shortcut: wrap the result and format it.
  #
  # @param agent_result [Hash] The agent result containing the output
  # @return [String] The formatted output
  def self.call(agent_result)
    formatter = new(agent_result)
    formatter.call
  end

  # @param agent_result [Hash] The agent result containing the output
  def initialize(agent_result)
    @agent_result = agent_result
  end

  # Reads the `:result` entry and stringifies it (nil becomes '').
  #
  # @return [String] The formatted output
  def call
    raw_output = @agent_result[:result]
    raw_output.to_s
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require_relative '../execution/context_hydrator'
|
|
5
|
+
require_relative '../execution/source_path_resolver'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Services
|
|
9
|
+
# Builds system prompts for baseline and context agent runs.
class PromptBuilderService
  # Builds the baseline system prompt (no skill context).
  #
  # @return [String] The baseline system prompt
  def self.build_baseline
    new.build_baseline
  end

  # Builds the context-aware system prompt based on eval metadata.
  #
  # For `skill_bundle_xml` context mode, combines SKILL.md with source code
  # via ContextHydrator. Falls back to SKILL.md-only if source is unavailable.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param skills [Array<SkillBench::Models::Skill>] Resolved skills
  # @param skill_context [String] The combined skill context from SKILL.md files
  # @return [String] The context system prompt
  def self.build_context(evaluation, skills, skill_context)
    new.build_context(evaluation, skills, skill_context)
  end

  # Builds the baseline system prompt (no skill context).
  #
  # @return [String] The baseline system prompt
  def build_baseline
    <<~PROMPT
      You are an expert Ruby on Rails developer. Your job is to read the task,
      modify the codebase using the tools provided to meet the requirements,
      and then explain what you did.
    PROMPT
  end

  # Builds the context-aware system prompt based on eval metadata.
  #
  # For `skill_bundle_xml` context mode, combines SKILL.md with source code
  # via ContextHydrator. Falls back to SKILL.md-only if the source path
  # cannot be resolved, hydration reports failure, or the hydrated context
  # is missing or empty.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param _skills [Array<SkillBench::Models::Skill>] Resolved skills (unused in current implementation)
  # @param skill_context [String] The combined skill context from SKILL.md files
  # @return [String] The context system prompt
  def build_context(evaluation, _skills, skill_context)
    return skill_context unless evaluation.metadata['context_mode'] == 'skill_bundle_xml'

    source_path = resolve_source_path(evaluation)
    return skill_context unless source_path

    xml_context = hydrate_source(source_path)
    return skill_context if xml_context.empty?

    <<~PROMPT
      You are an expert Ruby on Rails developer.
      You have access to a skill file and source code wrapped in <agent_context> tags.
      Use the skill instructions and the provided source code to solve the task.

      ## Skill Instructions
      #{skill_context}

      ## Source Code
      #{xml_context}
    PROMPT
  end

  private

  # Runs ContextHydrator for the given source path and extracts the context.
  #
  # The success flag is checked before the response is touched, so a failed
  # hydration (whose response may be nil or lack :context) yields an empty
  # string instead of raising.
  #
  # @param source_path [String] Directory to hydrate
  # @return [String] The hydrated context, or '' when unavailable
  def hydrate_source(source_path)
    xml_result = Execution::ContextHydrator.call(source_path: source_path, base_path: Pathname.new(Dir.pwd))
    return '' unless xml_result[:success]

    response = xml_result[:response]
    context = response && response[:context]
    context.to_s
  end

  # Resolves the source path for context hydration.
  #
  # Tries the eval's `source/` subdirectory first, then falls back to
  # SourcePathResolver inference.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @return [String, nil] The resolved source path, or nil if not found
  def resolve_source_path(evaluation)
    eval_path = evaluation.path
    eval_source = File.join(eval_path, 'source')
    return eval_source if Dir.exist?(eval_source)

    sources = SkillBench::Config.skill_sources || {}
    inferred = Execution::SourcePathResolver.call(
      eval_folder_path: eval_path.to_s,
      skill_sources: sources
    )
    inferred if inferred && Dir.exist?(inferred)
  end
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../models/config'
|
|
4
|
+
require_relative '../models/provider'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Looks up the configured provider together with its merged configuration.
class ProviderResolver
  # Placeholder provider used whenever no usable LLM config can be loaded.
  MOCK_PROVIDER = Struct.new(:name, :runtime, :llm, :merged_config)
  private_constant :MOCK_PROVIDER

  # Class-level shortcut for `new.call`.
  #
  # @return [Hash] Result with keys:
  #   - success: Boolean indicating if resolution succeeded
  #   - provider: The resolved provider instance
  #   - config: The merged provider config (if successful)
  #   - error: The error object (if failed)
  def self.call
    new.call
  end

  # Resolves the provider, then its merged config, and reports the outcome.
  #
  # @return [Hash] Result with keys:
  #   - success: Boolean indicating if resolution succeeded
  #   - provider: The resolved provider instance
  #   - config: The merged provider config (if successful)
  #   - error: The error object (if failed)
  def call
    provider = resolve_provider
    outcome = resolve_provider_config(provider)
    return { success: false, provider: provider, error: outcome[:error] } unless outcome[:success]

    { success: true, provider: provider, config: outcome[:config] }
  end

  private

  # Loads the config and converts it to a provider; substitutes the mock
  # provider when the config yields no provider or loading raises one of
  # the rescued errors.
  #
  # @return [Object] The configured provider or the mock stand-in
  def resolve_provider
    loaded = SkillBench::Models::Config.load.to_provider
    return loaded if loaded

    warn 'Config load failed, using mock provider'
    mock_provider
  rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
    # Config parsing/validation errors or a missing config file.
    warn "Config load failed with error: #{e.message}, using mock provider"
    mock_provider
  end

  # @return [Struct] A fresh mock provider with an empty merged config
  def mock_provider
    MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
  end

  # Fetches the provider's merged config, capturing ArgumentError as a failure.
  #
  # @param provider [Object] The provider to query
  # @return [Hash] `{ success: true, config: ... }` or `{ success: false, error: ... }`
  def resolve_provider_config(provider)
    { success: true, config: provider.merged_config }
  rescue ArgumentError => e
    { success: false, error: e }
  end
end
|
|
72
|
+
end
|
|
73
|
+
end
|