ruby-skill-bench 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +231 -0
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +2 -0
- data/lib/skill_bench/clients/base_client.rb +2 -5
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/context_hydrator.rb +16 -6
- data/lib/skill_bench/execution/sandbox.rb +18 -3
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/tools/run_command.rb +2 -17
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +1 -0
- metadata +25 -2
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require_relative '../execution/context_hydrator'
|
|
5
|
+
require_relative '../execution/source_path_resolver'
|
|
6
|
+
|
|
7
|
+
module SkillBench
|
|
8
|
+
module Services
|
|
9
|
+
# Builds system prompts for baseline and context agent runs.
#
# The baseline prompt carries no skill material. The context prompt is the
# plain skill context, or - when the eval metadata sets `context_mode` to
# `skill_bundle_xml` - the skill context combined with hydrated source code.
class PromptBuilderService
  # Value of the eval metadata key `context_mode` that enables source hydration.
  XML_BUNDLE_MODE = 'skill_bundle_xml'
  private_constant :XML_BUNDLE_MODE

  # Builds the baseline system prompt (no skill context).
  #
  # @return [String] The baseline system prompt
  def self.build_baseline
    new.build_baseline
  end

  # Builds the context-aware system prompt based on eval metadata.
  #
  # Convenience wrapper that delegates to {#build_context} on a fresh instance.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param skills [Array<SkillBench::Models::Skill>] Resolved skills
  # @param skill_context [String] The combined skill context from SKILL.md files
  # @return [String] The context system prompt
  def self.build_context(evaluation, skills, skill_context)
    new.build_context(evaluation, skills, skill_context)
  end

  # Builds the baseline system prompt (no skill context).
  #
  # @return [String] The baseline system prompt
  def build_baseline
    <<~PROMPT
      You are an expert Ruby on Rails developer. Your job is to read the task,
      modify the codebase using the tools provided to meet the requirements,
      and then explain what you did.
    PROMPT
  end

  # Builds the context-aware system prompt based on eval metadata.
  #
  # For `skill_bundle_xml` context mode, combines SKILL.md with source code
  # via ContextHydrator. Returns `skill_context` unchanged when the mode is
  # different, no source directory can be resolved, or hydration fails or
  # yields no content.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @param _skills [Array<SkillBench::Models::Skill>] Resolved skills (unused in current implementation)
  # @param skill_context [String] The combined skill context from SKILL.md files
  # @return [String] The context system prompt
  def build_context(evaluation, _skills, skill_context)
    return skill_context unless evaluation.metadata['context_mode'] == XML_BUNDLE_MODE

    source_path = resolve_source_path(evaluation)
    return skill_context unless source_path

    xml_context = hydrate_source(source_path)
    return skill_context if xml_context.empty?

    <<~PROMPT
      You are an expert Ruby on Rails developer.
      You have access to a skill file and source code wrapped in <agent_context> tags.
      Use the skill instructions and the provided source code to solve the task.

      ## Skill Instructions
      #{skill_context}

      ## Source Code
      #{xml_context}
    PROMPT
  end

  private

  # Runs ContextHydrator for the given source path and extracts its context.
  #
  # The success flag is checked before the payload is read, and the payload is
  # read with `dig`, so a failed result or one without a `:response`/`:context`
  # entry yields an empty string instead of raising NoMethodError.
  #
  # @param source_path [String] Directory whose contents should be hydrated
  # @return [String] The hydrated context, or '' when unavailable
  def hydrate_source(source_path)
    outcome = Execution::ContextHydrator.call(source_path: source_path, base_path: Pathname.new(Dir.pwd))
    return '' unless outcome[:success]

    outcome.dig(:response, :context).to_s
  end

  # Resolves the source path for context hydration.
  #
  # Tries the eval's `source/` subdirectory first, then falls back to
  # SourcePathResolver inference using the configured skill sources.
  #
  # @param evaluation [SkillBench::Models::Eval] The eval being run
  # @return [String, nil] The resolved source path, or nil if not found
  def resolve_source_path(evaluation)
    eval_path = evaluation.path
    eval_source = File.join(eval_path, 'source')
    return eval_source if Dir.exist?(eval_source)

    inferred = Execution::SourcePathResolver.call(
      eval_folder_path: eval_path.to_s,
      skill_sources: SkillBench::Config.skill_sources || {}
    )
    # Only accept an inferred path that actually exists on disk.
    inferred if inferred && Dir.exist?(inferred)
  end
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../models/config'
|
|
4
|
+
require_relative '../models/provider'
|
|
5
|
+
|
|
6
|
+
module SkillBench
|
|
7
|
+
module Services
|
|
8
|
+
# Resolves the provider and its configuration.
#
# Loads the provider via SkillBench::Models::Config and falls back to a mock
# stand-in when no usable provider can be obtained from the config.
class ProviderResolver
  # Stand-in provider when no LLM config is available.
  MOCK_PROVIDER = Struct.new(:name, :runtime, :llm, :merged_config)
  private_constant :MOCK_PROVIDER

  # Resolves the provider and its configuration.
  #
  # @return [Hash] Result with keys:
  #   - success: Boolean indicating if resolution succeeded
  #   - provider: The resolved provider instance
  #   - config: The merged provider config (if successful)
  #   - error: The error object (if failed)
  def self.call
    new.call
  end

  # Resolves the provider and its configuration.
  #
  # @return [Hash] Result with keys:
  #   - success: Boolean indicating if resolution succeeded
  #   - provider: The resolved provider instance
  #   - config: The merged provider config (if successful)
  #   - error: The error object (if failed)
  def call
    provider = resolve_provider
    outcome = resolve_provider_config(provider)
    return { success: false, provider: provider, error: outcome[:error] } unless outcome[:success]

    { success: true, provider: provider, config: outcome[:config] }
  end

  private

  # Loads the configured provider, substituting the mock provider when the
  # config yields none or loading raises one of the rescued errors.
  #
  # Safe navigation covers a loader that returns nil, so that case takes the
  # same mock fallback as a config without a provider instead of raising
  # NoMethodError.
  #
  # NOTE(review): JSON::ParserError is referenced here, but this file does not
  # require 'json' itself - presumably ../models/config loads it; confirm.
  #
  # @return [Object] The configured provider or a mock stand-in
  def resolve_provider
    provider = SkillBench::Models::Config.load&.to_provider
    return provider if provider

    fallback_provider('Config load failed, using mock provider')
  rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
    # Config parsing/validation errors or missing config file - fall back to mock
    fallback_provider("Config load failed with error: #{e.message}, using mock provider")
  end

  # Emits a warning and builds the mock stand-in provider.
  #
  # @param notice [String] Warning text written via Kernel#warn
  # @return [Struct] Mock provider with name/runtime/llm 'mock' and empty config
  def fallback_provider(notice)
    warn notice
    MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
  end

  # Reads the provider's merged config, converting an ArgumentError raised by
  # the provider into a failure result.
  #
  # @param provider [Object] Provider responding to `merged_config`
  # @return [Hash] `{ success: true, config: ... }` or `{ success: false, error: ... }`
  def resolve_provider_config(provider)
    { success: true, config: provider.merged_config }
  rescue ArgumentError => e
    { success: false, error: e }
  end
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -1,42 +1,49 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
require_relative '
|
|
6
|
-
require_relative '
|
|
7
|
-
require_relative '
|
|
8
|
-
require_relative '
|
|
9
|
-
require_relative '
|
|
10
|
-
require_relative '
|
|
11
|
-
require_relative '
|
|
12
|
-
require_relative '
|
|
13
|
-
require_relative '
|
|
14
|
-
require_relative '../execution/source_path_resolver'
|
|
15
|
-
require_relative '../agent/react_agent'
|
|
3
|
+
require_relative '../evaluation/runner'
|
|
4
|
+
require_relative 'eval_resolver'
|
|
5
|
+
require_relative 'skill_resolver_service'
|
|
6
|
+
require_relative 'provider_resolver'
|
|
7
|
+
require_relative 'prompt_builder_service'
|
|
8
|
+
require_relative 'agent_spawner_service'
|
|
9
|
+
require_relative 'context_loader_service'
|
|
10
|
+
require_relative 'judge_params_builder'
|
|
11
|
+
require_relative 'error_response_builder'
|
|
12
|
+
require_relative 'trend_recorder_service'
|
|
13
|
+
require_relative 'output_formatter'
|
|
16
14
|
|
|
17
15
|
module SkillBench
|
|
18
16
|
module Services
|
|
19
17
|
# Orchestrates the execution of an eval with baseline and context runs.
|
|
20
|
-
#
|
|
18
|
+
# Coordinates multiple services to resolve entities, spawn agents, and evaluate results.
|
|
21
19
|
class RunnerService
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
private_constant :MOCK_PROVIDER
|
|
25
|
-
|
|
20
|
+
# Context for evaluation and trend recording
|
|
21
|
+
EvaluationContext = Struct.new(:evaluation, :skill_context, :baseline_output, :context_output, :provider, :config, keyword_init: true)
|
|
26
22
|
# Runs an eval with the given parameters.
|
|
27
23
|
#
|
|
28
24
|
# @param eval_name [String] Name or path of the eval to run
|
|
29
25
|
# @param skill_names [Array<String>] Names of the skills to use
|
|
26
|
+
# @param pack [String, nil] Optional pack name for registry-based skill resolution
|
|
27
|
+
# @param registry_manifest [String, nil] Optional path to registry.json manifest
|
|
30
28
|
# @return [Hash] Result from EvaluationRunner
|
|
31
|
-
def self.call(eval_name:, skill_names:)
|
|
32
|
-
new(
|
|
29
|
+
def self.call(eval_name:, skill_names:, pack: nil, registry_manifest: nil)
|
|
30
|
+
new(
|
|
31
|
+
eval_name: eval_name,
|
|
32
|
+
skill_names: skill_names,
|
|
33
|
+
pack: pack,
|
|
34
|
+
registry_manifest: registry_manifest
|
|
35
|
+
).call
|
|
33
36
|
end
|
|
34
37
|
|
|
35
38
|
# @param eval_name [String] Name or path of the eval
|
|
36
39
|
# @param skill_names [Array<String>] Names of the skills
|
|
37
|
-
|
|
40
|
+
# @param pack [String, nil] Optional pack name
|
|
41
|
+
# @param registry_manifest [String, nil] Optional registry.json path
|
|
42
|
+
def initialize(eval_name:, skill_names:, pack: nil, registry_manifest: nil)
|
|
38
43
|
@eval_name = eval_name
|
|
39
44
|
@skill_names = skill_names
|
|
45
|
+
@pack = pack
|
|
46
|
+
@registry_manifest = registry_manifest
|
|
40
47
|
end
|
|
41
48
|
|
|
42
49
|
# Executes the eval: resolves entities, runs baseline and context, evaluates.
|
|
@@ -45,337 +52,99 @@ module SkillBench
|
|
|
45
52
|
# @raise [Errno::ENOENT] when the eval directory does not exist.
|
|
46
53
|
# @raise [ArgumentError] when a skill cannot be resolved.
|
|
47
54
|
def call
|
|
48
|
-
evaluation =
|
|
49
|
-
skills =
|
|
50
|
-
|
|
55
|
+
evaluation = EvalResolver.call(eval_name)
|
|
56
|
+
skills = SkillResolverService.call(skill_names, pack: pack, registry_manifest: registry_manifest)
|
|
57
|
+
provider_result = ProviderResolver.call
|
|
51
58
|
|
|
52
|
-
|
|
53
|
-
return config_error_result(config_result[:error], evaluation, provider) unless config_result[:success]
|
|
59
|
+
return config_error_result(provider_result[:error], evaluation, provider_result[:provider]) unless provider_result[:success]
|
|
54
60
|
|
|
55
|
-
|
|
56
|
-
|
|
61
|
+
provider = provider_result[:provider]
|
|
62
|
+
config = provider_result[:config]
|
|
57
63
|
|
|
58
|
-
baseline_output =
|
|
64
|
+
baseline_output = run_baseline_agent(evaluation, provider, config)
|
|
59
65
|
return agent_error_result(baseline_output, 'baseline', evaluation, provider) if baseline_output[:status] == :error
|
|
60
66
|
|
|
61
|
-
skill_context =
|
|
67
|
+
skill_context = ContextLoaderService.call(skills)
|
|
62
68
|
return empty_context_error_result(evaluation, provider) if skill_context.strip.empty?
|
|
63
69
|
|
|
64
|
-
|
|
65
|
-
context_output = spawn_agent(evaluation, context_prompt, provider, config)
|
|
70
|
+
context_output = run_context_agent(evaluation, skills, skill_context, provider, config)
|
|
66
71
|
return agent_error_result(context_output, 'context', evaluation, provider) if context_output[:status] == :error
|
|
67
72
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
judge_params = build_judge_params(provider, config)
|
|
71
|
-
|
|
72
|
-
result = Evaluation::Runner.call(
|
|
73
|
-
task: evaluation.task,
|
|
74
|
-
criteria: criteria,
|
|
73
|
+
context = EvaluationContext.new(
|
|
74
|
+
evaluation: evaluation,
|
|
75
75
|
skill_context: skill_context,
|
|
76
|
-
baseline_output:
|
|
77
|
-
context_output:
|
|
78
|
-
|
|
76
|
+
baseline_output: baseline_output,
|
|
77
|
+
context_output: context_output,
|
|
78
|
+
provider: provider,
|
|
79
|
+
config: config
|
|
79
80
|
)
|
|
80
|
-
|
|
81
|
-
return enrich_error_result(result, evaluation, provider) unless result[:success]
|
|
82
|
-
|
|
83
|
-
trend_result = record_and_compute_trend(result)
|
|
84
|
-
return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]
|
|
85
|
-
|
|
86
|
-
{
|
|
87
|
-
success: true,
|
|
88
|
-
eval_name: eval_name,
|
|
89
|
-
skill_name: skill_names.join(', '),
|
|
90
|
-
provider_name: provider.name,
|
|
91
|
-
response: result[:response].merge(
|
|
92
|
-
trend: trend_result[:trend],
|
|
93
|
-
baseline_iterations: baseline_output[:iterations] || [],
|
|
94
|
-
context_iterations: context_output[:iterations] || []
|
|
95
|
-
)
|
|
96
|
-
}
|
|
81
|
+
evaluate_and_record_trend(context)
|
|
97
82
|
end
|
|
98
83
|
|
|
99
84
|
private
|
|
100
85
|
|
|
101
|
-
attr_reader :eval_name, :skill_names
|
|
102
|
-
|
|
103
|
-
def resolve_eval
|
|
104
|
-
eval_path = eval_name.include?('/') ? eval_name : "evals/#{eval_name}"
|
|
105
|
-
SkillBench::Models::Eval.load(eval_path)
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
def resolve_skills
|
|
109
|
-
skill_names.map { |name| Services::SkillResolver.call(name) }
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
def resolve_provider_config(provider)
|
|
113
|
-
{ success: true, config: provider.merged_config }
|
|
114
|
-
rescue ArgumentError => e
|
|
115
|
-
{ success: false, error: e }
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
# Safely calls merged_config, returning nil on any error.
|
|
119
|
-
#
|
|
120
|
-
# @param provider [Object] The provider to query.
|
|
121
|
-
# @return [Hash, nil] The merged config or nil.
|
|
122
|
-
def safe_merged_config(provider)
|
|
123
|
-
provider.merged_config
|
|
124
|
-
rescue StandardError
|
|
125
|
-
nil
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
def resolve_provider
|
|
129
|
-
config = SkillBench::Models::Config.load
|
|
130
|
-
provider = config.to_provider
|
|
131
|
-
return provider if provider
|
|
132
|
-
|
|
133
|
-
warn 'Config load failed, using mock provider'
|
|
134
|
-
MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# Spawns the LLM agent with the given system prompt.
|
|
138
|
-
#
|
|
139
|
-
# @param evaluation [SkillBench::Models::Eval] The eval being run.
|
|
140
|
-
# @param system_prompt [String] The system prompt for the agent.
|
|
141
|
-
# @param provider [Object] The resolved provider.
|
|
142
|
-
# @param config [Hash, nil] Provider config.
|
|
143
|
-
# @return [Hash] Agent response with result, status, runtime, usage, raw_response, iterations.
|
|
144
|
-
def spawn_agent(evaluation, system_prompt, provider, config)
|
|
145
|
-
return { result: 'mock result', status: :success, iterations: [] } if provider.name == 'mock'
|
|
146
|
-
|
|
147
|
-
client_params = build_client_params(provider, config)
|
|
148
|
-
|
|
149
|
-
max_iterations = config&.[](:max_iterations) || config&.[]('max_iterations') || 25
|
|
150
|
-
|
|
151
|
-
Execution::Sandbox.run(evaluation.path) do |sandbox|
|
|
152
|
-
agent_result = Agent::ReactAgent.call(
|
|
153
|
-
system_prompt: system_prompt,
|
|
154
|
-
initial_prompt: evaluation.task,
|
|
155
|
-
working_dir: sandbox.path,
|
|
156
|
-
container_id: sandbox.container_id,
|
|
157
|
-
client_params: client_params,
|
|
158
|
-
max_iterations: max_iterations
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
status = agent_result[:success] ? :success : :error
|
|
162
|
-
final_answer = agent_result.dig(:response, :content) || ''
|
|
163
|
-
diff = Execution::Sandbox.capture_diff(sandbox.path)
|
|
164
|
-
iterations = agent_result.dig(:response, :iterations) || []
|
|
165
|
-
|
|
166
|
-
output = [final_answer, diff].reject(&:empty?).join("\n\n")
|
|
167
|
-
|
|
168
|
-
{
|
|
169
|
-
result: output,
|
|
170
|
-
status: status,
|
|
171
|
-
runtime: provider.runtime,
|
|
172
|
-
usage: {},
|
|
173
|
-
raw_response: agent_result,
|
|
174
|
-
iterations: iterations
|
|
175
|
-
}
|
|
176
|
-
end
|
|
177
|
-
end
|
|
178
|
-
|
|
179
|
-
# Builds client parameters for the ReactAgent.
|
|
180
|
-
#
|
|
181
|
-
# @param provider [Object] The resolved provider.
|
|
182
|
-
# @param config [Hash, nil] Provider config.
|
|
183
|
-
# @return [Hash] Client parameters.
|
|
184
|
-
def build_client_params(provider, config)
|
|
185
|
-
config ||= safe_merged_config(provider)
|
|
186
|
-
return {} unless config
|
|
187
|
-
|
|
188
|
-
params = config.dup
|
|
189
|
-
params[:model] ||= provider.llm
|
|
190
|
-
params[:provider] = provider.runtime.to_sym
|
|
191
|
-
params
|
|
192
|
-
rescue StandardError
|
|
193
|
-
{}
|
|
194
|
-
end
|
|
195
|
-
|
|
196
|
-
# Builds the baseline system prompt (no skill context).
|
|
197
|
-
#
|
|
198
|
-
# @return [String] The baseline system prompt.
|
|
199
|
-
def build_baseline_system_prompt
|
|
200
|
-
<<~PROMPT
|
|
201
|
-
You are an expert Ruby on Rails developer. Your job is to read the task,
|
|
202
|
-
modify the codebase using the tools provided to meet the requirements,
|
|
203
|
-
and then explain what you did.
|
|
204
|
-
PROMPT
|
|
205
|
-
end
|
|
86
|
+
attr_reader :eval_name, :skill_names, :pack, :registry_manifest
|
|
206
87
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
# For `skill_bundle_xml` context mode, combines SKILL.md with source code
|
|
210
|
-
# via ContextHydrator. Falls back to SKILL.md-only if source is unavailable.
|
|
211
|
-
#
|
|
212
|
-
# @param evaluation [SkillBench::Models::Eval] The eval being run.
|
|
213
|
-
# @param skills [Array<SkillBench::Models::Skill>] Resolved skills.
|
|
214
|
-
# @return [String] The context system prompt.
|
|
215
|
-
def build_context_system_prompt(evaluation, skills)
|
|
216
|
-
skill_md_content = load_combined_skill_context(skills)
|
|
217
|
-
return skill_md_content unless evaluation.metadata['context_mode'] == 'skill_bundle_xml'
|
|
218
|
-
|
|
219
|
-
source_path = resolve_source_path(evaluation)
|
|
220
|
-
return skill_md_content unless source_path
|
|
221
|
-
|
|
222
|
-
xml_result = Execution::ContextHydrator.call(source_path: source_path, base_path: Pathname.new(Dir.pwd))
|
|
223
|
-
hydrator_response = xml_result[:response]
|
|
224
|
-
xml_context = hydrator_response[:context]
|
|
225
|
-
return skill_md_content unless xml_result[:success] && !xml_context.empty?
|
|
226
|
-
|
|
227
|
-
<<~PROMPT
|
|
228
|
-
You are an expert Ruby on Rails developer.
|
|
229
|
-
You have access to a skill file and source code wrapped in <agent_context> tags.
|
|
230
|
-
Use the skill instructions and the provided source code to solve the task.
|
|
231
|
-
|
|
232
|
-
## Skill Instructions
|
|
233
|
-
#{skill_md_content}
|
|
234
|
-
|
|
235
|
-
## Source Code
|
|
236
|
-
#{xml_context}
|
|
237
|
-
PROMPT
|
|
88
|
+
def config_error_result(error, evaluation, provider)
|
|
89
|
+
ErrorResponseBuilder.config_error(error, evaluation, provider, skill_names)
|
|
238
90
|
end
|
|
239
91
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
# Tries the eval's `source/` subdirectory first, then falls back to
|
|
243
|
-
# SourcePathResolver inference.
|
|
244
|
-
#
|
|
245
|
-
# @param evaluation [SkillBench::Models::Eval] The eval being run.
|
|
246
|
-
# @return [String, nil] The resolved source path, or nil if not found.
|
|
247
|
-
def resolve_source_path(evaluation)
|
|
248
|
-
eval_path = evaluation.path
|
|
249
|
-
eval_source = File.join(eval_path, 'source')
|
|
250
|
-
return eval_source if Dir.exist?(eval_source)
|
|
251
|
-
|
|
252
|
-
inferred = Execution::SourcePathResolver.call(eval_folder_path: eval_path.to_s)
|
|
253
|
-
inferred if inferred && Dir.exist?(inferred)
|
|
92
|
+
def agent_error_result(result, phase, evaluation, provider)
|
|
93
|
+
ErrorResponseBuilder.agent_error(result, phase, evaluation, provider, skill_names)
|
|
254
94
|
end
|
|
255
95
|
|
|
256
|
-
# Returns an error result when skill context is empty.
|
|
257
|
-
#
|
|
258
|
-
# @param evaluation [SkillBench::Models::Eval] The eval being run.
|
|
259
|
-
# @param provider [Object] The resolved provider.
|
|
260
|
-
# @return [Hash] Error result with metadata.
|
|
261
96
|
def empty_context_error_result(evaluation, provider)
|
|
262
|
-
|
|
263
|
-
success: false,
|
|
264
|
-
response: {
|
|
265
|
-
error: {
|
|
266
|
-
message: 'Skill context is empty. Ensure SKILL.md exists and has content.'
|
|
267
|
-
}
|
|
268
|
-
},
|
|
269
|
-
eval_name: evaluation.name,
|
|
270
|
-
skill_name: skill_names.join(', '),
|
|
271
|
-
provider_name: provider.name
|
|
272
|
-
}
|
|
97
|
+
ErrorResponseBuilder.empty_context_error(evaluation, provider, skill_names)
|
|
273
98
|
end
|
|
274
99
|
|
|
275
|
-
def
|
|
276
|
-
|
|
100
|
+
def enrich_error_result(result, evaluation, provider)
|
|
101
|
+
ErrorResponseBuilder.enrich_error(result, evaluation, provider, skill_names)
|
|
102
|
+
end
|
|
277
103
|
|
|
278
|
-
|
|
279
|
-
|
|
104
|
+
def run_baseline_agent(evaluation, provider, config)
|
|
105
|
+
baseline_prompt = PromptBuilderService.build_baseline
|
|
106
|
+
AgentSpawnerService.call(evaluation, baseline_prompt, provider, config)
|
|
280
107
|
end
|
|
281
108
|
|
|
282
|
-
def
|
|
283
|
-
|
|
284
|
-
|
|
109
|
+
def run_context_agent(evaluation, skills, skill_context, provider, config)
|
|
110
|
+
context_prompt = PromptBuilderService.build_context(evaluation, skills, skill_context)
|
|
111
|
+
AgentSpawnerService.call(evaluation, context_prompt, provider, config)
|
|
285
112
|
end
|
|
286
113
|
|
|
287
|
-
def
|
|
288
|
-
|
|
114
|
+
def evaluate_and_record_trend(context)
|
|
115
|
+
evaluation = context.evaluation
|
|
116
|
+
provider = context.provider
|
|
117
|
+
config = context.config
|
|
289
118
|
|
|
290
|
-
|
|
291
|
-
|
|
119
|
+
criteria = evaluation.criteria
|
|
120
|
+
judge_params = JudgeParamsBuilder.call(provider, config)
|
|
292
121
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
122
|
+
result = Evaluation::Runner.call(
|
|
123
|
+
task: evaluation.task,
|
|
124
|
+
criteria: criteria,
|
|
125
|
+
skill_context: context.skill_context,
|
|
126
|
+
baseline_output: OutputFormatter.call(context.baseline_output),
|
|
127
|
+
context_output: OutputFormatter.call(context.context_output),
|
|
128
|
+
judge_params: judge_params
|
|
129
|
+
)
|
|
301
130
|
|
|
302
|
-
|
|
303
|
-
agent_result[:result].to_s
|
|
304
|
-
end
|
|
131
|
+
return enrich_error_result(result, evaluation, provider) unless result[:success]
|
|
305
132
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
error_msg = raw&.dig(:response, :error, :message) || raw&.dig(:error, :message) || 'unknown error'
|
|
309
|
-
{
|
|
310
|
-
success: false,
|
|
311
|
-
response: {
|
|
312
|
-
error: {
|
|
313
|
-
message: "#{phase.capitalize} agent failed: #{error_msg}"
|
|
314
|
-
}
|
|
315
|
-
},
|
|
316
|
-
eval_name: evaluation.name,
|
|
317
|
-
skill_name: skill_names.join(', '),
|
|
318
|
-
provider_name: provider.name
|
|
319
|
-
}
|
|
320
|
-
end
|
|
133
|
+
trend_result = TrendRecorderService.call(result, eval_name, skill_names)
|
|
134
|
+
return enrich_error_result(trend_result, evaluation, provider) unless trend_result[:success]
|
|
321
135
|
|
|
322
|
-
def config_error_result(error, evaluation, provider)
|
|
323
136
|
{
|
|
324
|
-
success:
|
|
325
|
-
|
|
326
|
-
error: {
|
|
327
|
-
message: "Configuration error: #{error.message}"
|
|
328
|
-
}
|
|
329
|
-
},
|
|
330
|
-
eval_name: evaluation.name,
|
|
331
|
-
skill_name: skill_names.join(', '),
|
|
332
|
-
provider_name: provider.name
|
|
333
|
-
}
|
|
334
|
-
end
|
|
335
|
-
|
|
336
|
-
def enrich_error_result(result, evaluation, provider)
|
|
337
|
-
result.merge(
|
|
338
|
-
eval_name: evaluation.name,
|
|
137
|
+
success: true,
|
|
138
|
+
eval_name: eval_name,
|
|
339
139
|
skill_name: skill_names.join(', '),
|
|
340
|
-
provider_name: provider.name
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
tracker = TrendTracker.new
|
|
346
|
-
enriched = result.merge(eval_name: eval_name, skill_names: skill_names)
|
|
347
|
-
trend = tracker.trend_for(enriched)
|
|
348
|
-
record_result = tracker.record(enriched)
|
|
349
|
-
|
|
350
|
-
record_success = record_result.is_a?(Hash) && record_result[:success]
|
|
351
|
-
unless record_success
|
|
352
|
-
message = if record_result.is_a?(Hash)
|
|
353
|
-
record_result.dig(:response, :error, :message) ||
|
|
354
|
-
record_result.dig(:error, :message) ||
|
|
355
|
-
'Unknown error'
|
|
356
|
-
else
|
|
357
|
-
'Unexpected record response'
|
|
358
|
-
end
|
|
359
|
-
SkillBench::ErrorLogger.log_error(
|
|
360
|
-
StandardError.new(message),
|
|
361
|
-
"Trend tracking record failed for eval #{eval_name}"
|
|
140
|
+
provider_name: provider.name,
|
|
141
|
+
response: result[:response].merge(
|
|
142
|
+
trend: trend_result[:trend],
|
|
143
|
+
baseline_iterations: context.baseline_output[:iterations] || [],
|
|
144
|
+
context_iterations: context.context_output[:iterations] || []
|
|
362
145
|
)
|
|
363
|
-
|
|
364
|
-
success: false,
|
|
365
|
-
response: {
|
|
366
|
-
error: {
|
|
367
|
-
message: "Trend tracking record failed: #{message}",
|
|
368
|
-
record_result: record_result
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
end
|
|
373
|
-
{ success: true, trend: trend }
|
|
374
|
-
rescue StandardError => e
|
|
375
|
-
SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
|
|
376
|
-
{ success: false, response: { error: { message: e.message } } }
|
|
146
|
+
}
|
|
377
147
|
end
|
|
378
|
-
# rubocop:enable Metrics/ClassLength
|
|
379
148
|
end
|
|
380
149
|
end
|
|
381
150
|
end
|