ruby-skill-bench 0.1.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +86 -0
  3. data/lib/skill_bench/cli/compare_command.rb +91 -0
  4. data/lib/skill_bench/cli/help_printer.rb +9 -1
  5. data/lib/skill_bench/cli/run_command.rb +6 -4
  6. data/lib/skill_bench/cli.rb +7 -4
  7. data/lib/skill_bench/clients/all.rb +1 -0
  8. data/lib/skill_bench/clients/providers/mock.rb +56 -0
  9. data/lib/skill_bench/commands/run.rb +6 -2
  10. data/lib/skill_bench/config/applier.rb +1 -0
  11. data/lib/skill_bench/config/defaults.rb +1 -0
  12. data/lib/skill_bench/config/facade_readers.rb +7 -0
  13. data/lib/skill_bench/config/json_loader.rb +3 -3
  14. data/lib/skill_bench/config/store.rb +5 -0
  15. data/lib/skill_bench/config.rb +10 -1
  16. data/lib/skill_bench/delta_report.rb +20 -0
  17. data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
  18. data/lib/skill_bench/registry/pack_resolver.rb +119 -0
  19. data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
  20. data/lib/skill_bench/services/compare_option_parser.rb +55 -0
  21. data/lib/skill_bench/services/comparison_reporter.rb +97 -0
  22. data/lib/skill_bench/services/comparison_runner.rb +49 -0
  23. data/lib/skill_bench/services/context_loader_service.rb +42 -0
  24. data/lib/skill_bench/services/error_response_builder.rb +119 -0
  25. data/lib/skill_bench/services/eval_resolver.rb +33 -0
  26. data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
  27. data/lib/skill_bench/services/judge_params_builder.rb +54 -0
  28. data/lib/skill_bench/services/manifest_finder.rb +36 -0
  29. data/lib/skill_bench/services/output_formatter.rb +28 -0
  30. data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
  31. data/lib/skill_bench/services/provider_resolver.rb +73 -0
  32. data/lib/skill_bench/services/runner_service.rb +84 -315
  33. data/lib/skill_bench/services/skill_resolver.rb +37 -9
  34. data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
  35. data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
  36. data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
  37. data/lib/skill_bench/services/variant_parser.rb +32 -0
  38. data/lib/skill_bench/services/variant_resolver.rb +63 -0
  39. data/lib/skill_bench/version.rb +1 -1
  40. metadata +23 -2
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    # Translates a resolved provider (plus optional explicit config) into the
    # parameter hash handed to the judge.
    class JudgeParamsBuilder
      # Convenience entry point: instantiates the builder and runs it.
      #
      # @param provider [Object] The resolved provider
      # @param config [Hash, nil] Provider config
      # @return [Hash] Judge parameters with api_key, model, and provider
      def self.call(provider, config)
        new(provider, config).call
      end

      # @param provider [Object] The resolved provider
      # @param config [Hash, nil] Provider config
      def initialize(provider, config)
        @provider = provider
        @config = config
      end

      # Produces the judge parameters.
      #
      # A provider named 'mock' short-circuits to `{ provider: :mock }`.
      # Otherwise the explicit config is preferred, falling back to the
      # provider's merged config. An empty hash is returned when no config
      # can be obtained or when a KeyError/NoMethodError is raised while
      # reading the provider or config.
      #
      # @return [Hash] Judge parameters with api_key, model, and provider
      def call
        return { provider: :mock } if @provider.name == 'mock'

        settings = @config || safe_merged_config
        settings ? params_from(settings) : {}
      rescue KeyError, NoMethodError
        # Best-effort: a provider or config that cannot be read yields no params.
        {}
      end

      private

      # Assembles the parameter hash from a usable config.
      #
      # @param settings [Hash] The config to read api_key and model from
      # @return [Hash] Judge parameters with api_key, model, and provider
      def params_from(settings)
        key = settings[:api_key]
        model_name = settings[:model] || @provider.llm
        runtime_name = @provider.runtime.to_sym

        { api_key: key, model: model_name, provider: runtime_name }
      end

      # Asks the provider for its merged config, treating any StandardError
      # as "no config available".
      #
      # @return [Hash, nil] The merged config or nil
      def safe_merged_config
        @provider.merged_config
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    # Finds the registry manifest file path.
    class ManifestFinder
      # Default path relative to current working directory.
      DEFAULT_PATH = '../agent-mcp-runtime/registry.json'

      # Finds the registry manifest file.
      #
      # @param path [String, nil] Optional custom path to the manifest
      # @return [String] Absolute path to the registry manifest
      # @raise [ArgumentError] when the manifest file is not found
      def self.call(path: nil)
        new(path: path).call
      end

      # @param path [String, nil] Optional custom path to the manifest
      def initialize(path: nil)
        @path = path
      end

      # Finds the registry manifest file.
      #
      # Both the custom path and the default path are expanded against the
      # current working directory, so the returned value is always absolute.
      # The target must be a regular file; a directory at that location is
      # reported as "not found".
      #
      # @return [String] Absolute path to the registry manifest
      # @raise [ArgumentError] when the manifest file is not found
      def call
        # Expand the custom path too: a relative `path:` used to be returned
        # verbatim, contradicting the documented absolute-path contract.
        manifest_path = File.expand_path(@path || DEFAULT_PATH, Dir.pwd)
        # File.file? (rather than File.exist?) rejects directories up front.
        raise ArgumentError, "Registry manifest not found: #{manifest_path}" unless File.file?(manifest_path)

        manifest_path
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    # Turns an agent result into the plain string that gets evaluated.
    class OutputFormatter
      class << self
        # Convenience entry point: instantiates the formatter and runs it.
        #
        # @param agent_result [Hash] The agent result containing the output
        # @return [String] The formatted output
        def call(agent_result)
          new(agent_result).call
        end
      end

      # @param agent_result [Hash] The agent result containing the output
      def initialize(agent_result)
        @agent_result = agent_result
      end

      # Reads the `:result` entry of the agent result and stringifies it.
      #
      # @return [String] The formatted output
      def call
        raw_output = @agent_result[:result]
        raw_output.to_s
      end
    end
  end
end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require_relative '../execution/context_hydrator'
5
+ require_relative '../execution/source_path_resolver'
6
+
7
module SkillBench
  module Services
    # Builds system prompts for baseline and context agent runs.
    class PromptBuilderService
      # Builds the baseline system prompt (no skill context).
      #
      # @return [String] The baseline system prompt
      def self.build_baseline
        new.build_baseline
      end

      # Builds the context-aware system prompt based on eval metadata.
      #
      # For `skill_bundle_xml` context mode, combines SKILL.md with source code
      # via ContextHydrator. Falls back to SKILL.md-only if source is unavailable.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run
      # @param skills [Array<SkillBench::Models::Skill>] Resolved skills
      # @param skill_context [String] The combined skill context from SKILL.md files
      # @return [String] The context system prompt
      def self.build_context(evaluation, skills, skill_context)
        new.build_context(evaluation, skills, skill_context)
      end

      # Builds the baseline system prompt (no skill context).
      #
      # @return [String] The baseline system prompt
      def build_baseline
        <<~PROMPT
          You are an expert Ruby on Rails developer. Your job is to read the task,
          modify the codebase using the tools provided to meet the requirements,
          and then explain what you did.
        PROMPT
      end

      # Builds the context-aware system prompt based on eval metadata.
      #
      # For `skill_bundle_xml` context mode, combines SKILL.md with source code
      # via ContextHydrator. Returns `skill_context` unchanged when the mode is
      # different, when no source directory can be resolved, when hydration
      # reports failure, or when the hydrated context is missing or empty.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run
      # @param _skills [Array<SkillBench::Models::Skill>] Resolved skills (unused in current implementation)
      # @param skill_context [String] The combined skill context from SKILL.md files
      # @return [String] The context system prompt
      def build_context(evaluation, _skills, skill_context)
        return skill_context unless evaluation.metadata['context_mode'] == 'skill_bundle_xml'

        source_path = resolve_source_path(evaluation)
        return skill_context unless source_path

        xml_context = hydrate_source(source_path)
        return skill_context if xml_context.empty?

        <<~PROMPT
          You are an expert Ruby on Rails developer.
          You have access to a skill file and source code wrapped in <agent_context> tags.
          Use the skill instructions and the provided source code to solve the task.

          ## Skill Instructions
          #{skill_context}

          ## Source Code
          #{xml_context}
        PROMPT
      end

      private

      # Runs ContextHydrator for the given source path and extracts its context.
      #
      # The success flag is checked before the response is touched, so a failed
      # hydration whose response is nil or lacks `:context` yields an empty
      # string instead of raising NoMethodError.
      #
      # @param source_path [String] Directory whose contents should be hydrated
      # @return [String] The hydrated context, or '' when unavailable
      def hydrate_source(source_path)
        xml_result = Execution::ContextHydrator.call(source_path: source_path, base_path: Pathname.new(Dir.pwd))
        return '' unless xml_result[:success]

        hydrator_response = xml_result[:response]
        (hydrator_response && hydrator_response[:context]).to_s
      end

      # Resolves the source path for context hydration.
      #
      # Tries the eval's `source/` subdirectory first, then falls back to
      # SourcePathResolver inference using the configured skill sources.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run
      # @return [String, nil] The resolved source path, or nil if not found
      def resolve_source_path(evaluation)
        eval_path = evaluation.path
        eval_source = File.join(eval_path, 'source')
        return eval_source if Dir.exist?(eval_source)

        sources = SkillBench::Config.skill_sources || {}
        inferred = Execution::SourcePathResolver.call(
          eval_folder_path: eval_path.to_s,
          skill_sources: sources
        )
        # Only accept an inferred path that actually exists on disk.
        inferred if inferred && Dir.exist?(inferred)
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../models/config'
4
+ require_relative '../models/provider'
5
+
6
module SkillBench
  module Services
    # Determines which provider to use and fetches its merged configuration.
    class ProviderResolver
      # Stand-in provider when no LLM config is available.
      MOCK_PROVIDER = Struct.new(:name, :runtime, :llm, :merged_config)
      private_constant :MOCK_PROVIDER

      # Convenience entry point: instantiates the resolver and runs it.
      #
      # @return [Hash] Result with keys:
      #   - success: Boolean indicating if resolution succeeded
      #   - provider: The resolved provider instance
      #   - config: The merged provider config (if successful)
      #   - error: The error object (if failed)
      def self.call
        new.call
      end

      # Resolves the provider, then asks it for its merged config.
      #
      # @return [Hash] Result with keys:
      #   - success: Boolean indicating if resolution succeeded
      #   - provider: The resolved provider instance
      #   - config: The merged provider config (if successful)
      #   - error: The error object (if failed)
      def call
        provider = resolve_provider
        outcome = resolve_provider_config(provider)
        return { success: false, provider: provider, error: outcome[:error] } unless outcome[:success]

        { success: true, provider: provider, config: outcome[:config] }
      end

      private

      # Loads the config and converts it to a provider. Falls back to the mock
      # provider (with a warning) when no provider results or when loading
      # raises JSON::ParserError, ArgumentError or Errno::ENOENT.
      #
      # @return [Object] The configured provider or the mock stand-in
      def resolve_provider
        loaded = SkillBench::Models::Config.load.to_provider
        return loaded if loaded

        warn 'Config load failed, using mock provider'
        mock_provider
      rescue JSON::ParserError, ArgumentError, Errno::ENOENT => e
        # Config parsing/validation errors or missing config file - fall back to mock
        warn "Config load failed with error: #{e.message}, using mock provider"
        mock_provider
      end

      # Builds the mock stand-in provider with an empty merged config.
      #
      # @return [Struct] Provider whose name, runtime and llm are all 'mock'
      def mock_provider
        MOCK_PROVIDER.new('mock', 'mock', 'mock', {})
      end

      # Reads the provider's merged config, capturing ArgumentError as a
      # failure result instead of letting it propagate.
      #
      # @param provider [Object] The provider to query
      # @return [Hash] `{ success: true, config: ... }` or `{ success: false, error: ... }`
      def resolve_provider_config(provider)
        merged = provider.merged_config
        { success: true, config: merged }
      rescue ArgumentError => e
        { success: false, error: e }
      end
    end
  end
end