ruby-skill-bench 0.1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +231 -0
  3. data/lib/skill_bench/agent/react_agent.rb +2 -1
  4. data/lib/skill_bench/cli/compare_command.rb +91 -0
  5. data/lib/skill_bench/cli/help_printer.rb +9 -1
  6. data/lib/skill_bench/cli/run_command.rb +6 -4
  7. data/lib/skill_bench/cli.rb +7 -4
  8. data/lib/skill_bench/clients/all.rb +2 -0
  9. data/lib/skill_bench/clients/base_client.rb +2 -5
  10. data/lib/skill_bench/clients/providers/mock.rb +56 -0
  11. data/lib/skill_bench/clients/request_builder.rb +2 -4
  12. data/lib/skill_bench/clients/response_builder.rb +91 -0
  13. data/lib/skill_bench/clients/response_error_handler.rb +5 -17
  14. data/lib/skill_bench/clients/retry_handler.rb +4 -7
  15. data/lib/skill_bench/commands/run.rb +6 -2
  16. data/lib/skill_bench/config/applier.rb +1 -0
  17. data/lib/skill_bench/config/defaults.rb +1 -0
  18. data/lib/skill_bench/config/facade_readers.rb +7 -0
  19. data/lib/skill_bench/config/json_loader.rb +3 -3
  20. data/lib/skill_bench/config/store.rb +5 -0
  21. data/lib/skill_bench/config.rb +10 -1
  22. data/lib/skill_bench/constants.rb +58 -0
  23. data/lib/skill_bench/delta_report.rb +20 -0
  24. data/lib/skill_bench/execution/context_hydrator.rb +16 -6
  25. data/lib/skill_bench/execution/sandbox.rb +18 -3
  26. data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
  27. data/lib/skill_bench/registry/pack_resolver.rb +119 -0
  28. data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
  29. data/lib/skill_bench/services/compare_option_parser.rb +55 -0
  30. data/lib/skill_bench/services/comparison_reporter.rb +97 -0
  31. data/lib/skill_bench/services/comparison_runner.rb +49 -0
  32. data/lib/skill_bench/services/context_loader_service.rb +42 -0
  33. data/lib/skill_bench/services/error_response_builder.rb +119 -0
  34. data/lib/skill_bench/services/eval_resolver.rb +33 -0
  35. data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
  36. data/lib/skill_bench/services/judge_params_builder.rb +54 -0
  37. data/lib/skill_bench/services/manifest_finder.rb +36 -0
  38. data/lib/skill_bench/services/output_formatter.rb +28 -0
  39. data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
  40. data/lib/skill_bench/services/provider_resolver.rb +73 -0
  41. data/lib/skill_bench/services/runner_service.rb +84 -315
  42. data/lib/skill_bench/services/skill_resolver.rb +37 -9
  43. data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
  44. data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
  45. data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
  46. data/lib/skill_bench/services/variant_parser.rb +32 -0
  47. data/lib/skill_bench/services/variant_resolver.rb +63 -0
  48. data/lib/skill_bench/tools/run_command.rb +2 -17
  49. data/lib/skill_bench/version.rb +1 -1
  50. data/lib/skill_bench.rb +1 -0
  51. metadata +25 -2
@@ -48,7 +48,21 @@ module SkillBench
48
48
  cwd = File.expand_path(Dir.pwd)
49
49
  cwd_with_sep = cwd + File::SEPARATOR
50
50
 
51
- raise(ArgumentError, "Skill path escapes project boundary: #{identifier}") unless absolute_path == cwd || absolute_path.start_with?(cwd_with_sep)
51
+ allowed = absolute_path == cwd || absolute_path.start_with?(cwd_with_sep)
52
+ unless allowed
53
+ sources = SkillBench::Config.skill_sources
54
+ if sources.is_a?(Hash)
55
+ sources.each_value do |source_path|
56
+ abs_src = File.expand_path(source_path)
57
+ if absolute_path == abs_src || absolute_path.start_with?(abs_src + File::SEPARATOR)
58
+ allowed = true
59
+ break
60
+ end
61
+ end
62
+ end
63
+ end
64
+
65
+ raise(ArgumentError, "Skill path escapes project boundary: #{identifier}") unless allowed
52
66
 
53
67
  skill_md = File.join(normalized_path, 'SKILL.md')
54
68
 
@@ -57,21 +71,35 @@ module SkillBench
57
71
  raise(ArgumentError, "Skill not found: #{identifier}")
58
72
  end
59
73
 
60
- # Resolves a skill by name using recursive discovery.
61
- #
62
- # @return [SkillBench::Models::Skill] The resolved skill
63
- # @raise [ArgumentError] if no skill with matching name found
64
74
  def resolve_by_name
65
- skills = Models::Skill.discover(base_path)
75
+ skills = discover_all_skills
66
76
  matches = skills.select { |skill| skill.name == identifier }
67
77
 
78
+ validate_matches!(matches)
79
+
80
+ matches.first
81
+ end
82
+
83
# Collects skills from base_path and from every configured skill source.
#
# Each source path is run through File.expand_path before it is checked and
# searched, matching the expansion the path-boundary check applies to the
# same configuration values (so e.g. "~/skills" is honoured here too).
# Blank or nil entries and directories that do not exist are skipped.
#
# @return [Array] skills discovered under base_path, followed by those
#   discovered under each existing source directory, in configuration order
def discover_all_skills
  found = Models::Skill.discover(base_path)

  sources = SkillBench::Config.skill_sources
  return found unless sources.is_a?(Hash)

  sources.each_value.reduce(found) do |acc, source_path|
    raw = source_path.to_s
    # An empty string would expand to the working directory; keep ignoring it.
    next acc if raw.empty?

    dir = File.expand_path(raw)
    Dir.exist?(dir) ? acc + Models::Skill.discover(dir) : acc
  end
end
95
+
96
# Ensures the candidate list identifies exactly one distinct skill.
#
# Candidates whose paths expand to the same absolute location count as a
# single skill (presumably the same directory found through more than one
# discovery root). The array passed in is not modified.
#
# @param matches [Array<#path>] skills whose name equals the identifier
# @return [nil]
# @raise [ArgumentError] if matches is empty, or if it contains more than
#   one distinct expanded path
def validate_matches!(matches)
  raise(ArgumentError, "Skill not found: #{identifier}") if matches.empty?

  distinct = matches.uniq { |match| File.expand_path(match.path) }
  return if distinct.size <= 1

  raise(ArgumentError, "Multiple skills found with name '#{identifier}': #{distinct.map(&:path).join(', ')}")
end
76
104
  end
77
105
  end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../models/skill'
4
+ require_relative 'skill_resolver'
5
+ require_relative '../registry/pack_resolver'
6
+
7
module SkillBench
  module Services
    # Resolves skills from names, either directly through SkillResolver or
    # through a pack looked up in a registry manifest.
    class SkillResolverService
      # Registry manifest used when none is supplied; expanded against Dir.pwd.
      DEFAULT_REGISTRY_MANIFEST = '../agent-mcp-runtime/registry.json'
      private_constant :DEFAULT_REGISTRY_MANIFEST

      # Resolves skills from names.
      #
      # @param skill_names [Array<String>, String, nil] Names of the skills to resolve;
      #   a single name is treated as a one-element list and nil as an empty list
      # @param pack [String, nil] Optional pack name for registry-based skill resolution
      # @param registry_manifest [String, nil] Optional path to registry.json manifest
      # @return [Array<SkillBench::Models::Skill>] The resolved skills
      # @raise [ArgumentError] when the manifest is missing or a skill is not in the pack;
      #   errors raised by SkillResolver propagate unchanged
      def self.call(skill_names, pack: nil, registry_manifest: nil)
        new(skill_names, pack: pack, registry_manifest: registry_manifest).call
      end

      # @param skill_names [Array<String>, String, nil] Names of the skills
      # @param pack [String, nil] Optional pack name
      # @param registry_manifest [String, nil] Optional registry.json path
      def initialize(skill_names, pack: nil, registry_manifest: nil)
        # Array() lets callers pass one name or nil without crashing on #map later.
        @skill_names = Array(skill_names)
        @pack = pack
        @registry_manifest = registry_manifest
      end

      # Resolves the skills from names. The result is computed once per
      # instance and returned again on later calls.
      #
      # @return [Array<SkillBench::Models::Skill>] The resolved skills
      # @raise [ArgumentError] when a skill cannot be resolved
      def call
        return @call if defined?(@call)

        @call = pack_given? ? resolve_pack_skills : resolve_direct_skills
      end

      private

      attr_reader :skill_names, :pack, :registry_manifest

      # @return [Boolean] true when a non-empty pack name was supplied
      def pack_given?
        pack && !pack.empty?
      end

      # Resolves every name individually through SkillResolver.
      #
      # @return [Array] one resolved skill per name, in input order
      def resolve_direct_skills
        skill_names.map { |name| Services::SkillResolver.call(name) }
      end

      # Resolves every name inside the configured pack via the registry manifest.
      #
      # @return [Array<SkillBench::Models::Skill>] one skill per name, in input order
      # @raise [ArgumentError] when the manifest file does not exist or the
      #   pack resolver returns no path for a name
      def resolve_pack_skills
        manifest_path = registry_manifest || DEFAULT_REGISTRY_MANIFEST
        manifest_absolute = File.expand_path(manifest_path, Dir.pwd)

        raise ArgumentError, "Registry manifest not found: #{manifest_path}" unless File.exist?(manifest_absolute)

        resolver = Registry::PackResolver.new(manifest_absolute)

        skill_names.map do |skill_name|
          path = resolver.resolve_skill(pack, skill_name)
          raise ArgumentError, "Skill '#{skill_name}' not found in pack '#{pack}'" unless path

          Models::Skill.new(name: skill_name, path: path)
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../execution/source_path_resolver'
4
+
5
module SkillBench
  module Services
    # Finds the directory used as the source for context hydration.
    class SourcePathResolverService
      # Builds the service for the given eval and runs it.
      #
      # @param evaluation [SkillBench::Models::Eval] The eval being run
      # @return [String, nil] The resolved source path, or nil if not found
      def self.call(evaluation)
        new(evaluation).call
      end

      # @param evaluation [SkillBench::Models::Eval] The eval being run
      def initialize(evaluation)
        @evaluation = evaluation
      end

      # Prefers a `source/` directory inside the eval folder; when there is
      # none, asks Execution::SourcePathResolver to infer a path from the
      # configured skill sources and accepts it only if that directory exists.
      #
      # @return [String, nil] The resolved source path, or nil if not found
      def call
        eval_path = @evaluation.path
        bundled_source(eval_path) || inferred_source(eval_path)
      end

      private

      # @param eval_path [String, #to_s] the eval folder
      # @return [String, nil] `<eval_path>/source` when that directory exists
      def bundled_source(eval_path)
        candidate = File.join(eval_path, 'source')
        candidate if Dir.exist?(candidate)
      end

      # @param eval_path [String, #to_s] the eval folder
      # @return [String, nil] the inferred path when it is an existing directory
      def inferred_source(eval_path)
        configured = SkillBench::Config.skill_sources || {}
        candidate = Execution::SourcePathResolver.call(
          eval_folder_path: eval_path.to_s,
          skill_sources: configured
        )
        return unless candidate

        candidate if Dir.exist?(candidate)
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../trend_tracker'
4
+
5
module SkillBench
  module Services
    # Stores an evaluation result with the trend tracker and reports its trend.
    class TrendRecorderService
      # Builds the service and runs it.
      #
      # @param result [Hash] The evaluation result from Evaluation::Runner
      # @param eval_name [String] Name of the eval
      # @param skill_names [Array<String>] Names of the skills
      # @return [Hash] Result with success status and trend data
      def self.call(result, eval_name, skill_names)
        new(result, eval_name, skill_names).call
      end

      # @param result [Hash] The evaluation result from Evaluation::Runner
      # @param eval_name [String] Name of the eval
      # @param skill_names [Array<String>] Names of the skills
      def initialize(result, eval_name, skill_names)
        @result = result
        @eval_name = eval_name
        @skill_names = skill_names
      end

      # Computes the trend for the enriched result, then records it.
      #
      # @return [Hash] `{ success: true, trend: ... }` when the tracker reports
      #   success; otherwise `{ success: false, response: { error: ... } }`.
      #   Any StandardError raised along the way is logged and converted into
      #   the failure shape as well.
      def call
        tracker = TrendTracker.new
        payload = @result.merge(eval_name: @eval_name, skill_names: @skill_names)
        trend = tracker.trend_for(payload)
        outcome = tracker.record(payload)

        return { success: true, trend: trend } if outcome.is_a?(Hash) && outcome[:success]

        record_failure(outcome)
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      # Logs a failed record call and builds the failure response for it.
      #
      # @param outcome [Object] whatever the tracker's record call returned
      # @return [Hash] failure response embedding the message and the raw outcome
      def record_failure(outcome)
        reason = failure_reason(outcome)
        SkillBench::ErrorLogger.log_error(
          StandardError.new(reason),
          "Trend tracking record failed for eval #{@eval_name}"
        )

        error_details = {
          message: "Trend tracking record failed: #{reason}",
          record_result: outcome
        }
        { success: false, response: { error: error_details } }
      end

      # Picks the most specific error message available in the record outcome.
      #
      # @param outcome [Object] whatever the tracker's record call returned
      # @return [String] the extracted message or a generic fallback
      def failure_reason(outcome)
        return 'Unexpected record response' unless outcome.is_a?(Hash)

        nested = outcome.dig(:response, :error, :message)
        nested || outcome.dig(:error, :message) || 'Unknown error'
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
module SkillBench
  module Services
    # Turns a variant specification string into a tagged hash for skill comparison.
    class VariantParser
      # Builds the parser and runs it.
      #
      # @param spec [String] Variant spec (e.g., "pack:rails" or "/path/to/skill")
      # @return [Hash] Parsed variant with :type (:pack or :path) and corresponding key
      def self.call(spec)
        new(spec).call
      end

      # @param spec [String] Variant spec (e.g., "pack:rails" or "/path/to/skill")
      def initialize(spec)
        @spec = spec
      end

      # A spec beginning with "pack:" names a pack (the prefix is stripped once);
      # anything else is passed through untouched as a path.
      #
      # @return [Hash] `{ type: :pack, name: ... }` or `{ type: :path, path: ... }`
      def call
        return { type: :path, path: @spec } unless @spec.start_with?('pack:')

        { type: :pack, name: @spec.delete_prefix('pack:') }
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../registry/pack_resolver'
4
+ require_relative 'runner_service'
5
+ require_relative 'manifest_finder'
6
+
7
module SkillBench
  module Services
    # Maps a parsed variant specification to the skill paths it denotes.
    class VariantResolver
      # Builds the resolver and runs it.
      #
      # @param variant [Hash] Parsed variant from VariantParser
      # @param skill_name [String] Name of the skill to resolve
      # @param manifest_path [String, nil] Optional path to registry manifest
      # @return [Array<String>] Array of skill paths
      # @raise [ArgumentError] when skill cannot be resolved
      def self.call(variant, skill_name, manifest_path: nil)
        new(variant, skill_name, manifest_path: manifest_path).call
      end

      # @param variant [Hash] Parsed variant from VariantParser
      # @param skill_name [String] Name of the skill to resolve
      # @param manifest_path [String, nil] Optional path to registry manifest
      def initialize(variant, skill_name, manifest_path: nil)
        @variant = variant
        @skill_name = skill_name
        @manifest_path = manifest_path
      end

      # Dispatches on the variant's :type: packs go through the registry,
      # paths are returned as given.
      #
      # @return [Array<String>] Array of skill paths
      # @raise [ArgumentError] when skill cannot be resolved or variant type is unknown
      def call
        kind = @variant[:type]
        return resolve_pack_skill if kind == :pack
        return [@variant[:path]] if kind == :path

        raise ArgumentError, "Unknown variant type: #{kind}, variant: #{@variant.inspect}"
      end

      private

      # Looks the skill up in the variant's pack, using the explicit manifest
      # path when one was given and ManifestFinder otherwise.
      #
      # @return [Array<String>] Array containing the resolved skill path
      # @raise [ArgumentError] when skill is not found in pack
      def resolve_pack_skill
        pack_name = @variant[:name]
        manifest = @manifest_path || ManifestFinder.call
        skill_path = Registry::PackResolver.new(manifest).resolve_skill(pack_name, @skill_name)
        return [skill_path] if skill_path

        raise ArgumentError, "Skill '#{@skill_name}' not found in pack '#{pack_name}'"
      end
    end
  end
end
@@ -4,27 +4,12 @@ require 'open3'
4
4
  require 'timeout'
5
5
  require 'shellwords'
6
6
  require_relative '../config'
7
+ require_relative '../constants'
7
8
 
8
9
  module SkillBench
9
10
  module Tools
10
11
  # Handles executing a shell command within the working directory.
11
12
  class RunCommand
12
- # Commands that are always blocked even if listed in allowed_commands,
13
- # because they can be used to escape the sandbox or execute arbitrary code.
14
- DANGEROUS_COMMANDS = %w[
15
- bash sh zsh fish dash ksh csh tcsh
16
- python python3 python2 ruby perl node
17
- php lua tcl wish
18
- curl wget nc ncat socat
19
- eval exec
20
- sudo su doas
21
- chmod chown mount umount
22
- dd mkfs fdisk parted
23
- insmod rmmod modprobe
24
- systemctl service
25
- passwd useradd userdel groupadd groupdel
26
- ].freeze
27
-
28
13
  # @return [Hash] The tool definition for the LLM API.
29
14
  def self.definition
30
15
  {
@@ -59,7 +44,7 @@ module SkillBench
59
44
  return 'Error: Empty command.' if argv.empty?
60
45
 
61
46
  base_cmd = argv.first
62
- return "Error: Command '#{base_cmd}' is blocked for security reasons." if DANGEROUS_COMMANDS.include?(base_cmd)
47
+ return "Error: Command '#{base_cmd}' is blocked for security reasons." if Constants::Tools::DANGEROUS_COMMANDS.include?(base_cmd)
63
48
 
64
49
  allowed = SkillBench::Config.allowed_commands
65
50
  return 'Error: No allowed commands configured. Set allowed_commands in skill-bench.json or use --mode mock.' if allowed.nil?
@@ -2,5 +2,5 @@
2
2
 
3
3
  module SkillBench
4
4
  # The current gem version.
5
- VERSION = '0.1.0'
5
+ VERSION = '1.1.0'
6
6
  end
data/lib/skill_bench.rb CHANGED
@@ -8,6 +8,7 @@
8
8
 
9
9
  # Core modules
10
10
  require_relative 'skill_bench/version'
11
+ require_relative 'skill_bench/constants'
11
12
  require_relative 'skill_bench/dimension'
12
13
  require_relative 'skill_bench/criteria'
13
14
  require_relative 'skill_bench/delta_report'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-skill-bench
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ismael Marin
@@ -119,6 +119,7 @@ files:
119
119
  - lib/skill_bench/agent/runner.rb
120
120
  - lib/skill_bench/agent/summary.rb
121
121
  - lib/skill_bench/cli.rb
122
+ - lib/skill_bench/cli/compare_command.rb
122
123
  - lib/skill_bench/cli/eval/eval_command_registry.rb
123
124
  - lib/skill_bench/cli/eval/eval_commands.rb
124
125
  - lib/skill_bench/cli/eval/eval_options.rb
@@ -139,12 +140,14 @@ files:
139
140
  - lib/skill_bench/clients/providers/deepseek.rb
140
141
  - lib/skill_bench/clients/providers/gemini.rb
141
142
  - lib/skill_bench/clients/providers/groq.rb
143
+ - lib/skill_bench/clients/providers/mock.rb
142
144
  - lib/skill_bench/clients/providers/null_client.rb
143
145
  - lib/skill_bench/clients/providers/ollama.rb
144
146
  - lib/skill_bench/clients/providers/openai.rb
145
147
  - lib/skill_bench/clients/providers/opencode.rb
146
148
  - lib/skill_bench/clients/providers/openrouter.rb
147
149
  - lib/skill_bench/clients/request_builder.rb
150
+ - lib/skill_bench/clients/response_builder.rb
148
151
  - lib/skill_bench/clients/response_error_handler.rb
149
152
  - lib/skill_bench/clients/response_parser.rb
150
153
  - lib/skill_bench/clients/retry_handler.rb
@@ -160,6 +163,7 @@ files:
160
163
  - lib/skill_bench/config/facade_writers.rb
161
164
  - lib/skill_bench/config/json_loader.rb
162
165
  - lib/skill_bench/config/store.rb
166
+ - lib/skill_bench/constants.rb
163
167
  - lib/skill_bench/criteria.rb
164
168
  - lib/skill_bench/delta_report.rb
165
169
  - lib/skill_bench/dimension.rb
@@ -191,21 +195,40 @@ files:
191
195
  - lib/skill_bench/output_formatter.rb
192
196
  - lib/skill_bench/package_verifier.rb
193
197
  - lib/skill_bench/rails/skill_templates.rb
198
+ - lib/skill_bench/registry/pack_resolver.rb
194
199
  - lib/skill_bench/runner.rb
200
+ - lib/skill_bench/services/agent_spawner_service.rb
201
+ - lib/skill_bench/services/compare_option_parser.rb
202
+ - lib/skill_bench/services/comparison_reporter.rb
203
+ - lib/skill_bench/services/comparison_runner.rb
204
+ - lib/skill_bench/services/context_loader_service.rb
195
205
  - lib/skill_bench/services/delta_table_formatter.rb
206
+ - lib/skill_bench/services/error_response_builder.rb
207
+ - lib/skill_bench/services/eval_resolver.rb
208
+ - lib/skill_bench/services/exit_code_calculator.rb
196
209
  - lib/skill_bench/services/feedback_generator.rb
197
210
  - lib/skill_bench/services/formatting_helpers.rb
198
211
  - lib/skill_bench/services/iteration_formatter.rb
199
212
  - lib/skill_bench/services/json_formatter.rb
213
+ - lib/skill_bench/services/judge_params_builder.rb
200
214
  - lib/skill_bench/services/judge_score_parser_service.rb
201
215
  - lib/skill_bench/services/junit_formatter.rb
216
+ - lib/skill_bench/services/manifest_finder.rb
202
217
  - lib/skill_bench/services/option_parser_service.rb
218
+ - lib/skill_bench/services/output_formatter.rb
203
219
  - lib/skill_bench/services/output_persistence_service.rb
220
+ - lib/skill_bench/services/prompt_builder_service.rb
221
+ - lib/skill_bench/services/provider_resolver.rb
204
222
  - lib/skill_bench/services/result_printer_service.rb
205
223
  - lib/skill_bench/services/runner_service.rb
206
224
  - lib/skill_bench/services/skill_resolver.rb
225
+ - lib/skill_bench/services/skill_resolver_service.rb
226
+ - lib/skill_bench/services/source_path_resolver_service.rb
207
227
  - lib/skill_bench/services/template_registry.rb
208
228
  - lib/skill_bench/services/template_registry/category_data.rb
229
+ - lib/skill_bench/services/trend_recorder_service.rb
230
+ - lib/skill_bench/services/variant_parser.rb
231
+ - lib/skill_bench/services/variant_resolver.rb
209
232
  - lib/skill_bench/task.rb
210
233
  - lib/skill_bench/task/evaluator.rb
211
234
  - lib/skill_bench/task/file_reader.rb
@@ -241,7 +264,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
241
264
  - !ruby/object:Gem::Version
242
265
  version: '0'
243
266
  requirements: []
244
- rubygems_version: 4.0.11
267
+ rubygems_version: 4.0.12
245
268
  specification_version: 4
246
269
  summary: The evaluation engine for AI Agent Skills benchmarking.
247
270
  test_files: []