ruby-skill-bench 0.1.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +86 -0
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +1 -0
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/version.rb +1 -1
- metadata +23 -2
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../trend_tracker'
|
|
4
|
+
|
|
5
|
+
module SkillBench
  module Services
    # Records an evaluation result through TrendTracker and returns the trend
    # computed for it, wrapped in a success/failure Hash.
    class TrendRecorderService
      # Convenience entry point: builds the service and runs it.
      #
      # @param result [Hash] The evaluation result from Evaluation::Runner
      # @param eval_name [String] Name of the eval
      # @param skill_names [Array<String>] Names of the skills
      # @return [Hash] Result with success status and trend data
      def self.call(result, eval_name, skill_names)
        new(result, eval_name, skill_names).call
      end

      # @param result [Hash] The evaluation result from Evaluation::Runner
      # @param eval_name [String] Name of the eval
      # @param skill_names [Array<String>] Names of the skills
      def initialize(result, eval_name, skill_names)
        @result = result
        @eval_name = eval_name
        @skill_names = skill_names
      end

      # Merges the eval name and skill names into the result, asks the tracker
      # for the trend, then records the enriched result.
      #
      # Any StandardError raised along the way is logged and converted into a
      # failure Hash instead of propagating.
      #
      # @return [Hash] `{ success: true, trend: ... }` when the record call
      #   reports success; otherwise `{ success: false, response: { error: ... } }`
      def call
        tracker = TrendTracker.new
        enriched = @result.merge(eval_name: @eval_name, skill_names: @skill_names)
        # The trend is requested before the new result is recorded.
        trend = tracker.trend_for(enriched)
        record_result = tracker.record(enriched)
        return record_failure(record_result) unless recorded?(record_result)

        { success: true, trend: trend }
      rescue StandardError => e
        SkillBench::ErrorLogger.log_error(e, 'Trend tracking failed')
        { success: false, response: { error: { message: e.message } } }
      end

      private

      # Tells whether the tracker reported a successful record.
      #
      # @param record_result [Object] Value returned by TrendTracker#record
      # @return [Object] truthy only for a Hash carrying a truthy :success
      def recorded?(record_result)
        record_result.is_a?(Hash) && record_result[:success]
      end

      # Extracts the most specific error message available from a failed
      # record response.
      #
      # @param record_result [Object] Value returned by TrendTracker#record
      # @return [String] Message found under [:response][:error][:message] or
      #   [:error][:message], or a generic fallback
      def failure_message(record_result)
        return 'Unexpected record response' unless record_result.is_a?(Hash)

        record_result.dig(:response, :error, :message) ||
          record_result.dig(:error, :message) ||
          'Unknown error'
      end

      # Logs a failed record attempt and builds the failure payload.
      #
      # @param record_result [Object] Value returned by TrendTracker#record
      # @return [Hash] Failure Hash that includes the raw record_result
      def record_failure(record_result)
        message = failure_message(record_result)
        SkillBench::ErrorLogger.log_error(
          StandardError.new(message),
          "Trend tracking record failed for eval #{@eval_name}"
        )
        {
          success: false,
          response: {
            error: {
              message: "Trend tracking record failed: #{message}",
              record_result: record_result
            }
          }
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  module Services
    # Parses variant specifications for skill comparison.
    #
    # A spec starting with "pack:" names a pack; anything else is treated as a
    # path.
    class VariantParser
      # Prefix that marks a spec as a pack reference.
      PACK_PREFIX = 'pack:'

      # Parses a variant specification string.
      #
      # @param spec [String] Variant spec (e.g., "pack:rails" or "/path/to/skill")
      # @return [Hash] Parsed variant with :type (:pack or :path) and corresponding key
      def self.call(spec)
        new(spec).call
      end

      # @param spec [String] Variant spec (e.g., "pack:rails" or "/path/to/skill")
      def initialize(spec)
        @spec = spec
      end

      # Parses the variant specification.
      #
      # @return [Hash] `{ type: :pack, name: String }` when the spec starts with
      #   "pack:", otherwise `{ type: :path, path: String }` with the spec unchanged
      def call
        return { type: :path, path: @spec } unless @spec.start_with?(PACK_PREFIX)

        # Only the leading prefix is removed; a later "pack:" stays in the name.
        { type: :pack, name: @spec.delete_prefix(PACK_PREFIX) }
      end
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../registry/pack_resolver'
|
|
4
|
+
require_relative 'runner_service'
|
|
5
|
+
require_relative 'manifest_finder'
|
|
6
|
+
|
|
7
|
+
module SkillBench
  module Services
    # Resolves skill paths from variant specifications.
    class VariantResolver
      # Resolves skill paths from a variant specification.
      #
      # @param variant [Hash] Parsed variant from VariantParser
      # @param skill_name [String] Name of the skill to resolve
      # @param manifest_path [String, nil] Optional path to registry manifest
      # @return [Array<String>] Array of skill paths
      # @raise [ArgumentError] when skill cannot be resolved
      def self.call(variant, skill_name, manifest_path: nil)
        new(variant, skill_name, manifest_path: manifest_path).call
      end

      # @param variant [Hash] Parsed variant from VariantParser
      # @param skill_name [String] Name of the skill to resolve
      # @param manifest_path [String, nil] Optional path to registry manifest
      def initialize(variant, skill_name, manifest_path: nil)
        @variant = variant
        @skill_name = skill_name
        @manifest_path = manifest_path
      end

      # Resolves skill paths from the variant specification.
      #
      # @return [Array<String>] Array of skill paths
      # @raise [ArgumentError] when skill cannot be resolved, the path is
      #   missing or blank, or variant type is unknown
      def call
        case @variant[:type]
        when :pack
          resolve_pack_skill
        when :path
          resolve_path_skill
        else
          raise ArgumentError, "Unknown variant type: #{@variant[:type]}, variant: #{@variant.inspect}"
        end
      end

      private

      # Returns the variant's own path as the single skill path.
      #
      # @return [Array<String>] Array containing the variant path
      # @raise [ArgumentError] when the variant carries no usable path
      def resolve_path_skill
        path = @variant[:path]
        # Fail here rather than return [nil] or [""], which would break the
        # documented Array<String> contract and surface later as a confusing error.
        if path.nil? || path.to_s.strip.empty?
          raise ArgumentError, "Variant path is missing or empty, variant: #{@variant.inspect}"
        end

        [path]
      end

      # Resolves a skill from a pack.
      #
      # Uses the manifest path given at construction, falling back to
      # ManifestFinder.call when none was supplied.
      #
      # @return [Array<String>] Array containing the resolved skill path
      # @raise [ArgumentError] when skill is not found in pack
      def resolve_pack_skill
        pack_name = @variant[:name]
        manifest = @manifest_path || ManifestFinder.call
        resolved = Registry::PackResolver.new(manifest).resolve_skill(pack_name, @skill_name)
        raise ArgumentError, "Skill '#{@skill_name}' not found in pack '#{pack_name}'" unless resolved

        [resolved]
      end
    end
  end
end
|
data/lib/skill_bench/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby-skill-bench
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1
|
|
4
|
+
version: 1.0.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ismael Marin
|
|
@@ -119,6 +119,7 @@ files:
|
|
|
119
119
|
- lib/skill_bench/agent/runner.rb
|
|
120
120
|
- lib/skill_bench/agent/summary.rb
|
|
121
121
|
- lib/skill_bench/cli.rb
|
|
122
|
+
- lib/skill_bench/cli/compare_command.rb
|
|
122
123
|
- lib/skill_bench/cli/eval/eval_command_registry.rb
|
|
123
124
|
- lib/skill_bench/cli/eval/eval_commands.rb
|
|
124
125
|
- lib/skill_bench/cli/eval/eval_options.rb
|
|
@@ -139,6 +140,7 @@ files:
|
|
|
139
140
|
- lib/skill_bench/clients/providers/deepseek.rb
|
|
140
141
|
- lib/skill_bench/clients/providers/gemini.rb
|
|
141
142
|
- lib/skill_bench/clients/providers/groq.rb
|
|
143
|
+
- lib/skill_bench/clients/providers/mock.rb
|
|
142
144
|
- lib/skill_bench/clients/providers/null_client.rb
|
|
143
145
|
- lib/skill_bench/clients/providers/ollama.rb
|
|
144
146
|
- lib/skill_bench/clients/providers/openai.rb
|
|
@@ -191,21 +193,40 @@ files:
|
|
|
191
193
|
- lib/skill_bench/output_formatter.rb
|
|
192
194
|
- lib/skill_bench/package_verifier.rb
|
|
193
195
|
- lib/skill_bench/rails/skill_templates.rb
|
|
196
|
+
- lib/skill_bench/registry/pack_resolver.rb
|
|
194
197
|
- lib/skill_bench/runner.rb
|
|
198
|
+
- lib/skill_bench/services/agent_spawner_service.rb
|
|
199
|
+
- lib/skill_bench/services/compare_option_parser.rb
|
|
200
|
+
- lib/skill_bench/services/comparison_reporter.rb
|
|
201
|
+
- lib/skill_bench/services/comparison_runner.rb
|
|
202
|
+
- lib/skill_bench/services/context_loader_service.rb
|
|
195
203
|
- lib/skill_bench/services/delta_table_formatter.rb
|
|
204
|
+
- lib/skill_bench/services/error_response_builder.rb
|
|
205
|
+
- lib/skill_bench/services/eval_resolver.rb
|
|
206
|
+
- lib/skill_bench/services/exit_code_calculator.rb
|
|
196
207
|
- lib/skill_bench/services/feedback_generator.rb
|
|
197
208
|
- lib/skill_bench/services/formatting_helpers.rb
|
|
198
209
|
- lib/skill_bench/services/iteration_formatter.rb
|
|
199
210
|
- lib/skill_bench/services/json_formatter.rb
|
|
211
|
+
- lib/skill_bench/services/judge_params_builder.rb
|
|
200
212
|
- lib/skill_bench/services/judge_score_parser_service.rb
|
|
201
213
|
- lib/skill_bench/services/junit_formatter.rb
|
|
214
|
+
- lib/skill_bench/services/manifest_finder.rb
|
|
202
215
|
- lib/skill_bench/services/option_parser_service.rb
|
|
216
|
+
- lib/skill_bench/services/output_formatter.rb
|
|
203
217
|
- lib/skill_bench/services/output_persistence_service.rb
|
|
218
|
+
- lib/skill_bench/services/prompt_builder_service.rb
|
|
219
|
+
- lib/skill_bench/services/provider_resolver.rb
|
|
204
220
|
- lib/skill_bench/services/result_printer_service.rb
|
|
205
221
|
- lib/skill_bench/services/runner_service.rb
|
|
206
222
|
- lib/skill_bench/services/skill_resolver.rb
|
|
223
|
+
- lib/skill_bench/services/skill_resolver_service.rb
|
|
224
|
+
- lib/skill_bench/services/source_path_resolver_service.rb
|
|
207
225
|
- lib/skill_bench/services/template_registry.rb
|
|
208
226
|
- lib/skill_bench/services/template_registry/category_data.rb
|
|
227
|
+
- lib/skill_bench/services/trend_recorder_service.rb
|
|
228
|
+
- lib/skill_bench/services/variant_parser.rb
|
|
229
|
+
- lib/skill_bench/services/variant_resolver.rb
|
|
209
230
|
- lib/skill_bench/task.rb
|
|
210
231
|
- lib/skill_bench/task/evaluator.rb
|
|
211
232
|
- lib/skill_bench/task/file_reader.rb
|
|
@@ -241,7 +262,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
241
262
|
- !ruby/object:Gem::Version
|
|
242
263
|
version: '0'
|
|
243
264
|
requirements: []
|
|
244
|
-
rubygems_version: 4.0.
|
|
265
|
+
rubygems_version: 4.0.12
|
|
245
266
|
specification_version: 4
|
|
246
267
|
summary: The evaluation engine for AI Agent Skills benchmarking.
|
|
247
268
|
test_files: []
|