ruby-skill-bench 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +231 -0
- data/lib/skill_bench/agent/react_agent.rb +2 -1
- data/lib/skill_bench/cli/compare_command.rb +91 -0
- data/lib/skill_bench/cli/help_printer.rb +9 -1
- data/lib/skill_bench/cli/run_command.rb +6 -4
- data/lib/skill_bench/cli.rb +7 -4
- data/lib/skill_bench/clients/all.rb +2 -0
- data/lib/skill_bench/clients/base_client.rb +2 -5
- data/lib/skill_bench/clients/providers/mock.rb +56 -0
- data/lib/skill_bench/clients/request_builder.rb +2 -4
- data/lib/skill_bench/clients/response_builder.rb +91 -0
- data/lib/skill_bench/clients/response_error_handler.rb +5 -17
- data/lib/skill_bench/clients/retry_handler.rb +4 -7
- data/lib/skill_bench/commands/run.rb +6 -2
- data/lib/skill_bench/config/applier.rb +1 -0
- data/lib/skill_bench/config/defaults.rb +1 -0
- data/lib/skill_bench/config/facade_readers.rb +7 -0
- data/lib/skill_bench/config/json_loader.rb +3 -3
- data/lib/skill_bench/config/store.rb +5 -0
- data/lib/skill_bench/config.rb +10 -1
- data/lib/skill_bench/constants.rb +58 -0
- data/lib/skill_bench/delta_report.rb +20 -0
- data/lib/skill_bench/execution/context_hydrator.rb +16 -6
- data/lib/skill_bench/execution/sandbox.rb +18 -3
- data/lib/skill_bench/execution/source_path_resolver.rb +59 -3
- data/lib/skill_bench/registry/pack_resolver.rb +119 -0
- data/lib/skill_bench/services/agent_spawner_service.rb +114 -0
- data/lib/skill_bench/services/compare_option_parser.rb +55 -0
- data/lib/skill_bench/services/comparison_reporter.rb +97 -0
- data/lib/skill_bench/services/comparison_runner.rb +49 -0
- data/lib/skill_bench/services/context_loader_service.rb +42 -0
- data/lib/skill_bench/services/error_response_builder.rb +119 -0
- data/lib/skill_bench/services/eval_resolver.rb +33 -0
- data/lib/skill_bench/services/exit_code_calculator.rb +39 -0
- data/lib/skill_bench/services/judge_params_builder.rb +54 -0
- data/lib/skill_bench/services/manifest_finder.rb +36 -0
- data/lib/skill_bench/services/output_formatter.rb +28 -0
- data/lib/skill_bench/services/prompt_builder_service.rb +98 -0
- data/lib/skill_bench/services/provider_resolver.rb +73 -0
- data/lib/skill_bench/services/runner_service.rb +84 -315
- data/lib/skill_bench/services/skill_resolver.rb +37 -9
- data/lib/skill_bench/services/skill_resolver_service.rb +70 -0
- data/lib/skill_bench/services/source_path_resolver_service.rb +45 -0
- data/lib/skill_bench/services/trend_recorder_service.rb +67 -0
- data/lib/skill_bench/services/variant_parser.rb +32 -0
- data/lib/skill_bench/services/variant_resolver.rb +63 -0
- data/lib/skill_bench/tools/run_command.rb +2 -17
- data/lib/skill_bench/version.rb +1 -1
- data/lib/skill_bench.rb +1 -0
- metadata +25 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d2ad524e13bc006a56f0197d07b3ba7b0ce2f99f60b61f0739c3d5bc0d75a687
|
|
4
|
+
data.tar.gz: a920c473148b52584653acbb1e91cb3973791c09de6c4df994a77c097eabc476
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c1f131af9bcde90e7fc3a7e6bef7f3770edfa4e2826ee19c3aabf5c210d6d3b6e5bdd460778a87f6fdc77b5b99bc17b2225e1b79de31674ad4acfe1bbc89f862
|
|
7
|
+
data.tar.gz: d8e3791c91242b25779afa3a21c57daeb06995bc4c65b01ca6f378a69491aeec95c5844ea541e5dcaf18b46d8e7f153ffb38ff2bb060ab5dacd653c8c1026bcd
|
data/README.md
CHANGED
|
@@ -7,6 +7,21 @@
|
|
|
7
7
|
|
|
8
8
|
*A high-fidelity evaluation engine for benchmarking AI agent skills across any stack (Rails-first, but extensible).*
|
|
9
9
|
|
|
10
|
+
## Part of the AI Skill Ecosystem
|
|
11
|
+
|
|
12
|
+
This repo is one of 6 in a composable AI skill ecosystem:
|
|
13
|
+
|
|
14
|
+
| Repo | Role |
|
|
15
|
+
|------|------|
|
|
16
|
+
| [`ruby-core-skills`](https://github.com/igmarin/ruby-core-skills) | 15 shared Ruby skills + process discipline |
|
|
17
|
+
| [`rails-agent-skills`](https://github.com/igmarin/rails-agent-skills) | 28 Rails-specific skills + 9 agents |
|
|
18
|
+
| [`hanakai-yaku`](https://github.com/igmarin/hanakai-yaku) | 35 Hanami/dry-rb skills + 10 agents |
|
|
19
|
+
| [`agnostic-planning-skills`](https://github.com/igmarin/agnostic-planning-skills) | 10 planning skills + 4 agents |
|
|
20
|
+
| [`agent-mcp-runtime`](https://github.com/igmarin/agent-mcp-runtime) | Rust CLI runtime (pack resolution, MCP) |
|
|
21
|
+
| [**`ruby-skill-bench`**](https://github.com/igmarin/ruby-skill-bench) | Benchmark/eval engine |
|
|
22
|
+
|
|
23
|
+
See the [Ecosystem Overview](https://github.com/igmarin/agent-mcp-runtime/blob/main/docs/ecosystem.md) for the full architecture.
|
|
24
|
+
|
|
10
25
|
---
|
|
11
26
|
|
|
12
27
|
## Features
|
|
@@ -343,6 +358,77 @@ Both skill contexts are concatenated and sent to the agent. The judge evaluates
|
|
|
343
358
|
|
|
344
359
|
---
|
|
345
360
|
|
|
361
|
+
## Multi-Repo Skill Benchmarking
|
|
362
|
+
|
|
363
|
+
Skills in the ecosystem are split across multiple repos:
|
|
364
|
+
- `ruby-core-skills` — 15 shared Ruby skills (DDD, patterns, process discipline)
|
|
365
|
+
- `rails-agent-skills` — 28 Rails-specific skills
|
|
366
|
+
- `hanakai-yaku` — 35 Hanami/dry-rb skills
|
|
367
|
+
|
|
368
|
+
To benchmark a skill from an external repo, use the `--skill` flag:
|
|
369
|
+
|
|
370
|
+
```bash
|
|
371
|
+
# Benchmark a core skill
|
|
372
|
+
skill-bench run evals/skills/write-yard-docs/basic \
|
|
373
|
+
--skill /path/to/ruby-core-skills/skills/patterns/write-yard-docs
|
|
374
|
+
|
|
375
|
+
# Benchmark a Rails skill
|
|
376
|
+
skill-bench run evals/skills/code-review/pr-review \
|
|
377
|
+
--skill /path/to/rails-agent-skills/skills/code-quality/code-review
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
### Config-Based Multi-Repo Resolution
|
|
381
|
+
|
|
382
|
+
Configure `skill_sources` in `skill-bench.json` to automatically resolve skills across repos without `--skill` every time:
|
|
383
|
+
|
|
384
|
+
```json
|
|
385
|
+
{
|
|
386
|
+
"provider": "openai",
|
|
387
|
+
"model": "gpt-4o",
|
|
388
|
+
"skill_sources": {
|
|
389
|
+
"core": "../ruby-core-skills/skills",
|
|
390
|
+
"rails": "../rails-agent-skills/skills",
|
|
391
|
+
"hanami": "../hanakai-yaku/skills"
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
Each key is a source name (used for logging); each value is a path to a `skills/` directory. When a skill is not found locally, SkillBench iterates through `skill_sources` in order and uses the first match.
|
|
397
|
+
|
|
398
|
+
### Pack-Based Resolution (`--pack`)
|
|
399
|
+
|
|
400
|
+
Resolve skills via the ecosystem registry manifest (from `agent-mcp-runtime`):
|
|
401
|
+
|
|
402
|
+
```bash
|
|
403
|
+
# Run an eval using the Rails pack's version of code-review
|
|
404
|
+
skill-bench run evals/skills/code-review/basic \
|
|
405
|
+
--skill code-review \
|
|
406
|
+
--pack rails
|
|
407
|
+
|
|
408
|
+
# Override the default registry manifest path
|
|
409
|
+
skill-bench run evals/skills/code-review/basic \
|
|
410
|
+
--skill code-review \
|
|
411
|
+
--pack rails \
|
|
412
|
+
--registry-manifest /path/to/registry.json
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### Variant Comparison (`compare`)
|
|
416
|
+
|
|
417
|
+
Compare the same skill across two pack variants to measure context-dependent performance:
|
|
418
|
+
|
|
419
|
+
```bash
|
|
420
|
+
skill-bench compare code-review \
|
|
421
|
+
--variant-a "pack:rails" \
|
|
422
|
+
--variant-b "pack:hanami" \
|
|
423
|
+
--eval evals/skills/code-review/basic
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
The `--variant-a` and `--variant-b` specs each support two forms:
|
|
427
|
+
- `pack:<name>` — resolve via registry manifest
|
|
428
|
+
- `/absolute/path` or `relative/path` — use a direct path
|
|
429
|
+
|
|
430
|
+
---
|
|
431
|
+
|
|
346
432
|
## File Reference: What Lives on Disk
|
|
347
433
|
|
|
348
434
|
SkillBench creates and manages three files in your project. Understanding them helps you iterate faster.
|
|
@@ -773,6 +859,151 @@ bundle exec ruby -Itest test/integration_test.rb
|
|
|
773
859
|
- `test/agent_eval/` — CLI, models, and service tests
|
|
774
860
|
- `test/clients/` — Provider client tests
|
|
775
861
|
|
|
862
|
+
---
|
|
863
|
+
|
|
864
|
+
## Security
|
|
865
|
+
|
|
866
|
+
### Threat Model
|
|
867
|
+
|
|
868
|
+
Ruby Skill Bench is designed with security as a primary concern. The system executes AI agents in isolated environments and must protect against various attack vectors:
|
|
869
|
+
|
|
870
|
+
- **Path Traversal:** Preventing agents from accessing files outside the sandbox
|
|
871
|
+
- **Command Injection:** Preventing execution of arbitrary shell commands
|
|
872
|
+
- **Resource Exhaustion:** Preventing denial-of-service through resource consumption
|
|
873
|
+
- **Information Leakage:** Protecting sensitive data like API keys
|
|
874
|
+
|
|
875
|
+
### Security Features
|
|
876
|
+
|
|
877
|
+
#### Path Traversal Protection
|
|
878
|
+
|
|
879
|
+
- **Symlink Validation:** All symlinks are validated to ensure they don't escape the sandbox
|
|
880
|
+
- **TOCTOU Mitigation:** Path validation is re-checked after directory creation operations
|
|
881
|
+
- **Path Normalization:** All paths are normalized and validated against working directory boundaries
|
|
882
|
+
- **Character Validation:** Paths are validated against strict character patterns
|
|
883
|
+
|
|
884
|
+
#### Command Execution Security
|
|
885
|
+
|
|
886
|
+
- **Command Allowlist:** Only explicitly allowed commands can be executed
|
|
887
|
+
- **Dangerous Commands Blocklist:** Dangerous commands (bash, curl, sudo, etc.) are always blocked
|
|
888
|
+
- **Shell Tokenization:** Commands are tokenized before execution to prevent shell injection
|
|
889
|
+
- **Docker Isolation:** Commands can be executed in isolated Docker containers with hardened security settings
|
|
890
|
+
|
|
891
|
+
#### Docker Security Hardening
|
|
892
|
+
|
|
893
|
+
When Docker is available, containers are launched with hardened security settings:
|
|
894
|
+
|
|
895
|
+
- **Non-root User:** Containers run as a non-root user
|
|
896
|
+
- **Privilege Prevention:** `--security-opt no-new-privileges` prevents privilege escalation
|
|
897
|
+
- **Capability Dropping:** All Linux capabilities are dropped except minimal needed ones
|
|
898
|
+
- **Network Isolation:** `--network none` disables network access
|
|
899
|
+
- **Read-only Root:** Container filesystem is read-only (except for mounted volumes)
|
|
900
|
+
|
|
901
|
+
#### Resource Limits
|
|
902
|
+
|
|
903
|
+
- **File Size Limits:** Individual files in context hydration are limited to 50KB
|
|
904
|
+
- **Total Context Size:** Total context size is limited to 1MB to prevent memory exhaustion
|
|
905
|
+
- **Execution Timeout:** Commands are limited to a configurable timeout (default: 30 seconds)
|
|
906
|
+
- **Max Iterations:** Agent loops are limited to prevent infinite loops
|
|
907
|
+
|
|
908
|
+
### API Key Security
|
|
909
|
+
|
|
910
|
+
- **Environment Variables:** API keys are loaded from environment variables, not hardcoded
|
|
911
|
+
- **Configuration Hierarchy:** Keys can be set in `skill-bench.json` or environment variables
|
|
912
|
+
- **No Logging:** API keys are never logged or exposed in error messages
|
|
913
|
+
- **Provider-Specific Keys:** Each provider uses its own API key configuration
|
|
914
|
+
|
|
915
|
+
### Best Practices for Users
|
|
916
|
+
|
|
917
|
+
1. **Never Commit API Keys:** Never commit `skill-bench.json` with API keys to version control
|
|
918
|
+
2. **Use Environment Variables:** Prefer environment variables for sensitive configuration
|
|
919
|
+
3. **Minimal Command Allowlist:** Only allow commands necessary for your evals
|
|
920
|
+
4. **Regular Updates:** Keep dependencies updated to patch security vulnerabilities
|
|
921
|
+
5. **Review Changes:** Review skill files before execution to ensure they don't contain malicious code
|
|
922
|
+
|
|
923
|
+
### Reporting Security Issues
|
|
924
|
+
|
|
925
|
+
If you discover a security vulnerability:
|
|
926
|
+
|
|
927
|
+
1. **Do Not Open a Public Issue:** Send a private email to the maintainers
|
|
928
|
+
2. **Provide Details:** Include steps to reproduce and potential impact
|
|
929
|
+
3. **Allow Time for Fix:** Give maintainers time to address the issue before disclosure
|
|
930
|
+
4. **Follow Responsible Disclosure:** Follow responsible disclosure practices
|
|
931
|
+
|
|
932
|
+
---
|
|
933
|
+
|
|
934
|
+
## Troubleshooting
|
|
935
|
+
|
|
936
|
+
### Common Issues and Solutions
|
|
937
|
+
|
|
938
|
+
#### Configuration Issues
|
|
939
|
+
|
|
940
|
+
**Problem:** "Config load failed, using mock provider"
|
|
941
|
+
- **Solution:** Ensure your `skill-bench.json` file is properly formatted JSON and contains required fields
|
|
942
|
+
- **Check:** Verify the file exists in your project root or home directory
|
|
943
|
+
|
|
944
|
+
**Problem:** "API Key not set for [Provider]"
|
|
945
|
+
- **Solution:** Set the appropriate environment variable (e.g., `SKILL_BENCH_OPENAI_API_KEY`) or add it to your `skill-bench.json`
|
|
946
|
+
- **Check:** Run `env | grep SKILL_BENCH` to verify environment variables are set
|
|
947
|
+
|
|
948
|
+
**Problem:** "No allowed commands configured"
|
|
949
|
+
- **Solution:** Add `allowed_commands` array to your `skill-bench.json` with the commands you want to allow
|
|
950
|
+
- **Check:** Ensure commands are in the allowlist and not in the dangerous commands list
|
|
951
|
+
|
|
952
|
+
#### Execution Issues
|
|
953
|
+
|
|
954
|
+
**Problem:** "Command execution timed out"
|
|
955
|
+
- **Solution:** Increase `max_execution_time` in your `skill-bench.json` or simplify the task
|
|
956
|
+
- **Check:** Verify the command isn't hanging or waiting for input
|
|
957
|
+
|
|
958
|
+
**Problem:** "Docker container failed to start"
|
|
959
|
+
- **Solution:** Ensure Docker is running and you have permissions to run Docker commands
|
|
960
|
+
- **Check:** Run `docker info` to verify Docker daemon is accessible
|
|
961
|
+
|
|
962
|
+
**Problem:** "Context hydration failed"
|
|
963
|
+
- **Solution:** Verify the source path exists and is a directory
|
|
964
|
+
- **Check:** Ensure the path is within the base directory and file sizes are under limits
|
|
965
|
+
|
|
966
|
+
#### Network Issues
|
|
967
|
+
|
|
968
|
+
**Problem:** "Network Error: Connection refused"
|
|
969
|
+
- **Solution:** Check your internet connection and API provider status
|
|
970
|
+
- **Check:** Verify the base URL in your configuration is correct
|
|
971
|
+
|
|
972
|
+
**Problem:** "API Request failed: 429"
|
|
973
|
+
- **Solution:** This is a rate limit error. The system will retry automatically
|
|
974
|
+
- **Check:** Reduce request frequency or check your API quota
|
|
975
|
+
|
|
976
|
+
#### Test Failures
|
|
977
|
+
|
|
978
|
+
**Problem:** Tests fail with "WebMock::NetConnectNotAllowedError"
|
|
979
|
+
- **Solution:** This occurs when tests try to make real HTTP requests. Ensure test stubs are properly configured
|
|
980
|
+
- **Check:** Verify WebMock is properly stubbing the expected URLs
|
|
981
|
+
|
|
982
|
+
**Problem:** "E2E sibling repositories not present"
|
|
983
|
+
- **Solution:** This is expected if you don't have the agent-mcp-runtime repository cloned
|
|
984
|
+
- **Check:** These tests will be skipped and won't affect the overall test results
|
|
985
|
+
|
|
986
|
+
### Debug Mode
|
|
987
|
+
|
|
988
|
+
For detailed debugging, you can enable verbose logging:
|
|
989
|
+
|
|
990
|
+
```bash
|
|
991
|
+
# Set environment variable for verbose logging
|
|
992
|
+
export SKILL_BENCH_DEBUG=true
|
|
993
|
+
skill-bench run my-eval --skill=my-skill
|
|
994
|
+
```
|
|
995
|
+
|
|
996
|
+
### Getting Help
|
|
997
|
+
|
|
998
|
+
If you encounter issues not covered here:
|
|
999
|
+
|
|
1000
|
+
1. Check the [GitHub Issues](https://github.com/igmarin/ruby-skill-bench/issues) for similar problems
|
|
1001
|
+
2. Create a new issue with detailed information about your environment and the problem
|
|
1002
|
+
3. Include Ruby version, SkillBench version, and error messages
|
|
1003
|
+
4. Provide steps to reproduce the issue
|
|
1004
|
+
|
|
1005
|
+
---
|
|
1006
|
+
|
|
776
1007
|
## CI/CD Integration
|
|
777
1008
|
|
|
778
1009
|
GitHub Actions workflow included (`.github/workflows/ci.yml`):
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../constants'
|
|
3
4
|
require_relative 'react_agent/step'
|
|
4
5
|
require_relative 'react_agent/loop_runner'
|
|
5
6
|
|
|
@@ -29,7 +30,7 @@ module SkillBench
|
|
|
29
30
|
def initialize(params)
|
|
30
31
|
@system_prompt = params[:system_prompt]
|
|
31
32
|
@initial_prompt = params[:initial_prompt]
|
|
32
|
-
@max_iterations = params[:max_iterations] ||
|
|
33
|
+
@max_iterations = params[:max_iterations] || Constants::ReactAgent::DEFAULT_MAX_ITERATIONS
|
|
33
34
|
@working_dir = params[:working_dir] || Dir.pwd
|
|
34
35
|
@container_id = params[:container_id]
|
|
35
36
|
@client_params = params[:client_params] || {}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../services/compare_option_parser'
|
|
4
|
+
require_relative '../services/variant_parser'
|
|
5
|
+
require_relative '../services/comparison_runner'
|
|
6
|
+
require_relative '../services/comparison_reporter'
|
|
7
|
+
require_relative '../services/exit_code_calculator'
|
|
8
|
+
|
|
9
|
+
module SkillBench
  module Cli
    # Handles the `skill-bench compare` command.
    # Runs the same eval with two skill variants and reports the comparison.
    class CompareCommand
      # Usage summary printed after every argument-validation error.
      USAGE = 'Usage: skill-bench compare <skill-name> --variant-a <spec> --variant-b <spec> --eval <path>'

      # Required options, in the order they are validated, mapped to the
      # flag name shown in the error message.
      REQUIRED_OPTIONS = {
        variant_a: '--variant-a',
        variant_b: '--variant-b',
        eval: '--eval'
      }.freeze

      # Parses argv and executes the comparison.
      #
      # @param argv [Array<String>] Raw CLI arguments
      # @return [Integer] Exit code
      def self.call(argv)
        new(argv).call
      end

      # @param argv [Array<String>] Raw CLI arguments
      def initialize(argv)
        @argv = argv
      end

      # Parses options, validates the required arguments, runs both variants,
      # and prints a comparison report.
      #
      # @return [Integer] 0 when help was requested, 1 on a validation failure
      #   or any StandardError, otherwise whatever
      #   Services::ExitCodeCalculator returns for the two results
      def call
        options = Services::CompareOptionParser.call(@argv)

        # NOTE(review): assumes the option parser has already removed the
        # flags from @argv, leaving the skill name as the first positional.
        skill_name = @argv.shift

        missing = missing_argument(skill_name, options)
        return usage_error(missing) if missing

        run_comparison(skill_name, options)
      rescue SkillBench::HelpRequested
        0
      rescue StandardError => e
        warn "Error: #{e.message}"
        1
      end

      private

      # Finds the first required argument that was not supplied.
      #
      # @param skill_name [String, nil] Positional skill name
      # @param options [Hash] Parsed options
      # @return [String, nil] Label of the missing argument, or nil if complete
      def missing_argument(skill_name, options)
        return 'skill name' unless skill_name

        _key, flag = REQUIRED_OPTIONS.find { |key, _flag| !options[key] }
        flag
      end

      # Reports a missing argument together with the usage line.
      #
      # @param subject [String] Label of the missing argument
      # @return [Integer] Always 1
      def usage_error(subject)
        warn "Error: #{subject} is required"
        warn USAGE
        1
      end

      # Resolves both variant specs, runs the comparison, and reports it.
      #
      # @param skill_name [String] Skill being compared
      # @param options [Hash] Parsed options (:variant_a, :variant_b, :eval)
      # @return [Integer] Exit code from Services::ExitCodeCalculator
      def run_comparison(skill_name, options)
        spec_a = options[:variant_a]
        spec_b = options[:variant_b]
        variant_a = Services::VariantParser.call(spec_a)
        variant_b = Services::VariantParser.call(spec_b)

        # NOTE(review): both banners are printed before the runner is invoked,
        # so they do not mark when each variant actually starts.
        puts "--- Running Variant A: #{spec_a} ---"
        puts "--- Running Variant B: #{spec_b} ---"

        results = Services::ComparisonRunner.call(variant_a, variant_b, skill_name, options[:eval])
        result_a = results[:result_a]
        result_b = results[:result_b]

        Services::ComparisonReporter.call(result_a, result_b, spec_a, spec_b)
        Services::ExitCodeCalculator.call(result_a, result_b)
      end
    end
  end
end
|
|
@@ -19,11 +19,19 @@ module SkillBench
|
|
|
19
19
|
Providers: #{providers}
|
|
20
20
|
--force Overwrite existing config file
|
|
21
21
|
|
|
22
|
-
run <eval> --skill <name> [--skill <name>] [--format FORMAT]
|
|
22
|
+
run <eval> --skill <name> [--skill <name>] [--format FORMAT] [--pack NAME]
|
|
23
23
|
Run an evaluation
|
|
24
24
|
--skill Skill to use (can be specified multiple times)
|
|
25
|
+
--pack Pack context for registry-based skill resolution
|
|
26
|
+
--registry-manifest PATH Path to registry.json manifest
|
|
25
27
|
--format Output format: human, json, junit (default: human)
|
|
26
28
|
|
|
29
|
+
compare <skill-name> --variant-a SPEC --variant-b SPEC --eval PATH
|
|
30
|
+
Compare the same skill across two pack variants
|
|
31
|
+
--variant-a First variant (e.g., "pack:rails" or "/path/to/skill")
|
|
32
|
+
--variant-b Second variant (e.g., "pack:hanami")
|
|
33
|
+
--eval Path to the eval directory
|
|
34
|
+
|
|
27
35
|
skill new <name> [--mode MODE] [--template TYPE]
|
|
28
36
|
Create a new skill
|
|
29
37
|
--mode simple, advanced, or rails (default: simple)
|
|
@@ -29,7 +29,7 @@ module SkillBench
|
|
|
29
29
|
|
|
30
30
|
eval_name = @argv.shift
|
|
31
31
|
return error_missing_eval unless eval_name
|
|
32
|
-
return error_missing_skill if options[:skill_names].empty?
|
|
32
|
+
return error_missing_skill if options[:skill_names].empty? && !options[:pack]
|
|
33
33
|
|
|
34
34
|
options[:eval_name] = eval_name
|
|
35
35
|
exec_options = options.reject { |key| key == :format }
|
|
@@ -48,6 +48,8 @@ module SkillBench
|
|
|
48
48
|
OptionParser.new do |opts|
|
|
49
49
|
opts.banner = 'Usage: skill-bench run <eval> [options]'
|
|
50
50
|
opts.on('--skill NAME', 'Skill to use (can be specified multiple times)') { |v| options[:skill_names] << v }
|
|
51
|
+
opts.on('--pack NAME', 'Pack context for skill resolution') { |v| options[:pack] = v }
|
|
52
|
+
opts.on('--registry-manifest PATH', 'Path to registry.json manifest') { |v| options[:registry_manifest] = v }
|
|
51
53
|
opts.on('--format FORMAT', 'Output format (human, json, junit)') { |v| options[:format] = v.to_sym }
|
|
52
54
|
opts.on('-h', '--help', 'Prints this help') do
|
|
53
55
|
puts opts
|
|
@@ -58,13 +60,13 @@ module SkillBench
|
|
|
58
60
|
|
|
59
61
|
def error_missing_eval
|
|
60
62
|
warn 'Error: eval name is required'
|
|
61
|
-
warn 'Usage: skill-bench run <eval> --skill <name>'
|
|
63
|
+
warn 'Usage: skill-bench run <eval> [--skill <name>] [--pack <name>]'
|
|
62
64
|
1
|
|
63
65
|
end
|
|
64
66
|
|
|
65
67
|
def error_missing_skill
|
|
66
|
-
warn 'Error: skill name is required'
|
|
67
|
-
warn 'Usage: skill-bench run <eval> --skill <name>'
|
|
68
|
+
warn 'Error: skill name or pack is required'
|
|
69
|
+
warn 'Usage: skill-bench run <eval> --skill <name> [--pack <name>]'
|
|
68
70
|
1
|
|
69
71
|
end
|
|
70
72
|
end
|
data/lib/skill_bench/cli.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative 'cli/init_command'
|
|
4
4
|
require_relative 'cli/run_command'
|
|
5
|
+
require_relative 'cli/compare_command'
|
|
5
6
|
require_relative 'cli/skill_command'
|
|
6
7
|
require_relative 'cli/eval_command'
|
|
7
8
|
require_relative 'cli/help_printer'
|
|
@@ -18,6 +19,7 @@ module SkillBench
|
|
|
18
19
|
# @param argv [Array<String>] Raw CLI arguments.
|
|
19
20
|
# @return [Integer] Exit code.
|
|
20
21
|
def self.call(argv)
|
|
22
|
+
Config.reset
|
|
21
23
|
new(argv).call
|
|
22
24
|
end
|
|
23
25
|
|
|
@@ -35,10 +37,11 @@ module SkillBench
|
|
|
35
37
|
|
|
36
38
|
subcommand = @argv.shift
|
|
37
39
|
case subcommand
|
|
38
|
-
when 'init'
|
|
39
|
-
when 'run'
|
|
40
|
-
when '
|
|
41
|
-
when '
|
|
40
|
+
when 'init' then Cli::InitCommand.call(@argv)
|
|
41
|
+
when 'run' then Cli::RunCommand.call(@argv)
|
|
42
|
+
when 'compare' then Cli::CompareCommand.call(@argv)
|
|
43
|
+
when 'skill' then Cli::SkillCommand.call(@argv)
|
|
44
|
+
when 'eval' then Cli::EvalCommand.call(@argv)
|
|
42
45
|
when '-h', '--help', 'help'
|
|
43
46
|
help.call
|
|
44
47
|
else
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative 'response_parser'
|
|
4
4
|
require_relative 'response_error_handler'
|
|
5
|
+
require_relative 'response_builder'
|
|
5
6
|
require_relative 'request_builder'
|
|
6
7
|
require_relative 'retry_handler'
|
|
7
8
|
require_relative 'base_client'
|
|
@@ -17,3 +18,4 @@ require_relative 'providers/opencode'
|
|
|
17
18
|
require_relative 'providers/groq'
|
|
18
19
|
require_relative 'providers/deepseek'
|
|
19
20
|
require_relative 'providers/openrouter'
|
|
21
|
+
require_relative 'providers/mock'
|
|
@@ -4,6 +4,7 @@ require_relative '../config'
|
|
|
4
4
|
require_relative 'provider_config'
|
|
5
5
|
require_relative 'response_parser'
|
|
6
6
|
require_relative 'response_error_handler'
|
|
7
|
+
require_relative 'response_builder'
|
|
7
8
|
require_relative 'request_builder'
|
|
8
9
|
require_relative 'retry_handler'
|
|
9
10
|
|
|
@@ -135,7 +136,7 @@ module SkillBench
|
|
|
135
136
|
else
|
|
136
137
|
"#{missing.first} not set for #{@provider_display_name}"
|
|
137
138
|
end
|
|
138
|
-
|
|
139
|
+
ResponseBuilder.error(message: message)
|
|
139
140
|
end
|
|
140
141
|
|
|
141
142
|
# Extracts the message hash from the provider's specific response body structure.
|
|
@@ -182,10 +183,6 @@ module SkillBench
|
|
|
182
183
|
message = extract_message(parsed)
|
|
183
184
|
return missing_message_response(response, parsed) unless ResponseParser.valid_message?(message)
|
|
184
185
|
|
|
185
|
-
success_response(parsed, message)
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
def success_response(parsed, message)
|
|
189
186
|
content = ResponseParser.extract_content(message)
|
|
190
187
|
{
|
|
191
188
|
success: true,
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../provider_registry'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module SkillBench
  module Clients
    module Providers
      # Mock LLM client for testing and local validation.
      # Registers itself under the :mock provider key when the file is loaded.
      class Mock
        SkillBench::Clients::ProviderRegistry.register(:mock, self)

        # Matches rubric lines of the form "- <name>: max_score=<int>".
        DIMENSION_PATTERN = /-\s+([^:]+):\s+max_score=(\d+)/.freeze

        # Matches a "## Skill Context" heading followed by non-blank content.
        SKILL_CONTEXT_PATTERN = /## Skill Context\s+\S+/.freeze

        # Mock call implementation to simulate LLM responses for test suites.
        #
        # Only the first message is inspected. Dimensions are read from its
        # content; when none are found a single 'correctness' dimension is
        # returned instead.
        #
        # @param system_prompt [String] system prompt instructions (ignored).
        # @param messages [Array<Hash>] chat history messages; the first entry
        #   may use either symbol or string keys for its content.
        # @param _options [Hash] additional keyword options (ignored).
        # @return [Hash] mock response hash whose message content is a JSON
        #   string with 'dimensions' and 'overall_reasoning'.
        def self.call(system_prompt:, messages:, **_options)
          _ = system_prompt

          # An empty (or nil) message list yields an empty prompt instead of
          # raising NoMethodError on nil.
          first_message = (messages || []).first || {}
          prompt = (first_message[:content] || first_message['content'] || '').to_s

          dimensions = build_dimensions(prompt)
          dimensions['correctness'] = { 'score' => 8, 'max_score' => 10, 'reasoning' => 'Mock correctness' } if dimensions.empty?

          content = {
            'dimensions' => dimensions,
            'overall_reasoning' => 'Mock evaluation overall reasoning'
          }.to_json

          {
            success: true,
            response: {
              message: {
                content: content
              }
            }
          }
        end

        # Builds one scored entry per rubric dimension found in the prompt.
        #
        # @param prompt [String] content of the first message.
        # @return [Hash{String => Hash}] dimension name to score details.
        def self.build_dimensions(prompt)
          # Give baseline slightly lower score than context to simulate
          # improvement. The check does not depend on the dimension, so it is
          # evaluated once rather than per match.
          ratio = prompt.match?(SKILL_CONTEXT_PATTERN) ? 0.95 : 0.8

          prompt.scan(DIMENSION_PATTERN).each_with_object({}) do |(name, max_score), scored|
            max = max_score.to_i
            scored[name] = {
              'score' => (max * ratio).round,
              'max_score' => max,
              'reasoning' => "Mock evaluation for #{name}"
            }
          end
        end
        private_class_method :build_dimensions
      end
    end
  end
end
|
|
@@ -1,22 +1,20 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'faraday'
|
|
4
|
+
require_relative '../constants'
|
|
4
5
|
|
|
5
6
|
module SkillBench
|
|
6
7
|
module Clients
|
|
7
8
|
# Builds and executes HTTP requests to LLM provider APIs.
|
|
8
9
|
# Encapsulates Faraday connection setup and request execution.
|
|
9
10
|
class RequestBuilder
|
|
10
|
-
DEFAULT_OPEN_TIMEOUT = 10
|
|
11
|
-
DEFAULT_TIMEOUT = 120
|
|
12
|
-
|
|
13
11
|
# Creates a Faraday connection with JSON middleware.
|
|
14
12
|
#
|
|
15
13
|
# @param base_url [String] The API base URL
|
|
16
14
|
# @param open_timeout [Integer] Connection open timeout in seconds
|
|
17
15
|
# @param timeout [Integer] Request timeout in seconds
|
|
18
16
|
# @return [Faraday::Connection] Configured Faraday connection
|
|
19
|
-
def self.build_connection(base_url, open_timeout: DEFAULT_OPEN_TIMEOUT, timeout: DEFAULT_TIMEOUT)
|
|
17
|
+
def self.build_connection(base_url, open_timeout: Constants::HttpClient::DEFAULT_OPEN_TIMEOUT, timeout: Constants::HttpClient::DEFAULT_TIMEOUT)
|
|
20
18
|
Faraday.new(url: base_url) do |f|
|
|
21
19
|
f.request :json
|
|
22
20
|
f.response :json
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SkillBench
  module Clients
    # Service object for building standardized response hashes.
    # Eliminates duplication of error response formatting across the codebase.
    class ResponseBuilder
      # Builds a standardized error response.
      #
      # @param message [String] The error message.
      # @param status [String] The status identifier (default: 'error').
      # @return [Hash] Standardized error response hash with keys
      #   :success, :response, :result, :status.
      def self.error(message:, status: 'error')
        {
          success: false,
          response: { error: { message: message } },
          result: message,
          status: status
        }
      end

      # Builds a standardized success response.
      #
      # @param content [String] The response content.
      # @param metadata [Hash, nil] Additional metadata merged into :response;
      #   a :content key in metadata overrides the content. nil is treated as
      #   an empty hash.
      # @return [Hash] Standardized success response hash.
      def self.success(content:, metadata: {})
        {
          success: true,
          result: content,
          response: { content: content }.merge(metadata || {}),
          status: 'success'
        }
      end

      # Builds a standardized API error response.
      #
      # @param error_message [String] The API error message.
      # @param usage [Hash] Token usage information.
      # @return [Hash] Standardized API error response hash; unlike the other
      #   error builders it also carries :usage.
      def self.api_error(error_message:, usage: {})
        text = "API Error: #{error_message}"
        {
          success: false,
          result: text,
          usage: usage,
          response: { error: { message: text } },
          status: 'error'
        }
      end

      # Builds a standardized network error response.
      #
      # @param error_message [String] The network error message.
      # @return [Hash] Standardized network error response hash.
      def self.network_error(error_message:)
        error(message: "Network Error: #{error_message}")
      end

      # Builds a standardized parsing error response.
      #
      # @param error_message [String] The parsing error message.
      # @return [Hash] Standardized parsing error response hash.
      def self.parsing_error(error_message:)
        error(message: "Parsing Error: #{error_message}")
      end

      # Builds a standardized unexpected error response.
      #
      # @param error_message [String] The unexpected error message.
      # @return [Hash] Standardized unexpected error response hash.
      def self.unexpected_error(error_message:)
        error(message: "Unexpected Error: #{error_message}")
      end
    end
  end
end
|