fableforge-bench-agent 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ name: Release to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ permissions:
9
+ id-token: write
10
+ contents: read
11
+
12
+ jobs:
13
+ build-and-publish:
14
+ name: Build and publish to PyPI
15
+ runs-on: ubuntu-latest
16
+ permissions:
17
+ id-token: write
18
+
19
+ steps:
20
+ - name: Checkout code
21
+ uses: actions/checkout@v4
22
+
23
+ - name: Set up Python
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: '3.12'
27
+
28
+ - name: Install build dependencies
29
+ run: python -m pip install --upgrade build
30
+
31
+ - name: Build package
32
+ run: python -m build
33
+
34
+ - name: Publish package to PyPI
35
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,15 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+ .venv/
13
+ venv/
14
+ *.so
15
+ .env
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 FableForge Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,224 @@
1
+ Metadata-Version: 2.4
2
+ Name: fableforge-bench-agent
3
+ Version: 0.1.0
4
+ Summary: HumanEval for tool use — a standardized benchmark for evaluating LLM tool-use capabilities
5
+ Author: FableForge
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: click>=8.0
10
+ Requires-Dist: docker>=6.0
11
+ Requires-Dist: jinja2>=3.1
12
+ Requires-Dist: pydantic>=2.0
13
+ Requires-Dist: rich>=13.0
14
+ Provides-Extra: dev
15
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
16
+ Requires-Dist: pytest>=7.0; extra == 'dev'
17
+ Description-Content-Type: text/markdown
18
+
19
+ # BenchAgent — HumanEval for Tool Use
20
+
21
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![Tests](https://img.shields.io/badge/tests-0-yellow.svg)](tests/)
22
+
23
+
24
+ A standardized benchmark for evaluating LLM tool-use capabilities across multiple categories: bash commands, code editing, code reading, code writing, multi-tool orchestration, and error recovery.
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install fableforge-bench-agent
30
+ ```
31
+
32
+ For development:
33
+
34
+ ```bash
35
+ pip install -e ".[dev]"
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ ```bash
41
+ # List available tasks
42
+ bench-agent list-tasks
43
+
44
+ # List tasks by category
45
+ bench-agent list-tasks --category bash
46
+
47
+ # Run benchmark against a model
48
+ bench-agent run --model gpt-4 --category bash
49
+
50
+ # Run all categories
51
+ bench-agent run --model fableforge-14b --all
52
+
53
+ # View leaderboard
54
+ bench-agent leaderboard
55
+
56
+ # Export leaderboard as markdown
57
+ bench-agent export --format markdown
58
+ ```
59
+
60
+ ## Task Categories
61
+
62
+ ### BASH (21 tasks)
63
+ Shell command execution: finding files, processing text, managing processes, network operations, log parsing, and system administration tasks.
64
+
65
+ ### EDIT (22 tasks)
66
+ Code modification: fixing bugs, refactoring code, adding features, changing APIs, adding type hints, converting sync to async, error handling, and API evolution.
67
+
68
+ ### READ (16 tasks)
69
+ Code comprehension: understanding structure, finding patterns, tracing execution, identifying vulnerabilities, and explaining code behavior.
70
+
71
+ ### WRITE (16 tasks)
72
+ Code creation: generating new files, configuration, tests, Dockerfiles, project scaffolding, and CI/CD pipelines.
73
+
74
+ ### MULTI-TOOL (16 tasks)
75
+ Complex tasks requiring 3+ tools in sequence: read → analyze → modify → verify, full project setup, and multi-file refactoring.
76
+
77
+ ### ERROR RECOVERY (16 tasks)
78
+ Fixing broken code, recovering from errors, handling edge cases: syntax errors, runtime errors, race conditions, security vulnerabilities, and infinite loops.
79
+
80
+ ## Scoring Methodology
81
+
82
+ Each task produces a `TaskResult`, which is scored on three weighted metrics:
83
+
84
+ | Metric | Weight | Description |
85
+ |--------|--------|-------------|
86
+ | Functional correctness | 60% | Does the solution work as expected? |
87
+ | Efficiency | 25% | Fewer turns and tokens = higher score |
88
+ | Error recovery | 15% | How well does the model recover from errors? |
89
+
90
+ For failed tasks, partial credit applies:
91
+
92
+ | Component | Weight | Description |
93
+ |-----------|--------|-------------|
94
+ | Partial completion | 50% | How close to a correct solution? |
95
+ | Error recovery rate | 30% | Were errors identified and addressed? |
96
+ | Efficiency | 20% | Resource usage despite failure |
97
+
98
+ ### Score Calculation
99
+
100
+ ```
101
+ Overall Score = 0.6 * functional_score + 0.15 * recovery_score + 0.25 * efficiency_score
102
+ ```
103
+
104
+ For failed tasks:
105
+ ```
106
+ Score = 0.5 * partial_credit + 0.3 * recovery_score + 0.2 * efficiency_score
107
+ ```
108
+
109
+ Final scores are scaled to 0–100.
110
+
111
+ ## Task Structure
112
+
113
+ Each task defines:
114
+
115
+ - **task_id**: Unique identifier (e.g., `bash-001`, `edit-015`)
116
+ - **category**: One of the six categories
117
+ - **difficulty**: `easy`, `medium`, or `hard`
118
+ - **description**: What the model needs to accomplish
119
+ - **initial_state**: Files to create before task execution
120
+ - **expected_outcome**: What constitutes success
121
+ - **tools_required**: Which tools the model should use
122
+ - **max_turns**: Maximum tool-use turns allowed
123
+ - **verification_script**: Python script to verify correctness
124
+
125
+ ## Task Counts
126
+
127
+ | Category | Count |
128
+ |----------|-------|
129
+ | BASH | 21 |
130
+ | EDIT | 22 |
131
+ | READ | 16 |
132
+ | WRITE | 16 |
133
+ | MULTI-TOOL | 16 |
134
+ | ERROR RECOVERY | 16 |
135
+ | **Total** | **107** |
136
+
137
+ ## Python API
138
+
139
+ ```python
140
+ from bench_agent.evaluator import evaluate_model
141
+ from bench_agent.runner import TaskRunner
142
+ from bench_agent.tasks import BASH_TASKS, EDIT_TASKS
143
+
144
+ # Run evaluation
145
+ report = evaluate_model(
146
+ model_name="gpt-4",
147
+ provider="openai",
148
+ categories=[TaskCategory.BASH, TaskCategory.EDIT],
149
+ num_tasks=10,
150
+ )
151
+
152
+ print(f"Total Score: {report.total_score}")
153
+ print(f"Category Scores: {report.category_scores}")
154
+ print(f"Error Recovery Rate: {report.error_recovery_rate}")
155
+ ```
156
+
157
+ ## Leaderboard
158
+
159
+ ```python
160
+ from bench_agent.leaderboard import load_leaderboard, update_leaderboard, export_markdown
161
+
162
+ lb = load_leaderboard("leaderboard.json")
163
+ lb = update_leaderboard(lb, "gpt-4", results)
164
+ print(export_markdown(lb))
165
+ ```
166
+
167
+ ## Architecture
168
+
169
+ ```
170
+ src/bench_agent/
171
+ ├── __init__.py # Package init
172
+ ├── models.py # Pydantic data models
173
+ ├── tasks.py # 107 task definitions
174
+ ├── runner.py # Task execution runner
175
+ ├── scorer.py # Scoring system
176
+ ├── leaderboard.py # Leaderboard management
177
+ ├── evaluator.py # Model evaluation
178
+ └── cli.py # Click CLI interface
179
+ ```
180
+
181
+ ## Development
182
+
183
+ ```bash
184
+ # Run tests
185
+ pytest tests/ -v
186
+
187
+ # Run with coverage
188
+ pytest tests/ -v --cov=bench_agent
189
+
190
+ # Lint
191
+ ruff check src/
192
+ ```
193
+
194
+ ## License
195
+
196
+ MIT
197
+
198
+ ## Ecosystem
199
+
200
+ Part of the [FableForge](../) ecosystem — 21 open-source projects built from 210K real agent traces:
201
+
202
+ | Project | Description |
203
+ | --- | --- |
204
+ | **[Anvil](../anvil)** | Self-verified coding agent |
205
+ | **[VerifyLoop](../verifyloop)** | Plan→Execute→Verify→Recover framework |
206
+ | **[ErrorRecovery](../error-recovery)** | Self-healing middleware (3,725 error patterns) |
207
+ | **[FableForge-14B](../fableforge-14b)** | The fine-tuned 14B model (4-stage training) |
208
+ | **[ShellWhisperer](../shell-whisperer)** | 1.5B edge agent (phone/RPi, 50ms) |
209
+ | **[ReasonCritic](../reason-critic)** | Verification model (130 benchmark tasks) |
210
+ | **[TraceCompiler](../trace-compiler)** | Compile traces → LoRA skills |
211
+ | **[AgentRuntime](../agent-runtime)** | Persistent agent daemon (systemd for AI) |
212
+ | **[AgentSwarm](../agent-swarm)** | Multi-agent from real trace transitions |
213
+ | **[AgentTelemetry](../agent-telemetry)** | Datadog for agents (token tracking, costs) |
214
+ | **[BenchAgent](../bench-agent)** | HumanEval for tool-use (107 tasks) |
215
+ | **[AgentDev](../agent-dev)** | VSCode extension with verification |
216
+ | **[TraceViz](../trace-viz)** | Trace replay visualizer (Next.js) |
217
+ | **[AgentSkills](../agent-skills)** | npm for agent behaviors |
218
+ | **[AgentCurriculum](../agent-curriculum)** | 5-stage progressive training |
219
+ | **[AgentFuzzer](../agent-fuzzer)** | Adversarial testing for agents |
220
+ | **[AgentConstitution](../agent-constitution)** | Safety guardrails from traces |
221
+ | **[CostOptimizer](../cost-optimizer)** | Token cost reduction (50-80%) |
222
+ | **[AgentProfiler](../agent-profiler)** | Behavioral fingerprinting |
223
+ | **[TrajectoryDistiller](../trajectory-distiller)** | Trace→training data pipeline |
224
+ | **[Fable5-Dataset](../fable5-dataset)** | HuggingFace dataset release |
@@ -0,0 +1,206 @@
1
+ # BenchAgent — HumanEval for Tool Use
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![Tests](https://img.shields.io/badge/tests-0-yellow.svg)](tests/)
4
+
5
+
6
+ A standardized benchmark for evaluating LLM tool-use capabilities across multiple categories: bash commands, code editing, code reading, code writing, multi-tool orchestration, and error recovery.
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ pip install fableforge-bench-agent
12
+ ```
13
+
14
+ For development:
15
+
16
+ ```bash
17
+ pip install -e ".[dev]"
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ```bash
23
+ # List available tasks
24
+ bench-agent list-tasks
25
+
26
+ # List tasks by category
27
+ bench-agent list-tasks --category bash
28
+
29
+ # Run benchmark against a model
30
+ bench-agent run --model gpt-4 --category bash
31
+
32
+ # Run all categories
33
+ bench-agent run --model fableforge-14b --all
34
+
35
+ # View leaderboard
36
+ bench-agent leaderboard
37
+
38
+ # Export leaderboard as markdown
39
+ bench-agent export --format markdown
40
+ ```
41
+
42
+ ## Task Categories
43
+
44
+ ### BASH (21 tasks)
45
+ Shell command execution: finding files, processing text, managing processes, network operations, log parsing, and system administration tasks.
46
+
47
+ ### EDIT (22 tasks)
48
+ Code modification: fixing bugs, refactoring code, adding features, changing APIs, adding type hints, converting sync to async, error handling, and API evolution.
49
+
50
+ ### READ (16 tasks)
51
+ Code comprehension: understanding structure, finding patterns, tracing execution, identifying vulnerabilities, and explaining code behavior.
52
+
53
+ ### WRITE (16 tasks)
54
+ Code creation: generating new files, configuration, tests, Dockerfiles, project scaffolding, and CI/CD pipelines.
55
+
56
+ ### MULTI-TOOL (16 tasks)
57
+ Complex tasks requiring 3+ tools in sequence: read → analyze → modify → verify, full project setup, and multi-file refactoring.
58
+
59
+ ### ERROR RECOVERY (16 tasks)
60
+ Fixing broken code, recovering from errors, handling edge cases: syntax errors, runtime errors, race conditions, security vulnerabilities, and infinite loops.
61
+
62
+ ## Scoring Methodology
63
+
64
+ Each task produces a `TaskResult`, which is scored on three weighted metrics:
65
+
66
+ | Metric | Weight | Description |
67
+ |--------|--------|-------------|
68
+ | Functional correctness | 60% | Does the solution work as expected? |
69
+ | Efficiency | 25% | Fewer turns and tokens = higher score |
70
+ | Error recovery | 15% | How well does the model recover from errors? |
71
+
72
+ For failed tasks, partial credit applies:
73
+
74
+ | Component | Weight | Description |
75
+ |-----------|--------|-------------|
76
+ | Partial completion | 50% | How close to a correct solution? |
77
+ | Error recovery rate | 30% | Were errors identified and addressed? |
78
+ | Efficiency | 20% | Resource usage despite failure |
79
+
80
+ ### Score Calculation
81
+
82
+ ```
83
+ Overall Score = 0.6 * functional_score + 0.15 * recovery_score + 0.25 * efficiency_score
84
+ ```
85
+
86
+ For failed tasks:
87
+ ```
88
+ Score = 0.5 * partial_credit + 0.3 * recovery_score + 0.2 * efficiency_score
89
+ ```
90
+
91
+ Final scores are scaled to 0–100.
92
+
93
+ ## Task Structure
94
+
95
+ Each task defines:
96
+
97
+ - **task_id**: Unique identifier (e.g., `bash-001`, `edit-015`)
98
+ - **category**: One of the six categories
99
+ - **difficulty**: `easy`, `medium`, or `hard`
100
+ - **description**: What the model needs to accomplish
101
+ - **initial_state**: Files to create before task execution
102
+ - **expected_outcome**: What constitutes success
103
+ - **tools_required**: Which tools the model should use
104
+ - **max_turns**: Maximum tool-use turns allowed
105
+ - **verification_script**: Python script to verify correctness
106
+
107
+ ## Task Counts
108
+
109
+ | Category | Count |
110
+ |----------|-------|
111
+ | BASH | 21 |
112
+ | EDIT | 22 |
113
+ | READ | 16 |
114
+ | WRITE | 16 |
115
+ | MULTI-TOOL | 16 |
116
+ | ERROR RECOVERY | 16 |
117
+ | **Total** | **107** |
118
+
119
+ ## Python API
120
+
121
+ ```python
122
+ from bench_agent.evaluator import evaluate_model
123
+ from bench_agent.runner import TaskRunner
124
+ from bench_agent.tasks import BASH_TASKS, EDIT_TASKS
125
+
126
+ # Run evaluation
127
+ report = evaluate_model(
128
+ model_name="gpt-4",
129
+ provider="openai",
130
+ categories=[TaskCategory.BASH, TaskCategory.EDIT],
131
+ num_tasks=10,
132
+ )
133
+
134
+ print(f"Total Score: {report.total_score}")
135
+ print(f"Category Scores: {report.category_scores}")
136
+ print(f"Error Recovery Rate: {report.error_recovery_rate}")
137
+ ```
138
+
139
+ ## Leaderboard
140
+
141
+ ```python
142
+ from bench_agent.leaderboard import load_leaderboard, update_leaderboard, export_markdown
143
+
144
+ lb = load_leaderboard("leaderboard.json")
145
+ lb = update_leaderboard(lb, "gpt-4", results)
146
+ print(export_markdown(lb))
147
+ ```
148
+
149
+ ## Architecture
150
+
151
+ ```
152
+ src/bench_agent/
153
+ ├── __init__.py # Package init
154
+ ├── models.py # Pydantic data models
155
+ ├── tasks.py # 107 task definitions
156
+ ├── runner.py # Task execution runner
157
+ ├── scorer.py # Scoring system
158
+ ├── leaderboard.py # Leaderboard management
159
+ ├── evaluator.py # Model evaluation
160
+ └── cli.py # Click CLI interface
161
+ ```
162
+
163
+ ## Development
164
+
165
+ ```bash
166
+ # Run tests
167
+ pytest tests/ -v
168
+
169
+ # Run with coverage
170
+ pytest tests/ -v --cov=bench_agent
171
+
172
+ # Lint
173
+ ruff check src/
174
+ ```
175
+
176
+ ## License
177
+
178
+ MIT
179
+
180
+ ## Ecosystem
181
+
182
+ Part of the [FableForge](../) ecosystem — 21 open-source projects built from 210K real agent traces:
183
+
184
+ | Project | Description |
185
+ | --- | --- |
186
+ | **[Anvil](../anvil)** | Self-verified coding agent |
187
+ | **[VerifyLoop](../verifyloop)** | Plan→Execute→Verify→Recover framework |
188
+ | **[ErrorRecovery](../error-recovery)** | Self-healing middleware (3,725 error patterns) |
189
+ | **[FableForge-14B](../fableforge-14b)** | The fine-tuned 14B model (4-stage training) |
190
+ | **[ShellWhisperer](../shell-whisperer)** | 1.5B edge agent (phone/RPi, 50ms) |
191
+ | **[ReasonCritic](../reason-critic)** | Verification model (130 benchmark tasks) |
192
+ | **[TraceCompiler](../trace-compiler)** | Compile traces → LoRA skills |
193
+ | **[AgentRuntime](../agent-runtime)** | Persistent agent daemon (systemd for AI) |
194
+ | **[AgentSwarm](../agent-swarm)** | Multi-agent from real trace transitions |
195
+ | **[AgentTelemetry](../agent-telemetry)** | Datadog for agents (token tracking, costs) |
196
+ | **[BenchAgent](../bench-agent)** | HumanEval for tool-use (107 tasks) |
197
+ | **[AgentDev](../agent-dev)** | VSCode extension with verification |
198
+ | **[TraceViz](../trace-viz)** | Trace replay visualizer (Next.js) |
199
+ | **[AgentSkills](../agent-skills)** | npm for agent behaviors |
200
+ | **[AgentCurriculum](../agent-curriculum)** | 5-stage progressive training |
201
+ | **[AgentFuzzer](../agent-fuzzer)** | Adversarial testing for agents |
202
+ | **[AgentConstitution](../agent-constitution)** | Safety guardrails from traces |
203
+ | **[CostOptimizer](../cost-optimizer)** | Token cost reduction (50-80%) |
204
+ | **[AgentProfiler](../agent-profiler)** | Behavioral fingerprinting |
205
+ | **[TrajectoryDistiller](../trajectory-distiller)** | Trace→training data pipeline |
206
+ | **[Fable5-Dataset](../fable5-dataset)** | HuggingFace dataset release |
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "fableforge-bench-agent"
7
+ version = "0.1.0"
8
+ description = "HumanEval for tool use — a standardized benchmark for evaluating LLM tool-use capabilities"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [{name = "FableForge"}]
13
+ dependencies = [
14
+ "pydantic>=2.0",
15
+ "rich>=13.0",
16
+ "jinja2>=3.1",
17
+ "docker>=6.0",
18
+ "click>=8.0",
19
+ ]
20
+
21
+ [project.scripts]
22
+ bench-agent = "bench_agent.cli:main"
23
+
24
+ [project.optional-dependencies]
25
+ dev = [
26
+ "pytest>=7.0",
27
+ "pytest-cov>=4.0",
28
+ ]
29
+
30
+ [tool.hatch.build.targets.wheel]
31
+ packages = ["src/bench_agent"]
32
+
33
+ [tool.pytest.ini_options]
34
+ testpaths = ["tests"]
@@ -0,0 +1,3 @@
1
+ """BenchAgent — HumanEval for tool use."""
2
+
3
+ __version__ = "0.1.0"