awb-0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. awb-0.2.0/.gitignore +19 -0
  2. awb-0.2.0/LICENSE +21 -0
  3. awb-0.2.0/PKG-INFO +298 -0
  4. awb-0.2.0/README.md +261 -0
  5. awb-0.2.0/awb/__init__.py +3 -0
  6. awb-0.2.0/awb/adapters/__init__.py +0 -0
  7. awb-0.2.0/awb/adapters/aider.py +28 -0
  8. awb-0.2.0/awb/adapters/base.py +50 -0
  9. awb-0.2.0/awb/adapters/claude_code.py +184 -0
  10. awb-0.2.0/awb/adapters/cursor.py +28 -0
  11. awb-0.2.0/awb/adapters/registry.py +63 -0
  12. awb-0.2.0/awb/cli.py +751 -0
  13. awb-0.2.0/awb/core/__init__.py +0 -0
  14. awb-0.2.0/awb/core/config.py +236 -0
  15. awb-0.2.0/awb/core/metrics.py +118 -0
  16. awb-0.2.0/awb/core/repo_manager.py +103 -0
  17. awb-0.2.0/awb/core/results.py +121 -0
  18. awb-0.2.0/awb/core/runner.py +191 -0
  19. awb-0.2.0/awb/core/task_loader.py +111 -0
  20. awb-0.2.0/awb/core/timeout.py +29 -0
  21. awb-0.2.0/awb/leaderboard/__init__.py +0 -0
  22. awb-0.2.0/awb/leaderboard/generate.py +120 -0
  23. awb-0.2.0/awb/leaderboard/static/leaderboard.js +222 -0
  24. awb-0.2.0/awb/leaderboard/static/style.css +308 -0
  25. awb-0.2.0/awb/leaderboard/templates/index.html +128 -0
  26. awb-0.2.0/awb/scoring/__init__.py +0 -0
  27. awb-0.2.0/awb/scoring/baselines.py +48 -0
  28. awb-0.2.0/awb/scoring/capabilities.py +111 -0
  29. awb-0.2.0/awb/scoring/composite.py +153 -0
  30. awb-0.2.0/awb/scoring/integrity.py +108 -0
  31. awb-0.2.0/awb/scoring/normalize.py +86 -0
  32. awb-0.2.0/awb/scoring/report.py +100 -0
  33. awb-0.2.0/awb/scoring/statistics.py +210 -0
  34. awb-0.2.0/awb/scoring/weights.yaml +29 -0
  35. awb-0.2.0/awb/submission/__init__.py +1 -0
  36. awb-0.2.0/awb/submission/compare.py +97 -0
  37. awb-0.2.0/awb/submission/ingest.py +170 -0
  38. awb-0.2.0/awb/submission/schema.py +113 -0
  39. awb-0.2.0/awb/tasks/__init__.py +0 -0
  40. awb-0.2.0/awb/tasks/_template.yaml +52 -0
  41. awb-0.2.0/awb/tasks/bug-fix/BF-001.yaml +62 -0
  42. awb-0.2.0/awb/tasks/bug-fix/BF-003.yaml +85 -0
  43. awb-0.2.0/awb/tasks/bug-fix/BF-004.yaml +91 -0
  44. awb-0.2.0/awb/tasks/bug-fix/BF-005.yaml +86 -0
  45. awb-0.2.0/awb/tasks/bug-fix/BF-006.yaml +104 -0
  46. awb-0.2.0/awb/tasks/bug-fix/BF-007.yaml +119 -0
  47. awb-0.2.0/awb/tasks/bug-fix/BF-008.yaml +97 -0
  48. awb-0.2.0/awb/tasks/bug-fix/BF-009.yaml +99 -0
  49. awb-0.2.0/awb/tasks/bug-fix/BF-010.yaml +78 -0
  50. awb-0.2.0/awb/tasks/bug-fix/BF-011.yaml +98 -0
  51. awb-0.2.0/awb/tasks/code-review/CR-001.yaml +106 -0
  52. awb-0.2.0/awb/tasks/code-review/CR-002.yaml +98 -0
  53. awb-0.2.0/awb/tasks/code-review/CR-003.yaml +97 -0
  54. awb-0.2.0/awb/tasks/code-review/CR-004.yaml +131 -0
  55. awb-0.2.0/awb/tasks/code-review/CR-005.yaml +106 -0
  56. awb-0.2.0/awb/tasks/code-review/CR-006.yaml +115 -0
  57. awb-0.2.0/awb/tasks/code-review/CR-007.yaml +99 -0
  58. awb-0.2.0/awb/tasks/debugging/DB-001.yaml +71 -0
  59. awb-0.2.0/awb/tasks/debugging/DB-002.yaml +97 -0
  60. awb-0.2.0/awb/tasks/debugging/DB-003.yaml +70 -0
  61. awb-0.2.0/awb/tasks/debugging/DB-004.yaml +75 -0
  62. awb-0.2.0/awb/tasks/debugging/DB-005.yaml +81 -0
  63. awb-0.2.0/awb/tasks/debugging/DB-006.yaml +73 -0
  64. awb-0.2.0/awb/tasks/debugging/DB-007.yaml +86 -0
  65. awb-0.2.0/awb/tasks/feature-addition/FA-001.yaml +99 -0
  66. awb-0.2.0/awb/tasks/feature-addition/FA-002.yaml +110 -0
  67. awb-0.2.0/awb/tasks/feature-addition/FA-003.yaml +107 -0
  68. awb-0.2.0/awb/tasks/feature-addition/FA-004.yaml +124 -0
  69. awb-0.2.0/awb/tasks/feature-addition/FA-005.yaml +143 -0
  70. awb-0.2.0/awb/tasks/feature-addition/FA-006.yaml +124 -0
  71. awb-0.2.0/awb/tasks/feature-addition/FA-007.yaml +128 -0
  72. awb-0.2.0/awb/tasks/feature-addition/FA-008.yaml +164 -0
  73. awb-0.2.0/awb/tasks/legacy-code/LC-001.yaml +104 -0
  74. awb-0.2.0/awb/tasks/legacy-code/LC-002.yaml +124 -0
  75. awb-0.2.0/awb/tasks/legacy-code/LC-003.yaml +155 -0
  76. awb-0.2.0/awb/tasks/legacy-code/LC-004.yaml +129 -0
  77. awb-0.2.0/awb/tasks/legacy-code/LC-005.yaml +142 -0
  78. awb-0.2.0/awb/tasks/legacy-code/LC-006.yaml +174 -0
  79. awb-0.2.0/awb/tasks/legacy-code/LC-007.yaml +167 -0
  80. awb-0.2.0/awb/tasks/legacy-code/LC-008.yaml +150 -0
  81. awb-0.2.0/awb/tasks/legacy-code/LC-009.yaml +138 -0
  82. awb-0.2.0/awb/tasks/legacy-code/LC-010.yaml +157 -0
  83. awb-0.2.0/awb/tasks/multi-file/MF-001.yaml +80 -0
  84. awb-0.2.0/awb/tasks/multi-file/MF-002.yaml +106 -0
  85. awb-0.2.0/awb/tasks/multi-file/MF-003.yaml +100 -0
  86. awb-0.2.0/awb/tasks/multi-file/MF-004.yaml +85 -0
  87. awb-0.2.0/awb/tasks/multi-file/MF-005.yaml +102 -0
  88. awb-0.2.0/awb/tasks/multi-file/MF-006.yaml +117 -0
  89. awb-0.2.0/awb/tasks/multi-file/MF-007.yaml +101 -0
  90. awb-0.2.0/awb/tasks/multi-file/MF-008.yaml +86 -0
  91. awb-0.2.0/awb/tasks/refactoring/RF-001.yaml +78 -0
  92. awb-0.2.0/awb/tasks/refactoring/RF-002.yaml +87 -0
  93. awb-0.2.0/awb/tasks/refactoring/RF-003.yaml +113 -0
  94. awb-0.2.0/awb/tasks/refactoring/RF-004.yaml +103 -0
  95. awb-0.2.0/awb/tasks/refactoring/RF-005.yaml +93 -0
  96. awb-0.2.0/awb/tasks/refactoring/RF-006.yaml +91 -0
  97. awb-0.2.0/awb/tasks/refactoring/RF-007.yaml +102 -0
  98. awb-0.2.0/awb/tasks/refactoring/RF-008.yaml +108 -0
  99. awb-0.2.0/awb/tasks/refactoring/RF-009.yaml +93 -0
  100. awb-0.2.0/awb/tasks/refactoring/RF-010.yaml +97 -0
  101. awb-0.2.0/awb/tasks/schema.json +123 -0
  102. awb-0.2.0/awb/verification/__init__.py +0 -0
  103. awb-0.2.0/awb/verification/code_review_scorer.py +47 -0
  104. awb-0.2.0/awb/verification/diff_analyzer.py +54 -0
  105. awb-0.2.0/awb/verification/lint_checker.py +62 -0
  106. awb-0.2.0/awb/verification/partial_credit.py +49 -0
  107. awb-0.2.0/awb/verification/security_scanner.py +76 -0
  108. awb-0.2.0/awb/verification/test_runner.py +41 -0
  109. awb-0.2.0/awb/workflow/__init__.py +0 -0
  110. awb-0.2.0/awb/workflow/descriptor.py +126 -0
  111. awb-0.2.0/awb/workflow/exporter.py +60 -0
  112. awb-0.2.0/awb/workflow/loader.py +37 -0
  113. awb-0.2.0/pyproject.toml +84 -0
awb-0.2.0/.gitignore ADDED
@@ -0,0 +1,19 @@
+ __pycache__/
+ *.pyc
+ *.egg-info/
+ *.eggs/
+ dist/
+ build/
+ .ruff_cache/
+ .pytest_cache/
+ .claude/
+ .planning/
+ .venv/
+ venv/
+ results/runs/
+ !results/examples/
+ leaderboard/output/
+ analysis/
+ *.jsonl
+ .env
+ .DS_Store
awb-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Xavier Puspus
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
awb-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,298 @@
+ Metadata-Version: 2.4
+ Name: awb
+ Version: 0.2.0
+ Summary: Benchmark harness measuring AI coding tool+workflow performance, not just model capability
+ Project-URL: Homepage, https://github.com/xmpuspus/ai-workflow-benchmark
+ Project-URL: Repository, https://github.com/xmpuspus/ai-workflow-benchmark
+ Project-URL: Documentation, https://github.com/xmpuspus/ai-workflow-benchmark/blob/main/METHODOLOGY.md
+ Project-URL: Bug Tracker, https://github.com/xmpuspus/ai-workflow-benchmark/issues
+ Author: Xavier Puspus
+ License-Expression: MIT
+ License-File: LICENSE
+ Keywords: ai-coding,benchmark,claude-code,developer-tools,swe-bench,workflow
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Software Development :: Testing
+ Requires-Python: >=3.11
+ Requires-Dist: click<9,>=8.1.0
+ Requires-Dist: jinja2<4,>=3.1.0
+ Requires-Dist: jsonschema<5,>=4.20.0
+ Requires-Dist: pyyaml<7,>=6.0
+ Requires-Dist: rich<14,>=13.0.0
+ Requires-Dist: tabulate<1,>=0.9.0
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio<1,>=0.23.0; extra == 'dev'
+ Requires-Dist: pytest<9,>=8.0.0; extra == 'dev'
+ Requires-Dist: ruff<1,>=0.3.0; extra == 'dev'
+ Provides-Extra: publish
+ Requires-Dist: build>=1.0.0; extra == 'publish'
+ Requires-Dist: twine>=5.0.0; extra == 'publish'
+ Provides-Extra: stats
+ Requires-Dist: numpy<3,>=1.26.0; extra == 'stats'
+ Requires-Dist: scipy<2,>=1.12.0; extra == 'stats'
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+ <h1>AI Workflow Benchmark (AWB)</h1>
+ <p><strong>Measure AI coding tool+workflow performance, not just model capability.</strong></p>
+ <p>
+ <a href="https://pypi.org/project/awb/"><img src="https://img.shields.io/pypi/v/awb" alt="PyPI"></a>
+ <a href="https://github.com/xmpuspus/ai-workflow-benchmark/actions"><img src="https://img.shields.io/github/actions/workflow/status/xmpuspus/ai-workflow-benchmark/test.yml" alt="Tests"></a>
+ <img src="https://img.shields.io/badge/tasks-60-blue" alt="Tasks">
+ <img src="https://img.shields.io/badge/python-3.11%2B-blue" alt="Python">
+ <a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License"></a>
+ </p>
+ </div>
+
+ ---
+
+ ## Why This Exists
+
+ SWE-bench tests models. AWB tests workflows. The same model running vanilla Claude Code vs. a purpose-built setup with a tuned CLAUDE.md, hooks, and structured agents produces meaningfully different results on real engineering tasks. No existing benchmark captures that gap — they all evaluate the model in isolation.
+
+ AWB benchmarks the full stack: **tool + configuration + workflow + model**, together, on 60 tasks drawn from real open-source repositories.
+
+ ## Quick Start
+
+ ```bash
+ pip install awb
+
+ awb quickstart                  # verify your setup
+ awb run --runs 3                # full benchmark (3 runs each for stable scores)
+ awb gap results/runs/<run_dir>/ # analyze capability gaps
+ ```
+
+ ## How It Works
+
+ ```
+ Clone repo at pinned SHA
+ → Run setup commands
+ → Capture baseline lint/security counts
+ → Execute tool with task prompt
+ → Run test suite + partial credit rubric
+ → Sigmoid-normalize 7 metrics
+ → Produce weighted composite + capability profile
+ ```
+
+ Each task starts from a fresh `git clone` at a pinned commit. Every tool gets the same prompt, the same timeout, and the same verification suite. Results are scored with sigmoid normalization so scores are never negative and never collapse at the boundary.
+
+ ## Scoring System
+
+ Seven dimensions, sigmoid-normalized with per-task baselines derived from difficulty:
+
+ | Dimension | Weight | What It Measures |
+ |-----------|--------|-----------------|
+ | Correctness | 55% | Pass/fail (60%) + partial credit rubric (40%) |
+ | Cost efficiency | 15% | Estimated USD per task |
+ | Speed | 10% | Wall-clock seconds vs. estimated task time |
+ | Code quality | 10% | Lint warning delta (pre vs. post) |
+ | Reliability | 5% | Pre-existing tests broken by the change |
+ | Security | 3% | New security issues introduced |
+ | Efficiency | 2% | Tool turns used vs. task max |
+
+ **Sigmoid curve:** `score = 100 / (1 + exp(k * (value - baseline)))`
+
+ - Optimal performance (excellent) → ~95
+ - Baseline performance (adequate) → ~50
+ - Worse than baseline → smooth decay toward 0, never negative
+
+ **Difficulty-weighted aggregation:** hard tasks count 2.5×, medium 1.5×, easy 1.0×. A tool that solves hard tasks outscores one that only solves easy ones, even if the latter completes more tasks overall.
+
+ **Per-task baselines by difficulty:**
+
+ | Metric | Easy | Medium | Hard |
+ |--------|------|--------|------|
+ | Cost optimal / baseline | $0.05 / $0.30 | $0.20 / $1.00 | $1.00 / $3.00 |
+ | Speed | 50% / 100% of estimated_minutes | same | same |
+ | Iterations | 3 / max_iters | 8 / max_iters | 15 / max_iters |
+
+ ## The 60 Tasks
+
+ Real open-source repos, pinned to release tag SHAs. Setup runs in under 15 seconds via venv + pip (Python) or npm (TypeScript).
+
+ | Category | Count | Easy / Med / Hard | What It Tests |
+ |----------|-------|-------------------|---------------|
+ | bug-fix | 10 | 3 / 3 / 2 | Root cause analysis, None handling, async bugs, race conditions |
+ | feature-addition | 8 | 2 / 3 / 2 | Convention adherence, middleware patterns, cross-cutting features |
+ | refactoring | 10 | 2 / 3 / 2 | Multi-file consistency, pattern extraction, async migration |
+ | code-review | 7 | 2 / 3 / 1 | Security awareness, OWASP, concurrency bugs, CORS/auth |
+ | debugging | 7 | 2 / 1 / 3 | Hypothesis testing, connection leaks, pipeline tracing |
+ | multi-file | 8 | 0 / 3 / 3 | Cross-module architecture, plugin systems, auth chains |
+ | legacy-code | 10 | 4 / 4 / 2 | Modernization, migration, dead code removal, type annotations |
+
+ **Repos used:** FastAPI, httpx, Flask, Starlette, Click, Pydantic, SQLAlchemy 2.0, Hono
+
+ **Task IDs:**
+ `BF-001–011` · `FA-001–008` · `RF-001–010` · `CR-001–007` · `DB-001–007` · `MF-001–008` · `LC-001–010`
+
+ ## Capability Profiles
+
+ Each task maps to 1–3 capabilities, producing a radar chart of tool strengths:
+
+ | Capability | Tasks | What It Measures |
+ |------------|-------|-----------------|
+ | code_comprehension | 27 | Understanding existing code before modifying |
+ | framework_knowledge | 26 | Knowing API patterns (Pydantic v2, async SQLAlchemy, etc.) |
+ | refactoring_discipline | 23 | Changing code without breaking behavior |
+ | multi_file_reasoning | 20 | Coordinating changes across multiple files |
+ | bug_diagnosis | 17 | Structured root cause analysis |
+ | test_writing | 8 | Writing correct, meaningful tests |
+ | security_awareness | 8 | Identifying and fixing vulnerabilities |
+ | cost_discipline | derived | Token efficiency across all tasks |
+
+ Example `awb gap` output:
+
+ ```
+ Capability Profile
+ ------------------
+ code_comprehension     ████████████████████ 82.4 (n=27, conf=high)
+ framework_knowledge    ████████████████░░░░ 68.1 (n=26, conf=high)
+ refactoring_discipline ████████████████░░░░ 65.3 (n=23, conf=high)
+ multi_file_reasoning   ████████████░░░░░░░░ 51.2 (n=20, conf=high)
+ bug_diagnosis          ███████████████░░░░░ 63.7 (n=17, conf=med)
+ test_writing           ██████████░░░░░░░░░░ 44.1 (n=8, conf=low)
+ security_awareness     █████████████░░░░░░░ 55.8 (n=8, conf=low)
+
+ Systematic Patterns
+ -------------------
+ - Fails 70%+ of multi_file_reasoning tasks → consider multi-agent workflows
+ - Token spend on failed hard tasks: $4.20 → add early-exit heuristics
+ - No failures on easy tasks → baseline is solid
+
+ Top Suggestions
+ ---------------
+ 1. Enable subagent mode for tasks spanning >3 files (impact: high)
+ 2. Add repo-level CLAUDE.md with architecture overview (impact: medium)
+ 3. Use --think flag for debugging tasks (impact: medium)
+ ```
+
+ ## CLI Reference
+
+ | Command | Description |
+ |---------|-------------|
+ | `awb run [tool] [options]` | Run benchmark tasks |
+ | `awb gap <run_dir>` | Analyze capability gaps and generate improvement suggestions |
+ | `awb compare <run1> <run2>` | Compare two runs with significance testing |
+ | `awb export <run_dir> -o file.json` | Export results in external submission format |
+ | `awb submit <file.json>` | Validate and display an external submission |
+ | `awb compare-submissions <a.json> <b.json>` | Cross-tool comparison with statistics |
+ | `awb quickstart` | Verify setup: tools available, tasks load, validation passes |
+ | `awb info <task_id>` | Show task details |
+ | `awb tools` | List registered adapters and availability |
+ | `awb validate` | Validate all task YAMLs against schema |
+ | `awb leaderboard` | Generate HTML leaderboard from run results |
+ | `awb workflow <subcommand>` | Export, validate, diff, or init workflow descriptors |
+
+ **Common options for `awb run`:**
+
+ ```bash
+ awb run                            # all tools, all tasks, 3 runs
+ awb run claude-code-custom         # single tool
+ awb run -t BF-001                  # single task
+ awb run --category legacy-code     # filter by category
+ awb run --difficulty hard          # filter by difficulty
+ awb run --capability bug_diagnosis # filter by capability
+ awb run --runs 1 --dry-run         # preview without executing
+ ```
+
+ ## Adding Tasks
+
+ Tasks live in `awb/tasks/<category>/`. Copy `awb/tasks/_template.yaml`:
+
+ ```yaml
+ id: BF-012
+ category: bug-fix
+ title: "Fix response_model silently dropping extra fields in FastAPI"
+ difficulty: easy
+ estimated_minutes: 15
+ languages: [python]
+ capabilities: [framework_knowledge, test_writing]
+
+ repo:
+   url: "https://github.com/tiangolo/fastapi"
+   commit: "628c34e0"
+   setup_commands:
+     - "python3 -m venv .venv && source .venv/bin/activate && pip install -e '.[all]'"
+
+ issue:
+   description: |
+     The endpoint's response_model silently strips extra fields...
+   files_to_examine:
+     - "fastapi/routing.py"
+
+ verification:
+   test_commands:
+     - "source .venv/bin/activate && python3 -m pytest tests/test_extra_fields.py -v"
+   partial_credit:
+     - criterion: "Uses Pydantic v2 ConfigDict"
+       points: 50
+       check: "grep -q 'ConfigDict' tests/test_extra_fields.py"
+     - criterion: "Tests pass"
+       points: 50
+       check: "source .venv/bin/activate && python3 -m pytest tests/test_extra_fields.py -v"
+
+ constraints:
+   max_iterations: 20
+   timeout_seconds: 1800
+ ```
+
+ Run `awb validate` to check your task before opening a PR. Full guide: [CONTRIBUTING.md](CONTRIBUTING.md)
+
+ ## Adding Tools
+
+ Implement the `ToolAdapter` ABC in `awb/adapters/`:
+
+ ```python
+ from awb.adapters.base import ToolAdapter, ToolResult
+ from pathlib import Path
+
+ class MyToolAdapter(ToolAdapter):
+     name = "my-tool"
+     display_name = "My Tool"
+
+     async def execute(self, prompt: str, workspace: Path,
+                       max_turns: int = 20, timeout_seconds: int = 1800) -> ToolResult:
+         ...
+
+     def check_available(self) -> bool:
+         ...
+
+     def get_config_hash(self) -> str:
+         ...
+ ```
+
+ Register in `awb/adapters/registry.py` and add an entry point in `pyproject.toml`.
+
+ ## External Submissions
+
+ Anyone can share results using the submission format defined in `results/submission-schema.json`:
+
+ ```bash
+ awb run --runs 3
+ awb export results/runs/<run_dir>/ -o my-results.json
+ awb submit my-results.json             # validate locally
+ awb compare-submissions a.json b.json  # compare with significance testing
+ ```
+
+ The format captures tool version, model, hardware class, and per-task run results. Hardware classes (e.g., `apple_m5_24gb`, `linux_x86_16gb`) enable fair speed comparisons — speed is only compared within the same tier.
+
+ ## Statistical Framework
+
+ - Confidence intervals via t-distribution (no scipy required for core scoring)
+ - Significance testing via sign test for paired tool comparison
+ - Integrity checks: contamination detection (completions <10s flagged), variance anomalies (identical times/tokens across runs)
+ - Weight profiles: `default`, `correctness_focused`, `production` (see `awb/scoring/weights.yaml`)
+
+ ## Links
+
+ - [Methodology](METHODOLOGY.md) — Fair comparison principles, metric definitions, known limitations
+ - [Architecture](ARCHITECTURE.md) — Module graph, data models, pipeline diagrams
+ - [Contributing](CONTRIBUTING.md) — Adding tasks, tools, and submitting results
+ - [PyPI](https://pypi.org/project/awb/) — `pip install awb`
+
+ ## License
+
+ MIT
awb-0.2.0/README.md ADDED
@@ -0,0 +1,261 @@
+ <div align="center">
+ <h1>AI Workflow Benchmark (AWB)</h1>
+ <p><strong>Measure AI coding tool+workflow performance, not just model capability.</strong></p>
+ <p>
+ <a href="https://pypi.org/project/awb/"><img src="https://img.shields.io/pypi/v/awb" alt="PyPI"></a>
+ <a href="https://github.com/xmpuspus/ai-workflow-benchmark/actions"><img src="https://img.shields.io/github/actions/workflow/status/xmpuspus/ai-workflow-benchmark/test.yml" alt="Tests"></a>
+ <img src="https://img.shields.io/badge/tasks-60-blue" alt="Tasks">
+ <img src="https://img.shields.io/badge/python-3.11%2B-blue" alt="Python">
+ <a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License"></a>
+ </p>
+ </div>
+
+ ---
+
+ ## Why This Exists
+
+ SWE-bench tests models. AWB tests workflows. The same model running vanilla Claude Code vs. a purpose-built setup with a tuned CLAUDE.md, hooks, and structured agents produces meaningfully different results on real engineering tasks. No existing benchmark captures that gap — they all evaluate the model in isolation.
+
+ AWB benchmarks the full stack: **tool + configuration + workflow + model**, together, on 60 tasks drawn from real open-source repositories.
+
+ ## Quick Start
+
+ ```bash
+ pip install awb
+
+ awb quickstart                  # verify your setup
+ awb run --runs 3                # full benchmark (3 runs each for stable scores)
+ awb gap results/runs/<run_dir>/ # analyze capability gaps
+ ```
+
+ ## How It Works
+
+ ```
+ Clone repo at pinned SHA
+ → Run setup commands
+ → Capture baseline lint/security counts
+ → Execute tool with task prompt
+ → Run test suite + partial credit rubric
+ → Sigmoid-normalize 7 metrics
+ → Produce weighted composite + capability profile
+ ```
+
+ Each task starts from a fresh `git clone` at a pinned commit. Every tool gets the same prompt, the same timeout, and the same verification suite. Results are scored with sigmoid normalization so scores are never negative and never collapse at the boundary.
+
+ ## Scoring System
+
+ Seven dimensions, sigmoid-normalized with per-task baselines derived from difficulty:
+
+ | Dimension | Weight | What It Measures |
+ |-----------|--------|-----------------|
+ | Correctness | 55% | Pass/fail (60%) + partial credit rubric (40%) |
+ | Cost efficiency | 15% | Estimated USD per task |
+ | Speed | 10% | Wall-clock seconds vs. estimated task time |
+ | Code quality | 10% | Lint warning delta (pre vs. post) |
+ | Reliability | 5% | Pre-existing tests broken by the change |
+ | Security | 3% | New security issues introduced |
+ | Efficiency | 2% | Tool turns used vs. task max |
+
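+ As a sketch, the composite is a weighted sum of the seven sigmoid-normalized dimension scores. The dimension keys below are illustrative; the real definitions live in `awb/scoring/composite.py` and `awb/scoring/weights.yaml`:
+
+ ```python
+ WEIGHTS = {"correctness": 0.55, "cost": 0.15, "speed": 0.10, "quality": 0.10,
+            "reliability": 0.05, "security": 0.03, "efficiency": 0.02}
+
+ def composite(scores: dict[str, float]) -> float:
+     """Weighted sum of the seven normalized dimension scores (each 0-100)."""
+     return sum(WEIGHTS[dim] * scores[dim] for dim in WEIGHTS)
+
+ def correctness(tests_passed: bool, rubric_points: float) -> float:
+     """Blend pass/fail (60%) with the partial-credit rubric (40%), per the table."""
+     return 0.6 * (100.0 if tests_passed else 0.0) + 0.4 * rubric_points
+ ```
+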
+ **Sigmoid curve:** `score = 100 / (1 + exp(k * (value - baseline)))`
+
+ - Optimal performance (excellent) → ~95
+ - Baseline performance (adequate) → ~50
+ - Worse than baseline → smooth decay toward 0, never negative
+
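+ A minimal sketch of that curve, assuming `k` is calibrated so the optimal value lands at ~95 and the baseline at exactly 50 (see `awb/scoring/normalize.py` for the real implementation):
+
+ ```python
+ import math
+
+ def sigmoid_score(value: float, optimal: float, baseline: float) -> float:
+     """Normalize a lower-is-better raw value (USD, seconds, turns) to 0-100."""
+     # Solving 95 = 100 / (1 + exp(k * (optimal - baseline))) for k gives
+     # k = ln(19) / (baseline - optimal), so k > 0 whenever optimal < baseline.
+     k = math.log(19) / (baseline - optimal)
+     return 100 / (1 + math.exp(k * (value - baseline)))
+
+ # Easy-task cost baselines from the table below: optimal $0.05, baseline $0.30
+ sigmoid_score(0.05, 0.05, 0.30)  # -> 95.0
+ sigmoid_score(0.30, 0.05, 0.30)  # -> 50.0
+ sigmoid_score(2.00, 0.05, 0.30)  # -> ~0, smooth decay, never negative
+ ```
+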
+ **Difficulty-weighted aggregation:** hard tasks count 2.5×, medium 1.5×, easy 1.0×. A tool that solves hard tasks outscores one that only solves easy ones, even if the latter completes more tasks overall.
+
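+ Concretely, that aggregation is a weighted mean (an illustrative sketch, not the exact implementation):
+
+ ```python
+ DIFFICULTY_WEIGHT = {"easy": 1.0, "medium": 1.5, "hard": 2.5}
+
+ def aggregate(tasks: list[tuple[str, float]]) -> float:
+     """Difficulty-weighted mean over (difficulty, composite score) pairs."""
+     total_weight = sum(DIFFICULTY_WEIGHT[d] for d, _ in tasks)
+     return sum(DIFFICULTY_WEIGHT[d] * score for d, score in tasks) / total_weight
+
+ # Two solved hard tasks carry 2 * 2.5 = 5.0 weight; three solved easy
+ # tasks carry only 3 * 1.0 = 3.0, so the hard-task solver ranks higher.
+ ```
+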
+ **Per-task baselines by difficulty:**
+
+ | Metric | Easy | Medium | Hard |
+ |--------|------|--------|------|
+ | Cost optimal / baseline | $0.05 / $0.30 | $0.20 / $1.00 | $1.00 / $3.00 |
+ | Speed | 50% / 100% of estimated_minutes | same | same |
+ | Iterations | 3 / max_iters | 8 / max_iters | 15 / max_iters |
+
+ ## The 60 Tasks
+
+ Real open-source repos, pinned to release tag SHAs. Setup runs in under 15 seconds via venv + pip (Python) or npm (TypeScript).
+
+ | Category | Count | Easy / Med / Hard | What It Tests |
+ |----------|-------|-------------------|---------------|
+ | bug-fix | 10 | 3 / 3 / 2 | Root cause analysis, None handling, async bugs, race conditions |
+ | feature-addition | 8 | 2 / 3 / 2 | Convention adherence, middleware patterns, cross-cutting features |
+ | refactoring | 10 | 2 / 3 / 2 | Multi-file consistency, pattern extraction, async migration |
+ | code-review | 7 | 2 / 3 / 1 | Security awareness, OWASP, concurrency bugs, CORS/auth |
+ | debugging | 7 | 2 / 1 / 3 | Hypothesis testing, connection leaks, pipeline tracing |
+ | multi-file | 8 | 0 / 3 / 3 | Cross-module architecture, plugin systems, auth chains |
+ | legacy-code | 10 | 4 / 4 / 2 | Modernization, migration, dead code removal, type annotations |
+
+ **Repos used:** FastAPI, httpx, Flask, Starlette, Click, Pydantic, SQLAlchemy 2.0, Hono
+
+ **Task IDs:**
+ `BF-001–011` · `FA-001–008` · `RF-001–010` · `CR-001–007` · `DB-001–007` · `MF-001–008` · `LC-001–010`
+
+ ## Capability Profiles
+
+ Each task maps to 1–3 capabilities, producing a radar chart of tool strengths:
+
+ | Capability | Tasks | What It Measures |
+ |------------|-------|-----------------|
+ | code_comprehension | 27 | Understanding existing code before modifying |
+ | framework_knowledge | 26 | Knowing API patterns (Pydantic v2, async SQLAlchemy, etc.) |
+ | refactoring_discipline | 23 | Changing code without breaking behavior |
+ | multi_file_reasoning | 20 | Coordinating changes across multiple files |
+ | bug_diagnosis | 17 | Structured root cause analysis |
+ | test_writing | 8 | Writing correct, meaningful tests |
+ | security_awareness | 8 | Identifying and fixing vulnerabilities |
+ | cost_discipline | derived | Token efficiency across all tasks |
+
+ Example `awb gap` output:
+
+ ```
+ Capability Profile
+ ------------------
+ code_comprehension     ████████████████████ 82.4 (n=27, conf=high)
+ framework_knowledge    ████████████████░░░░ 68.1 (n=26, conf=high)
+ refactoring_discipline ████████████████░░░░ 65.3 (n=23, conf=high)
+ multi_file_reasoning   ████████████░░░░░░░░ 51.2 (n=20, conf=high)
+ bug_diagnosis          ███████████████░░░░░ 63.7 (n=17, conf=med)
+ test_writing           ██████████░░░░░░░░░░ 44.1 (n=8, conf=low)
+ security_awareness     █████████████░░░░░░░ 55.8 (n=8, conf=low)
+
+ Systematic Patterns
+ -------------------
+ - Fails 70%+ of multi_file_reasoning tasks → consider multi-agent workflows
+ - Token spend on failed hard tasks: $4.20 → add early-exit heuristics
+ - No failures on easy tasks → baseline is solid
+
+ Top Suggestions
+ ---------------
+ 1. Enable subagent mode for tasks spanning >3 files (impact: high)
+ 2. Add repo-level CLAUDE.md with architecture overview (impact: medium)
+ 3. Use --think flag for debugging tasks (impact: medium)
+ ```
+
+ ## CLI Reference
+
+ | Command | Description |
+ |---------|-------------|
+ | `awb run [tool] [options]` | Run benchmark tasks |
+ | `awb gap <run_dir>` | Analyze capability gaps and generate improvement suggestions |
+ | `awb compare <run1> <run2>` | Compare two runs with significance testing |
+ | `awb export <run_dir> -o file.json` | Export results in external submission format |
+ | `awb submit <file.json>` | Validate and display an external submission |
+ | `awb compare-submissions <a.json> <b.json>` | Cross-tool comparison with statistics |
+ | `awb quickstart` | Verify setup: tools available, tasks load, validation passes |
+ | `awb info <task_id>` | Show task details |
+ | `awb tools` | List registered adapters and availability |
+ | `awb validate` | Validate all task YAMLs against schema |
+ | `awb leaderboard` | Generate HTML leaderboard from run results |
+ | `awb workflow <subcommand>` | Export, validate, diff, or init workflow descriptors |
+
+ **Common options for `awb run`:**
+
+ ```bash
+ awb run                            # all tools, all tasks, 3 runs
+ awb run claude-code-custom         # single tool
+ awb run -t BF-001                  # single task
+ awb run --category legacy-code     # filter by category
+ awb run --difficulty hard          # filter by difficulty
+ awb run --capability bug_diagnosis # filter by capability
+ awb run --runs 1 --dry-run         # preview without executing
+ ```
+
+ ## Adding Tasks
+
+ Tasks live in `awb/tasks/<category>/`. Copy `awb/tasks/_template.yaml`:
+
+ ```yaml
+ id: BF-012
+ category: bug-fix
+ title: "Fix response_model silently dropping extra fields in FastAPI"
+ difficulty: easy
+ estimated_minutes: 15
+ languages: [python]
+ capabilities: [framework_knowledge, test_writing]
+
+ repo:
+   url: "https://github.com/tiangolo/fastapi"
+   commit: "628c34e0"
+   setup_commands:
+     - "python3 -m venv .venv && source .venv/bin/activate && pip install -e '.[all]'"
+
+ issue:
+   description: |
+     The endpoint's response_model silently strips extra fields...
+   files_to_examine:
+     - "fastapi/routing.py"
+
+ verification:
+   test_commands:
+     - "source .venv/bin/activate && python3 -m pytest tests/test_extra_fields.py -v"
+   partial_credit:
+     - criterion: "Uses Pydantic v2 ConfigDict"
+       points: 50
+       check: "grep -q 'ConfigDict' tests/test_extra_fields.py"
+     - criterion: "Tests pass"
+       points: 50
+       check: "source .venv/bin/activate && python3 -m pytest tests/test_extra_fields.py -v"
+
+ constraints:
+   max_iterations: 20
+   timeout_seconds: 1800
+ ```
+
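+ Each partial-credit `check` is a shell command judged by its exit code. A sketch of how such a rubric could be scored (the package's real logic lives in `awb/verification/partial_credit.py`):
+
+ ```python
+ import subprocess
+
+ def rubric_score(workspace: str, partial_credit: list[dict]) -> float:
+     """Award each criterion's points when its check command exits 0."""
+     earned = 0.0
+     for item in partial_credit:
+         result = subprocess.run(item["check"], shell=True, cwd=workspace,
+                                 capture_output=True)
+         if result.returncode == 0:
+             earned += item["points"]
+     return earned  # 0-100 when criterion points sum to 100
+ ```
+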
+ Run `awb validate` to check your task before opening a PR. Full guide: [CONTRIBUTING.md](CONTRIBUTING.md)
+
+ ## Adding Tools
+
+ Implement the `ToolAdapter` ABC in `awb/adapters/`:
+
+ ```python
+ from awb.adapters.base import ToolAdapter, ToolResult
+ from pathlib import Path
+
+ class MyToolAdapter(ToolAdapter):
+     name = "my-tool"
+     display_name = "My Tool"
+
+     async def execute(self, prompt: str, workspace: Path,
+                       max_turns: int = 20, timeout_seconds: int = 1800) -> ToolResult:
+         ...
+
+     def check_available(self) -> bool:
+         ...
+
+     def get_config_hash(self) -> str:
+         ...
+ ```
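+
+ For a CLI-driven tool, `execute` can wrap a subprocess. A minimal sketch (the `my-tool` binary and its flags are hypothetical stand-ins):
+
+ ```python
+ import asyncio
+ import hashlib
+ import shutil
+ from pathlib import Path
+
+ from awb.adapters.base import ToolAdapter, ToolResult
+
+ class MyCliAdapter(ToolAdapter):
+     name = "my-tool"
+     display_name = "My Tool"
+
+     async def execute(self, prompt: str, workspace: Path,
+                       max_turns: int = 20, timeout_seconds: int = 1800) -> ToolResult:
+         # Hypothetical invocation; substitute your tool's real flags.
+         proc = await asyncio.create_subprocess_exec(
+             "my-tool", "--prompt", prompt, "--max-turns", str(max_turns),
+             cwd=workspace,
+             stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT)
+         try:
+             out, _ = await asyncio.wait_for(proc.communicate(), timeout_seconds)
+         except asyncio.TimeoutError:
+             proc.kill()
+             await proc.wait()
+             return ToolResult(success=False, raw_output="timed out", exit_code=-1)
+         return ToolResult(success=proc.returncode == 0,
+                           raw_output=out.decode(errors="replace"),
+                           exit_code=proc.returncode)
+
+     def check_available(self) -> bool:
+         return shutil.which("my-tool") is not None
+
+     def get_config_hash(self) -> str:
+         # Hash whatever configuration influences the tool's behavior.
+         return hashlib.sha256(b"my-tool:default-config").hexdigest()[:12]
+ ```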
+
+ Register in `awb/adapters/registry.py` and add an entry point in `pyproject.toml`.
+
+ ## External Submissions
+
+ Anyone can share results using the submission format defined in `results/submission-schema.json`:
+
+ ```bash
+ awb run --runs 3
+ awb export results/runs/<run_dir>/ -o my-results.json
+ awb submit my-results.json             # validate locally
+ awb compare-submissions a.json b.json  # compare with significance testing
+ ```
+
+ The format captures tool version, model, hardware class, and per-task run results. Hardware classes (e.g., `apple_m5_24gb`, `linux_x86_16gb`) enable fair speed comparisons — speed is only compared within the same tier.
+
+ ## Statistical Framework
+
+ - Confidence intervals via t-distribution (no scipy required for core scoring)
+ - Significance testing via sign test for paired tool comparison (sketched below)
+ - Integrity checks: contamination detection (completions <10s flagged), variance anomalies (identical times/tokens across runs)
+ - Weight profiles: `default`, `correctness_focused`, `production` (see `awb/scoring/weights.yaml`)
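+
+ A pure-stdlib sketch of that sign test (illustrative; awb's version lives in `awb/scoring/statistics.py`):
+
+ ```python
+ from math import comb
+
+ def sign_test_p(wins_a: int, wins_b: int) -> float:
+     """Two-sided sign test on paired per-task results, ties dropped."""
+     n = wins_a + wins_b
+     k = min(wins_a, wins_b)
+     # Double the one-sided tail P(X <= k) under Binomial(n, 1/2).
+     p = 2 * sum(comb(n, i) for i in range(k + 1)) / 2 ** n
+     return min(p, 1.0)
+
+ sign_test_p(18, 6)  # tool A wins 18 of 24 differing tasks -> p ≈ 0.023
+ ```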
+
+ ## Links
+
+ - [Methodology](METHODOLOGY.md) — Fair comparison principles, metric definitions, known limitations
+ - [Architecture](ARCHITECTURE.md) — Module graph, data models, pipeline diagrams
+ - [Contributing](CONTRIBUTING.md) — Adding tasks, tools, and submitting results
+ - [PyPI](https://pypi.org/project/awb/) — `pip install awb`
+
+ ## License
+
+ MIT
awb-0.2.0/awb/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """AI Workflow Benchmark - measure tool+workflow performance."""
+
+ __version__ = "0.2.0"
awb-0.2.0/awb/adapters/__init__.py ADDED
File without changes
awb-0.2.0/awb/adapters/aider.py ADDED
@@ -0,0 +1,28 @@
+ """Aider CLI adapter (placeholder)."""
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from awb.adapters.base import ToolAdapter, ToolResult
+
+
+ class AiderAdapter(ToolAdapter):
+     name = "aider"
+     display_name = "Aider"
+
+     async def execute(
+         self,
+         prompt: str,
+         workspace: Path,
+         max_turns: int = 20,
+         timeout_seconds: int = 1800,
+     ) -> ToolResult:
+         raise NotImplementedError(
+             "Aider adapter not yet implemented - contributions welcome"
+         )
+
+     def check_available(self) -> bool:
+         raise NotImplementedError("Aider adapter not yet implemented")
+
+     def get_config_hash(self) -> str:
+         return "n/a"
awb-0.2.0/awb/adapters/base.py ADDED
@@ -0,0 +1,50 @@
+ """Abstract base for tool adapters."""
+ from __future__ import annotations
+
+ import abc
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+
+ @dataclass
+ class ToolResult:
+     """Normalized output from a tool execution."""
+
+     success: bool
+     raw_output: str = ""
+     stream_events: list[dict] = field(default_factory=list)
+     exit_code: int = 0
+     tool_version: str = ""
+     model: str = ""
+
+
+ class ToolAdapter(abc.ABC):
+     """Base class for AI coding tool adapters."""
+
+     name: str  # e.g. "claude-code-vanilla"
+     display_name: str  # e.g. "Claude Code (Vanilla)"
+
+     @abc.abstractmethod
+     async def execute(
+         self,
+         prompt: str,
+         workspace: Path,
+         max_turns: int = 20,
+         timeout_seconds: int = 1800,
+     ) -> ToolResult:
+         """Run the tool against a task in the given workspace."""
+         ...
+
+     @abc.abstractmethod
+     def check_available(self) -> bool:
+         """Return True if this tool is installed and usable."""
+         ...
+
+     @abc.abstractmethod
+     def get_config_hash(self) -> str:
+         """Return a hash of the tool's configuration for reproducibility."""
+         ...
+
+     def get_version(self) -> str:
+         """Return tool version string."""
+         return "unknown"