@athenaflow/plugin-web-bench 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/.claude-plugin/plugin.json +16 -0
  2. package/.codex-plugin/plugin.json +16 -0
  3. package/.mcp.json +19 -0
  4. package/dist/1.0.3/.agents/plugins/marketplace.json +14 -0
  5. package/dist/1.0.3/claude/plugin/.claude-plugin/plugin.json +16 -0
  6. package/dist/1.0.3/claude/plugin/.mcp.json +19 -0
  7. package/dist/1.0.3/claude/plugin/package.json +9 -0
  8. package/dist/1.0.3/claude/plugin/skills/evaluate-task/SKILL.md +173 -0
  9. package/dist/1.0.3/claude/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  10. package/dist/1.0.3/claude/plugin/skills/execute-task/SKILL.md +133 -0
  11. package/dist/1.0.3/claude/plugin/skills/execute-task/agents/claude.yaml +2 -0
  12. package/dist/1.0.3/claude/plugin/skills/generate-report/SKILL.md +204 -0
  13. package/dist/1.0.3/claude/plugin/skills/generate-report/agents/claude.yaml +2 -0
  14. package/dist/1.0.3/claude/plugin/skills/load-dataset/SKILL.md +209 -0
  15. package/dist/1.0.3/claude/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  16. package/dist/1.0.3/claude/plugin/skills/run-benchmark/SKILL.md +92 -0
  17. package/dist/1.0.3/claude/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  18. package/dist/1.0.3/codex/plugin/.codex-plugin/plugin.json +16 -0
  19. package/dist/1.0.3/codex/plugin/.mcp.json +19 -0
  20. package/dist/1.0.3/codex/plugin/package.json +9 -0
  21. package/dist/1.0.3/codex/plugin/skills/evaluate-task/SKILL.md +173 -0
  22. package/dist/1.0.3/codex/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  23. package/dist/1.0.3/codex/plugin/skills/execute-task/SKILL.md +133 -0
  24. package/dist/1.0.3/codex/plugin/skills/execute-task/agents/claude.yaml +2 -0
  25. package/dist/1.0.3/codex/plugin/skills/generate-report/SKILL.md +204 -0
  26. package/dist/1.0.3/codex/plugin/skills/generate-report/agents/claude.yaml +2 -0
  27. package/dist/1.0.3/codex/plugin/skills/load-dataset/SKILL.md +209 -0
  28. package/dist/1.0.3/codex/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  29. package/dist/1.0.3/codex/plugin/skills/run-benchmark/SKILL.md +92 -0
  30. package/dist/1.0.3/codex/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  31. package/dist/1.0.3/release.json +18 -0
  32. package/dist/1.0.5/.agents/plugins/marketplace.json +14 -0
  33. package/dist/1.0.5/claude/plugin/.claude-plugin/plugin.json +16 -0
  34. package/dist/1.0.5/claude/plugin/.mcp.json +19 -0
  35. package/dist/1.0.5/claude/plugin/package.json +9 -0
  36. package/dist/1.0.5/claude/plugin/skills/evaluate-task/SKILL.md +173 -0
  37. package/dist/1.0.5/claude/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  38. package/dist/1.0.5/claude/plugin/skills/execute-task/SKILL.md +133 -0
  39. package/dist/1.0.5/claude/plugin/skills/execute-task/agents/claude.yaml +2 -0
  40. package/dist/1.0.5/claude/plugin/skills/generate-report/SKILL.md +204 -0
  41. package/dist/1.0.5/claude/plugin/skills/generate-report/agents/claude.yaml +2 -0
  42. package/dist/1.0.5/claude/plugin/skills/load-dataset/SKILL.md +209 -0
  43. package/dist/1.0.5/claude/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  44. package/dist/1.0.5/claude/plugin/skills/run-benchmark/SKILL.md +92 -0
  45. package/dist/1.0.5/claude/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  46. package/dist/1.0.5/codex/plugin/.codex-plugin/plugin.json +16 -0
  47. package/dist/1.0.5/codex/plugin/.mcp.json +19 -0
  48. package/dist/1.0.5/codex/plugin/package.json +9 -0
  49. package/dist/1.0.5/codex/plugin/skills/evaluate-task/SKILL.md +173 -0
  50. package/dist/1.0.5/codex/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  51. package/dist/1.0.5/codex/plugin/skills/execute-task/SKILL.md +133 -0
  52. package/dist/1.0.5/codex/plugin/skills/execute-task/agents/claude.yaml +2 -0
  53. package/dist/1.0.5/codex/plugin/skills/generate-report/SKILL.md +204 -0
  54. package/dist/1.0.5/codex/plugin/skills/generate-report/agents/claude.yaml +2 -0
  55. package/dist/1.0.5/codex/plugin/skills/load-dataset/SKILL.md +209 -0
  56. package/dist/1.0.5/codex/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  57. package/dist/1.0.5/codex/plugin/skills/run-benchmark/SKILL.md +92 -0
  58. package/dist/1.0.5/codex/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  59. package/dist/1.0.5/release.json +18 -0
  60. package/package.json +13 -0
  61. package/skills/evaluate-task/SKILL.md +173 -0
  62. package/skills/evaluate-task/agents/claude.yaml +2 -0
  63. package/skills/execute-task/SKILL.md +133 -0
  64. package/skills/execute-task/agents/claude.yaml +2 -0
  65. package/skills/generate-report/SKILL.md +204 -0
  66. package/skills/generate-report/agents/claude.yaml +2 -0
  67. package/skills/load-dataset/SKILL.md +209 -0
  68. package/skills/load-dataset/agents/claude.yaml +2 -0
  69. package/skills/run-benchmark/SKILL.md +92 -0
  70. package/skills/run-benchmark/agents/claude.yaml +3 -0
@@ -0,0 +1,204 @@
1
+ ---
2
+ name: generate-report
3
+ description: >
4
+ Aggregate WebBench benchmark results into a comprehensive evaluation report.
5
+ Triggers: "generate report", "create benchmark report", "summarize results",
6
+ "aggregate scores", "produce evaluation report".
7
+ Reads web-bench-results.jsonl, computes statistics by category/website/failure mode,
8
+ and writes web-bench-report.md with pass rates, timing, token usage, and analysis.
9
+ Does NOT execute or evaluate tasks — only aggregates existing results.
10
+ allowed-tools: Read Write Edit Bash Glob Grep
11
+ ---
12
+
13
+ # Generate WebBench Benchmark Report
14
+
15
+ Aggregate all results from `web-bench-results.jsonl` into a comprehensive markdown report.
16
+
17
+ ## Input
18
+
19
+ - **Results file:** `web-bench-results.jsonl` — one JSON line per completed task
20
+ - **Result line schema:**
21
+ ```json
22
+ {
23
+ "id": 42,
24
+ "url": "https://acehardware.com",
25
+ "category": "READ",
26
+ "task": "Navigate to...",
27
+ "score": 1.0,
28
+ "verdict": "PASS",
29
+ "reasoning": "Successfully extracted all specs",
30
+ "error": null,
31
+ "duration_ms": 34200,
32
+ "tokens_used": {"input": 12450, "output": 3200},
33
+ "timestamp": "2026-03-19T14:30:00Z"
34
+ }
35
+ ```
36
+
37
+ ## Report Generation
38
+
39
+ Use Node.js to compute all statistics:
40
+
41
+ ```bash
42
+ node -e "
43
+ const fs = require('fs');
44
+ const results = fs.readFileSync('web-bench-results.jsonl','utf-8').trim().split('\n')
45
+ .filter(l => l.trim()).map(JSON.parse);
46
+
47
+ const total = results.length;
48
+ const passed = results.filter(r => r.verdict === 'PASS').length;
49
+ const partial = results.filter(r => r.verdict === 'PARTIAL').length;
50
+ const failed = results.filter(r => r.verdict === 'FAIL').length;
51
+ const totalScore = results.reduce((s, r) => s + r.score, 0);
52
+
53
+ // Timing
54
+ const totalDurationMs = results.reduce((s, r) => s + (r.duration_ms || 0), 0);
55
+ const avgDurationMs = total ? totalDurationMs / total : 0;
56
+
57
+ // Tokens
58
+ const totalInputTokens = results.reduce((s, r) => s + ((r.tokens_used || {}).input || 0), 0);
59
+ const totalOutputTokens = results.reduce((s, r) => s + ((r.tokens_used || {}).output || 0), 0);
60
+ const totalTokens = totalInputTokens + totalOutputTokens;
61
+
62
+ // By category
63
+ const byCat = {};
64
+ for (const r of results) {
65
+ if (!byCat[r.category]) byCat[r.category] = [];
66
+ byCat[r.category].push(r);
67
+ }
68
+
69
+ // By website (top failures)
70
+ const bySite = {};
71
+ for (const r of results) {
72
+ if (!bySite[r.url]) bySite[r.url] = [];
73
+ bySite[r.url].push(r);
74
+ }
75
+
76
+ // Failure modes
77
+ const blockers = {};
78
+ for (const r of results) {
79
+ if (r.error) blockers[r.error] = (blockers[r.error] || 0) + 1;
80
+ }
81
+
82
+ const catStats = {};
83
+ for (const [cat, rs] of Object.entries(byCat)) {
84
+ catStats[cat] = {
85
+ total: rs.length,
86
+ passed: rs.filter(r => r.verdict === 'PASS').length,
87
+ score: rs.reduce((s, r) => s + r.score, 0) / rs.length
88
+ };
89
+ }
90
+
91
+ const siteFailures = Object.entries(bySite)
92
+ .map(([site, rs]) => [site, rs.filter(r => r.verdict === 'FAIL').length])
93
+ .sort((a, b) => b[1] - a[1]).slice(0, 20);
94
+
95
+ console.log(JSON.stringify({
96
+ total, passed, partial, failed,
97
+ totalScore, avgScore: total ? totalScore / total : 0,
98
+ totalDurationMs, avgDurationMs,
99
+ totalInputTokens, totalOutputTokens, totalTokens,
100
+ byCategory: catStats,
101
+ bySiteFailures: Object.fromEntries(siteFailures),
102
+ blockers
103
+ }, null, 2));
104
+ "
105
+ ```
106
+
107
+ ## Report Template
108
+
109
+ Write `web-bench-report.md` with this structure:
110
+
111
+ ```markdown
112
+ # WebBench Benchmark Report
113
+
114
+ **Date:** {timestamp}
115
+ **Agent:** {agent identifier}
116
+ **Dataset:** Halluminate/WebBench
117
+ **Tasks evaluated:** {total} / 2454
118
+
119
+ ---
120
+
121
+ ## Overall Results
122
+
123
+ | Metric | Value |
124
+ |--------|-------|
125
+ | **Pass Rate** | {passed}/{total} ({pass_pct}%) |
126
+ | **Partial Rate** | {partial}/{total} ({partial_pct}%) |
127
+ | **Fail Rate** | {failed}/{total} ({fail_pct}%) |
128
+ | **Average Score** | {avg_score:.2f} / 1.0 |
129
+ | **Total Duration** | {total_duration_formatted} |
130
+ | **Avg Duration/Task** | {avg_duration_formatted} |
131
+ | **Total Tokens** | {total_tokens:,} ({input_tokens:,} input + {output_tokens:,} output) |
132
+ | **Avg Tokens/Task** | {avg_tokens:,} |
133
+
134
+ ## Results by Category
135
+
136
+ | Category | Total | Pass | Partial | Fail | Pass Rate | Avg Score |
137
+ |----------|-------|------|---------|------|-----------|-----------|
138
+ | READ | ... | ... | ... | ... | ...% | ... |
139
+ | CREATE | ... | ... | ... | ... | ...% | ... |
140
+ | UPDATE | ... | ... | ... | ... | ...% | ... |
141
+ | DELETE | ... | ... | ... | ... | ...% | ... |
142
+ | FILE_MANIPULATION | ... | ... | ... | ... | ...% | ... |
143
+
144
+ ## Timing Breakdown
145
+
146
+ | Category | Avg Duration | Min | Max |
147
+ |----------|-------------|-----|-----|
148
+ | READ | ... | ... | ... |
149
+ | CREATE | ... | ... | ... |
150
+ | ... | | | |
151
+
152
+ ## Token Usage Breakdown
153
+
154
+ | Category | Avg Input Tokens | Avg Output Tokens | Avg Total |
155
+ |----------|-----------------|-------------------|-----------|
156
+ | READ | ... | ... | ... |
157
+ | CREATE | ... | ... | ... |
158
+ | ... | | | |
159
+
160
+ ## Top Failure Modes
161
+
162
+ | Failure Mode | Count | % of Failures |
163
+ |-------------|-------|---------------|
164
+ | Auth required | ... | ... |
165
+ | CAPTCHA | ... | ... |
166
+ | Site unavailable | ... | ... |
167
+ | Navigation failure | ... | ... |
168
+
169
+ ## Worst Performing Websites (by failure count)
170
+
171
+ | Website | Tasks | Failures | Failure Rate |
172
+ |---------|-------|----------|-------------|
173
+ | ... | ... | ... | ... |
174
+
175
+ ## Best Performing Websites (by pass rate, min 3 tasks)
176
+
177
+ | Website | Tasks | Pass Rate | Avg Score |
178
+ |---------|-------|-----------|-----------|
179
+ | ... | ... | ... | ... |
180
+
181
+ ## Sample Failures
182
+
183
+ {Show 5-10 representative failures with task description, what went wrong, and verdict reasoning}
184
+
185
+ ## Methodology
186
+
187
+ - **Execution:** One task per session via agent-web-interface browser automation
188
+ - **Evaluation:** LLM-as-judge with structured rubric (PASS=1.0, PARTIAL=0.5, FAIL=0.0)
189
+ - **Scoring dimensions:** Navigation, Comprehension, Completeness, Accuracy, Confirmation
190
+ - **Infrastructure blockers** (auth, CAPTCHA, site down) scored as FAIL but flagged separately
191
+ ```
192
+
193
+ ## Output
194
+
195
+ - **File:** `web-bench-report.md` in working directory
196
+ - Report should be self-contained and readable without the raw JSONL data
197
+
198
+ ## Guardrails
199
+
200
+ - Use the JSONL file as the sole source of truth — do not fabricate statistics
201
+ - Format all durations as human-readable (e.g., "2h 34m 12s" not "9252000ms")
202
+ - Format token counts with thousands separators
203
+ - Round percentages to one decimal place
204
+ - If results file has fewer than the expected total tasks, note this prominently in the report header
@@ -0,0 +1,2 @@
1
+ frontmatter:
2
+ user-invocable: false
@@ -0,0 +1,209 @@
1
+ ---
2
+ name: load-dataset
3
+ description: >
4
+ Download and prepare the Halluminate/WebBench dataset from HuggingFace for benchmarking.
5
+ Triggers: "load dataset", "download WebBench", "prepare benchmark data", "fetch tasks".
6
+ Downloads the CSV dataset via curl, converts to JSONL with Node.js, applies optional filters
7
+ (category, sample size, website allowlist/blocklist), and writes web-bench-tasks.jsonl to the
8
+ working directory. Zero Python dependencies — uses only curl and Node.js.
9
+ Does NOT execute tasks — use execute-task for that.
10
+ allowed-tools: Bash Read Write Edit Glob
11
+ ---
12
+
13
+ # Load WebBench Dataset
14
+
15
+ Download the Halluminate/WebBench dataset from HuggingFace and prepare it for benchmark execution.
16
+
17
+ ## Dataset Source
18
+
19
+ - **HuggingFace:** `Halluminate/WebBench`
20
+ - **Source file:** `webbenchfinal.csv` (CSV format)
21
+ - **Size:** ~2,454 tasks across 452 websites
22
+ - **Fields per row:** `ID` (int), `Starting_URL` (string), `Category` (enum), `Task` (string)
23
+
24
+ ## Pre-check: Skip Download if Dataset Exists
25
+
26
+ Before downloading, check if `web-bench-tasks.jsonl` already exists in the working directory:
27
+
28
+ ```bash
29
+ if [ -f web-bench-tasks.jsonl ]; then
30
+ echo "Dataset already exists: $(wc -l < web-bench-tasks.jsonl) tasks"
31
+ head -1 web-bench-tasks.jsonl
32
+ fi
33
+ ```
34
+
35
+ **If `web-bench-tasks.jsonl` exists and is non-empty, skip the download and conversion entirely.** Jump straight to [Applying Filters](#applying-filters) if filters need to be applied, or report the existing dataset to the tracker.
36
+
37
+ Only proceed with download if the file does not exist or is empty.
38
+
39
+ ## Download Method
40
+
41
+ Download the CSV directly with `curl`, then convert to JSONL with Node.js. No Python dependencies required.
42
+
43
+ ### Step 1: Download the CSV
44
+
45
+ ```bash
46
+ curl -fSL -o web-bench-dataset.csv \
47
+ "https://huggingface.co/datasets/Halluminate/WebBench/resolve/main/webbenchfinal.csv"
48
+ ```
49
+
50
+ If the above URL fails (HuggingFace sometimes changes paths), try:
51
+
52
+ ```bash
53
+ curl -fSL -o web-bench-dataset.csv \
54
+ "https://huggingface.co/datasets/Halluminate/WebBench/raw/main/webbenchfinal.csv"
55
+ ```
56
+
57
+ ### Step 2: Convert CSV to JSONL
58
+
59
+ ```bash
60
+ node -e "
61
+ const fs = require('fs');
62
+ const csv = fs.readFileSync('web-bench-dataset.csv', 'utf-8');
63
+ const lines = csv.split('\n');
64
+ const header = lines[0].split(',').map(h => h.trim().replace(/^\"|\"$/g, ''));
65
+
66
+ // Find column indices
67
+ const idIdx = header.findIndex(h => h === 'ID');
68
+ const urlIdx = header.findIndex(h => h === 'Starting_URL');
69
+ const catIdx = header.findIndex(h => h === 'Category');
70
+ const taskIdx = header.findIndex(h => h === 'Task');
71
+
72
+ const out = fs.createWriteStream('web-bench-tasks.jsonl');
73
+ let count = 0;
74
+
75
+ for (let i = 1; i < lines.length; i++) {
76
+ const line = lines[i].trim();
77
+ if (!line) continue;
78
+
79
+ // Parse CSV line respecting quoted fields
80
+ const fields = [];
81
+ let field = '';
82
+ let inQuotes = false;
83
+ for (let j = 0; j < line.length; j++) {
84
+ const ch = line[j];
85
+ if (ch === '\"') {
86
+ inQuotes = !inQuotes;
87
+ } else if (ch === ',' && !inQuotes) {
88
+ fields.push(field.trim());
89
+ field = '';
90
+ } else {
91
+ field += ch;
92
+ }
93
+ }
94
+ fields.push(field.trim());
95
+
96
+ if (fields.length > taskIdx) {
97
+ out.write(JSON.stringify({
98
+ id: parseInt(fields[idIdx], 10),
99
+ url: fields[urlIdx],
100
+ category: fields[catIdx],
101
+ task: fields[taskIdx]
102
+ }) + '\n');
103
+ count++;
104
+ }
105
+ }
106
+
107
+ out.end();
108
+ console.log('Wrote ' + count + ' tasks to web-bench-tasks.jsonl');
109
+ "
110
+ ```
111
+
112
+ ### Step 3: Verify the output
113
+
114
+ ```bash
115
+ wc -l web-bench-tasks.jsonl
116
+ head -1 web-bench-tasks.jsonl
117
+ node -e "
118
+ const fs = require('fs');
119
+ const tasks = fs.readFileSync('web-bench-tasks.jsonl','utf-8').trim().split('\n').map(JSON.parse);
120
+ const cats = {};
121
+ const sites = new Set();
122
+ for (const t of tasks) {
123
+ cats[t.category] = (cats[t.category] || 0) + 1;
124
+ sites.add(t.url);
125
+ }
126
+ for (const [c, n] of Object.entries(cats).sort()) console.log(' ' + c + ': ' + n);
127
+ console.log('Total: ' + tasks.length + ' tasks across ' + sites.size + ' websites');
128
+ "
129
+ ```
130
+
131
+ ## Applying Filters
132
+
133
+ After downloading, apply filters based on tracker configuration. All filters use Node.js.
134
+
135
+ ### Category Filter
136
+
137
+ If the tracker specifies a category filter (e.g., `READ`, `CREATE`):
138
+
139
+ ```bash
140
+ node -e "
141
+ const fs = require('fs');
142
+ const category = process.argv[1];
143
+ const tasks = fs.readFileSync('web-bench-tasks.jsonl','utf-8').trim().split('\n').map(JSON.parse);
144
+ const filtered = tasks.filter(t => t.category === category);
145
+ fs.writeFileSync('web-bench-tasks.jsonl', filtered.map(JSON.stringify).join('\n') + '\n');
146
+ console.log('Filtered to ' + filtered.length + ' ' + category + ' tasks');
147
+ " "READ"
148
+ ```
149
+
150
+ ### Sample Size
151
+
152
+ If the tracker specifies a sample size (e.g., `--sample 50`):
153
+
154
+ ```bash
155
+ node -e "
156
+ const fs = require('fs');
157
+ const n = parseInt(process.argv[1], 10);
158
+ const tasks = fs.readFileSync('web-bench-tasks.jsonl','utf-8').trim().split('\n').map(JSON.parse);
159
+
160
+ // Deterministic shuffle (seed-based) for reproducibility
161
+ function seededShuffle(arr, seed) {
162
+ const a = [...arr];
163
+ let s = seed;
164
+ for (let i = a.length - 1; i > 0; i--) {
165
+ s = (s * 1664525 + 1013904223) & 0xffffffff;
166
+ const j = ((s >>> 0) % (i + 1));
167
+ [a[i], a[j]] = [a[j], a[i]];
168
+ }
169
+ return a;
170
+ }
171
+
172
+ const sample = seededShuffle(tasks, 42).slice(0, Math.min(n, tasks.length));
173
+ fs.writeFileSync('web-bench-tasks.jsonl', sample.map(JSON.stringify).join('\n') + '\n');
174
+ console.log('Sampled ' + sample.length + ' tasks');
175
+ " "50"
176
+ ```
177
+
178
+ ### Website Blocklist
179
+
180
+ ```bash
181
+ node -e "
182
+ const fs = require('fs');
183
+ const blocklist = new Set(process.argv[1] ? process.argv[1].split(',') : []);
184
+ const tasks = fs.readFileSync('web-bench-tasks.jsonl','utf-8').trim().split('\n').map(JSON.parse);
185
+ const filtered = tasks.filter(t => !blocklist.has(t.url));
186
+ fs.writeFileSync('web-bench-tasks.jsonl', filtered.map(JSON.stringify).join('\n') + '\n');
187
+ console.log(filtered.length + ' tasks after blocklist filter');
188
+ " ""
189
+ ```
190
+
191
+ ## Output
192
+
193
+ - **File:** `web-bench-tasks.jsonl` in working directory
194
+ - **Intermediate file:** `web-bench-dataset.csv` (can be deleted after conversion)
195
+ - **Format:** One JSON object per line
196
+ - **Schema:**
197
+ ```json
198
+ {"id": 42, "url": "https://acehardware.com", "category": "READ", "task": "Navigate to..."}
199
+ ```
200
+
201
+ ## Cleanup
202
+
203
+ After successful conversion, remove the intermediate CSV:
204
+
205
+ ```bash
206
+ rm -f web-bench-dataset.csv
207
+ ```
208
+
209
+ Report the total count and category breakdown to the tracker.
@@ -0,0 +1,2 @@
1
+ frontmatter:
2
+ user-invocable: false
@@ -0,0 +1,92 @@
1
+ ---
2
+ name: run-benchmark
3
+ description: >
4
+ Run the WebBench browser agent benchmark — main entry point and orchestrator.
5
+ Triggers: "run benchmark", "run WebBench", "start benchmark", "benchmark browser agent",
6
+ "web bench", "execute WebBench", "run web-bench".
7
+ Parses user configuration (category filter, sample size, resume), delegates to
8
+ load-dataset, execute-task, evaluate-task, and generate-report skills.
9
+ This is the user-invocable orchestrator that ties the full benchmark pipeline together.
10
+ allowed-tools: Read Write Edit Glob Grep Bash Task mcp__browser__ping mcp__browser__navigate mcp__browser__find mcp__browser__get_element mcp__browser__get_form mcp__browser__get_field mcp__browser__click mcp__browser__type mcp__browser__press mcp__browser__select mcp__browser__hover mcp__browser__drag mcp__browser__scroll mcp__browser__scroll_to mcp__browser__wheel mcp__browser__snapshot mcp__browser__screenshot mcp__browser__go_back mcp__browser__go_forward mcp__browser__reload mcp__browser__list_pages mcp__browser__close_page
11
+ ---
12
+
13
+ # Run WebBench Benchmark
14
+
15
+ Main entry point for running the WebBench browser agent benchmark. This skill is used in interactive (single-session) mode. For multi-session workflow execution, see the system prompt.
16
+
17
+ ## Input
18
+
19
+ Parse configuration from: `$ARGUMENTS`
20
+
21
+ Supported flags:
22
+
23
+ | Flag | Description | Default |
24
+ |------|-------------|---------|
25
+ | `--category <CAT>` | Filter tasks by category (READ, CREATE, UPDATE, DELETE, FILE_MANIPULATION) | All categories |
26
+ | `--sample <N>` | Random sample of N tasks (deterministic seed=42) | Full dataset |
27
+ | `--resume` | Resume from existing web-bench-results.jsonl, skip completed task IDs | Fresh run |
28
+ | `--report-only` | Skip execution, just generate report from existing results | Full run |
29
+
30
+ Examples:
31
+ - `run-benchmark --category READ --sample 50` — 50 random READ tasks
32
+ - `run-benchmark --resume` — continue from where last run stopped
33
+ - `run-benchmark --report-only` — just aggregate existing results
34
+
35
+ ## Interactive Execution Protocol
36
+
37
+ When run interactively (not via the workflow loop), this skill executes the full pipeline in a single session:
38
+
39
+ ### 1. Setup
40
+
41
+ 1. Parse arguments
42
+ 2. Check for existing state (`web-bench-tasks.jsonl`, `web-bench-results.jsonl`)
43
+ 3. If `--resume` and results exist: determine completed task IDs, skip them
44
+ 4. If not resuming and not `--report-only`: load the `load-dataset` skill to download and prepare the dataset
45
+ 5. Report configuration and task count
46
+
47
+ ### 2. Execute Tasks
48
+
49
+ For each task in `web-bench-tasks.jsonl` (skipping completed if resuming):
50
+
51
+ 1. Read the task line
52
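+ 2. Record start time in epoch milliseconds: `node -e 'console.log(Date.now())'` (portable; `date +%s%3N` requires GNU date and prints a literal `N` on macOS/BSD)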
53
+ 3. Load `execute-task` methodology and perform browser automation
54
+ 4. Load `evaluate-task` methodology and score the result
55
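+ 5. Record end time in epoch milliseconds: `node -e 'console.log(Date.now())'` (portable; `date +%s%3N` requires GNU date), then compute `duration_ms` as end minus start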
56
+ 6. Append result to `web-bench-results.jsonl`:
57
+ ```json
58
+ {"id": 42, "url": "...", "category": "READ", "task": "...", "score": 1.0, "verdict": "PASS", "reasoning": "...", "error": null, "duration_ms": 34200, "tokens_used": {"input": 12450, "output": 3200}, "timestamp": "2026-03-19T14:30:00Z"}
59
+ ```
60
+ 7. Print progress in the format shown under Progress Display: `[42/2454] PASS (1.0) acehardware.com READ 34.2s 15,650 tokens`
61
+
62
+ ### 3. Generate Report
63
+
64
+ After all tasks are processed (or if `--report-only`):
65
+
66
+ 1. Load `generate-report` methodology
67
+ 2. Aggregate `web-bench-results.jsonl` into `web-bench-report.md`
68
+ 3. Print summary statistics to console
69
+
70
+ ## Token Tracking
71
+
72
+ Token usage should be tracked per task. The agent should estimate tokens consumed during task execution by recording:
73
+
74
+ - **Input tokens:** Approximate from the size of prompts, page snapshots, and tool responses received during execution
75
+ - **Output tokens:** Approximate from the size of responses and tool calls generated
76
+
77
+ If exact token counts are available from the session metadata, prefer those over estimates.
78
+
79
+ ## Progress Display
80
+
81
+ After each task, print a status line:
82
+
83
+ ```
84
+ [1/50] PASS (1.0) acehardware.com READ 34.2s 15,650 tokens
85
+ [2/50] FAIL (0.0) airbnb.com CREATE 12.1s 8,200 tokens [auth_required]
86
+ [3/50] PARTIAL(0.5) amazon.com READ 45.8s 22,100 tokens
87
+ ```
88
+
89
+ ## Guardrails
90
+
91
+ - **Always append, never overwrite** results. The JSONL file is append-only.
92
+ - **Respect the dataset.** Do not modify task descriptions or skip tasks without recording a FAIL.
@@ -0,0 +1,3 @@
1
+ frontmatter:
2
+ argument-hint: "[--category READ|CREATE|UPDATE|DELETE|FILE_MANIPULATION] [--sample N] [--resume] [--report-only]"
3
+ user-invocable: true
@@ -0,0 +1,18 @@
1
+ {
2
+ "schemaVersion": 1,
3
+ "pluginRef": "web-bench@athena-workflow-marketplace",
4
+ "pluginName": "web-bench",
5
+ "marketplaceName": "athena-workflow-marketplace",
6
+ "version": "1.0.3",
7
+ "artifacts": {
8
+ "claude": {
9
+ "type": "directory",
10
+ "path": "./claude/plugin"
11
+ },
12
+ "codex": {
13
+ "type": "marketplace",
14
+ "marketplacePath": "./.agents/plugins/marketplace.json",
15
+ "pluginPath": "./codex/plugin"
16
+ }
17
+ }
18
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "schemaVersion": 1,
3
+ "name": "athena-workflow-marketplace",
4
+ "plugins": [
5
+ {
6
+ "name": "web-bench",
7
+ "version": "1.0.5",
8
+ "source": {
9
+ "source": "local",
10
+ "path": "./codex/plugin"
11
+ }
12
+ }
13
+ ]
14
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "web-bench",
3
+ "description": "WebBench benchmark runner \u2014 executes real-world browser tasks from the Halluminate/WebBench dataset, scores via LLM-as-judge, and produces evaluation reports",
4
+ "version": "1.0.5",
5
+ "author": {
6
+ "name": "Athenaflow"
7
+ },
8
+ "keywords": [
9
+ "benchmark",
10
+ "evaluation",
11
+ "webbench",
12
+ "llm-judge",
13
+ "browser-tasks"
14
+ ],
15
+ "category": "evaluation"
16
+ }
@@ -0,0 +1,19 @@
1
+ {
2
+ "mcpServers": {
3
+ "agent-web-interface": {
4
+ "command": "npx",
5
+ "args": ["-y", "agent-web-interface@latest"],
6
+ "env": {
7
+ "NODE_ENV": "production"
8
+ },
9
+ "options": [
10
+ { "label": "Auto (user → persistent → isolated)", "env": {} },
11
+ { "label": "User's Chrome", "env": { "AWI_BROWSER_MODE": "user" } },
12
+ { "label": "Persistent profile", "env": { "AWI_BROWSER_MODE": "persistent" } },
13
+ { "label": "Isolated (temp profile)", "env": { "AWI_BROWSER_MODE": "isolated" } },
14
+ { "label": "Headless", "env": { "AWI_HEADLESS": "true" } },
15
+ { "label": "Connect to CDP endpoint", "env": { "AWI_CDP_URL": "http://localhost:9222" } }
16
+ ]
17
+ }
18
+ }
19
+ }
@@ -0,0 +1,9 @@
1
+ {
2
+ "name": "@athenaflow/plugin-web-bench",
3
+ "version": "1.0.5",
4
+ "description": "WebBench benchmark runner — executes real-world browser tasks from the Halluminate/WebBench dataset, scores via LLM-as-judge, and produces evaluation reports",
5
+ "license": "MIT",
6
+ "publishConfig": {
7
+ "access": "public"
8
+ }
9
+ }