@athenaflow/plugin-web-bench 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/.claude-plugin/plugin.json +16 -0
  2. package/.codex-plugin/plugin.json +16 -0
  3. package/.mcp.json +19 -0
  4. package/dist/1.0.3/.agents/plugins/marketplace.json +14 -0
  5. package/dist/1.0.3/claude/plugin/.claude-plugin/plugin.json +16 -0
  6. package/dist/1.0.3/claude/plugin/.mcp.json +19 -0
  7. package/dist/1.0.3/claude/plugin/package.json +9 -0
  8. package/dist/1.0.3/claude/plugin/skills/evaluate-task/SKILL.md +173 -0
  9. package/dist/1.0.3/claude/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  10. package/dist/1.0.3/claude/plugin/skills/execute-task/SKILL.md +133 -0
  11. package/dist/1.0.3/claude/plugin/skills/execute-task/agents/claude.yaml +2 -0
  12. package/dist/1.0.3/claude/plugin/skills/generate-report/SKILL.md +204 -0
  13. package/dist/1.0.3/claude/plugin/skills/generate-report/agents/claude.yaml +2 -0
  14. package/dist/1.0.3/claude/plugin/skills/load-dataset/SKILL.md +209 -0
  15. package/dist/1.0.3/claude/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  16. package/dist/1.0.3/claude/plugin/skills/run-benchmark/SKILL.md +92 -0
  17. package/dist/1.0.3/claude/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  18. package/dist/1.0.3/codex/plugin/.codex-plugin/plugin.json +16 -0
  19. package/dist/1.0.3/codex/plugin/.mcp.json +19 -0
  20. package/dist/1.0.3/codex/plugin/package.json +9 -0
  21. package/dist/1.0.3/codex/plugin/skills/evaluate-task/SKILL.md +173 -0
  22. package/dist/1.0.3/codex/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  23. package/dist/1.0.3/codex/plugin/skills/execute-task/SKILL.md +133 -0
  24. package/dist/1.0.3/codex/plugin/skills/execute-task/agents/claude.yaml +2 -0
  25. package/dist/1.0.3/codex/plugin/skills/generate-report/SKILL.md +204 -0
  26. package/dist/1.0.3/codex/plugin/skills/generate-report/agents/claude.yaml +2 -0
  27. package/dist/1.0.3/codex/plugin/skills/load-dataset/SKILL.md +209 -0
  28. package/dist/1.0.3/codex/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  29. package/dist/1.0.3/codex/plugin/skills/run-benchmark/SKILL.md +92 -0
  30. package/dist/1.0.3/codex/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  31. package/dist/1.0.3/release.json +18 -0
  32. package/dist/1.0.5/.agents/plugins/marketplace.json +14 -0
  33. package/dist/1.0.5/claude/plugin/.claude-plugin/plugin.json +16 -0
  34. package/dist/1.0.5/claude/plugin/.mcp.json +19 -0
  35. package/dist/1.0.5/claude/plugin/package.json +9 -0
  36. package/dist/1.0.5/claude/plugin/skills/evaluate-task/SKILL.md +173 -0
  37. package/dist/1.0.5/claude/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  38. package/dist/1.0.5/claude/plugin/skills/execute-task/SKILL.md +133 -0
  39. package/dist/1.0.5/claude/plugin/skills/execute-task/agents/claude.yaml +2 -0
  40. package/dist/1.0.5/claude/plugin/skills/generate-report/SKILL.md +204 -0
  41. package/dist/1.0.5/claude/plugin/skills/generate-report/agents/claude.yaml +2 -0
  42. package/dist/1.0.5/claude/plugin/skills/load-dataset/SKILL.md +209 -0
  43. package/dist/1.0.5/claude/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  44. package/dist/1.0.5/claude/plugin/skills/run-benchmark/SKILL.md +92 -0
  45. package/dist/1.0.5/claude/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  46. package/dist/1.0.5/codex/plugin/.codex-plugin/plugin.json +16 -0
  47. package/dist/1.0.5/codex/plugin/.mcp.json +19 -0
  48. package/dist/1.0.5/codex/plugin/package.json +9 -0
  49. package/dist/1.0.5/codex/plugin/skills/evaluate-task/SKILL.md +173 -0
  50. package/dist/1.0.5/codex/plugin/skills/evaluate-task/agents/claude.yaml +2 -0
  51. package/dist/1.0.5/codex/plugin/skills/execute-task/SKILL.md +133 -0
  52. package/dist/1.0.5/codex/plugin/skills/execute-task/agents/claude.yaml +2 -0
  53. package/dist/1.0.5/codex/plugin/skills/generate-report/SKILL.md +204 -0
  54. package/dist/1.0.5/codex/plugin/skills/generate-report/agents/claude.yaml +2 -0
  55. package/dist/1.0.5/codex/plugin/skills/load-dataset/SKILL.md +209 -0
  56. package/dist/1.0.5/codex/plugin/skills/load-dataset/agents/claude.yaml +2 -0
  57. package/dist/1.0.5/codex/plugin/skills/run-benchmark/SKILL.md +92 -0
  58. package/dist/1.0.5/codex/plugin/skills/run-benchmark/agents/claude.yaml +3 -0
  59. package/dist/1.0.5/release.json +18 -0
  60. package/package.json +13 -0
  61. package/skills/evaluate-task/SKILL.md +173 -0
  62. package/skills/evaluate-task/agents/claude.yaml +2 -0
  63. package/skills/execute-task/SKILL.md +133 -0
  64. package/skills/execute-task/agents/claude.yaml +2 -0
  65. package/skills/generate-report/SKILL.md +204 -0
  66. package/skills/generate-report/agents/claude.yaml +2 -0
  67. package/skills/load-dataset/SKILL.md +209 -0
  68. package/skills/load-dataset/agents/claude.yaml +2 -0
  69. package/skills/run-benchmark/SKILL.md +92 -0
  70. package/skills/run-benchmark/agents/claude.yaml +3 -0
@@ -0,0 +1,173 @@
1
+ ---
2
+ name: evaluate-task
3
+ description: >
4
+ Evaluate whether a WebBench task was successfully completed using LLM-as-judge scoring.
5
+ Triggers: "evaluate task", "score task", "judge result", "grade benchmark task".
6
+ Examines the execution trace, final page state, and extracted data against the original
7
+ task description. Produces a structured verdict (PASS/PARTIAL/FAIL) with reasoning.
8
+ Does NOT execute browser actions — use execute-task for that.
9
+ allowed-tools: Read Write Edit
10
+ ---
11
+
12
+ # Evaluate WebBench Task
13
+
14
+ Judge whether a completed task execution meets the success criteria defined in the original task description. This is a post-hoc evaluation — no browser interaction, only analysis of the execution trace and captured state.
15
+
16
+ ## Input
17
+
18
+ You receive:
19
+
20
+ 1. **Original task:** `{"id": 42, "url": "...", "category": "READ", "task": "Navigate to the news section and summarize..."}`
21
+ 2. **Execution trace:** Actions taken, final URL, extracted data, blockers encountered, screenshots
22
+
23
+ ## Scoring Rubric
24
+
25
+ ### Verdict Scale
26
+
27
+ | Verdict | Score | Criteria |
28
+ |---------|-------|----------|
29
+ | **PASS** | 1.0 | Task fully completed. All requested information extracted or all requested actions performed. |
30
+ | **PARTIAL** | 0.5 | Task partially completed. Some but not all requirements met. Meaningful progress was made. |
31
+ | **FAIL** | 0.0 | Task not completed. No meaningful progress, wrong information, or blocked before starting. |
32
+
33
+ ### Category-Specific Evaluation
34
+
35
+ #### READ Tasks
36
+ - **PASS:** All requested data was extracted accurately and completely
37
+ - **PARTIAL:** Some data extracted but incomplete (e.g., found the page but missed some fields)
38
+ - **FAIL:** Wrong data, wrong page, or no data extracted
39
+
40
+ #### CREATE Tasks
41
+ - **PASS:** Item was created as specified, confirmation visible
42
+ - **PARTIAL:** Creation started but not confirmed (e.g., form filled but not submitted)
43
+ - **FAIL:** Could not reach the creation form, or creation failed
44
+
45
+ #### UPDATE Tasks
46
+ - **PASS:** Data was modified as specified, change confirmed
47
+ - **PARTIAL:** Found the item but could not complete the modification
48
+ - **FAIL:** Could not find the item or reach the edit interface
49
+
50
+ #### DELETE Tasks
51
+ - **PASS:** Item was deleted and removal confirmed
52
+ - **PARTIAL:** Found the item and initiated deletion but could not confirm
53
+ - **FAIL:** Could not find the item or reach the delete action
54
+
55
+ #### FILE_MANIPULATION Tasks
56
+ - **PASS:** File downloaded with correct name/content
57
+ - **PARTIAL:** Download initiated but not verified
58
+ - **FAIL:** Could not locate or download the file
59
+
60
+ ### Blocker Handling
61
+
62
+ If the execution trace contains blockers, evaluate based on the blocker type:
63
+
64
+ | Blocker | Verdict | Reasoning |
65
+ |---------|---------|-----------|
66
+ | Login required (no credentials) | FAIL | Infrastructure limitation — task requires auth |
67
+ | CAPTCHA | FAIL | Infrastructure limitation — cannot solve programmatically |
68
+ | Site down / 404 | FAIL | External dependency — site unavailable |
69
+ | Geo-restricted | FAIL | Infrastructure limitation — content not accessible |
70
+ | Paywall | FAIL | Infrastructure limitation — paid content |
71
+ | Pop-up could not be dismissed | PARTIAL or FAIL | Depends on whether task could proceed |
72
+
73
+ ### Evaluation Dimensions
74
+
75
+ Score each dimension and use them to determine the overall verdict:
76
+
77
+ 1. **Navigation (required):** Did the agent reach the correct page/section?
78
+ - Correct site? Correct section? Correct page?
79
+
80
+ 2. **Comprehension (required):** Did the agent understand what was being asked?
81
+ - Did it attempt the right type of action? Did it target the right elements?
82
+
83
+ 3. **Completeness (required):** Did the agent fulfill ALL parts of the task?
84
+ - Multi-part tasks: each part must be addressed
85
+ - Quantitative tasks: all requested data points must be present
86
+
87
+ 4. **Accuracy (for READ tasks):** Is the extracted information correct?
88
+ - Does it match what's visible on the page?
89
+ - Are numbers, names, and details accurate?
90
+
91
+ 5. **Confirmation (for non-READ tasks: CREATE, UPDATE, DELETE, FILE_MANIPULATION):** Is there evidence the action was performed?
92
+ - Success message visible? Item appears in list? State changed?
93
+
94
+ ## Evaluation Process
95
+
96
+ ### Step 1: Parse the Task Requirements
97
+
98
+ Break the task description into discrete, verifiable requirements:
99
+
100
+ ```
101
+ Task: "Navigate to the news section and summarize the headline and key points from the latest science policy update."
102
+
103
+ Requirements:
104
+ 1. Navigate to the news section
105
+ 2. Find the latest science policy update
106
+ 3. Extract the headline
107
+ 4. Extract key points
108
+ ```
109
+
110
+ ### Step 2: Check Each Requirement Against the Trace
111
+
112
+ For each requirement, determine if the execution trace shows it was fulfilled:
113
+
114
+ ```
115
+ 1. Navigate to news section → DONE (action 2: clicked "News", URL changed to /news)
116
+ 2. Find latest science policy update → DONE (action 3: found article "New Science Policy...")
117
+ 3. Extract headline → DONE (extracted: "New Science Policy Framework Announced")
118
+ 4. Extract key points → NOT DONE (only headline extracted, no key points)
119
+ ```
120
+
121
+ ### Step 3: Determine Verdict
122
+
123
+ - All requirements met → **PASS**
124
+ - Some requirements met → **PARTIAL**
125
+ - No requirements met or fundamentally wrong approach → **FAIL**
126
+
127
+ ### Step 4: Write Reasoning
128
+
129
+ Provide clear, structured reasoning:
130
+
131
+ ```
132
+ Verdict: PARTIAL (0.5)
133
+ Reasoning: Agent successfully navigated to the news section and identified the correct article.
134
+ The headline was extracted accurately. However, the task also requested "key points" from the
135
+ article, which were not extracted. 3 of 4 requirements met.
136
+ ```
137
+
138
+ ## Output Format
139
+
140
+ Return a structured evaluation result:
141
+
142
+ ```json
143
+ {
144
+ "task_id": 42,
145
+ "verdict": "PARTIAL",
146
+ "score": 0.5,
147
+ "reasoning": "Agent navigated correctly and extracted the headline, but missed the key points requirement. 3/4 requirements fulfilled.",
148
+ "requirements_met": 3,
149
+ "requirements_total": 4,
150
+ "blocker": null
151
+ }
152
+ ```
153
+
154
+ If a blocker prevented execution:
155
+
156
+ ```json
157
+ {
158
+ "task_id": 43,
159
+ "verdict": "FAIL",
160
+ "score": 0.0,
161
+ "reasoning": "Task requires account login. No credentials available — infrastructure limitation.",
162
+ "requirements_met": 0,
163
+ "requirements_total": 3,
164
+ "blocker": "auth_required"
165
+ }
166
+ ```
167
+
168
+ ## Guardrails
169
+
170
+ - **Be strict but fair.** A task that asks for 5 data points and delivers 4 is PARTIAL, not PASS.
171
+ - **Do not hallucinate success.** If the trace doesn't show evidence of completion, it didn't happen.
172
+ - **Separate agent failure from infrastructure failure.** Auth requirements, CAPTCHAs, and site outages are not agent failures — but they are still FAIL verdicts for scoring purposes. Note the distinction in reasoning.
173
+ - **Evaluate what was asked, not what was attempted.** A well-executed wrong approach is still a FAIL.
@@ -0,0 +1,2 @@
1
+ frontmatter:
2
+ user-invocable: false
@@ -0,0 +1,133 @@
1
+ ---
2
+ name: execute-task
3
+ description: >
4
+ Methodology for executing a single WebBench benchmark task via browser automation.
5
+ Triggers: "execute task", "run task", "perform benchmark task", "browser task".
6
+ Interprets the natural-language task description, defines the required browser actions,
7
+ and specifies what final evidence to capture (for example screenshot + snapshot).
8
+ Records an execution trace with actions taken and errors encountered.
9
+ Does NOT evaluate success — use evaluate-task for that.
10
+ allowed-tools: Read Write Edit Bash
11
+ ---
12
+
13
+ # Execute WebBench Task
14
+
15
+ Execute a single WebBench task using a browser-capable calling context. This skill does not own browser MCP tools; it defines the execution protocol the caller should follow while navigating, clicking, typing, and extracting data.
16
+
17
+ ## Input
18
+
19
+ You receive a single task object:
20
+
21
+ ```json
22
+ {"id": 42, "url": "https://acehardware.com", "category": "READ", "task": "Navigate to the news section and summarize..."}
23
+ ```
24
+
25
+ ## Execution Protocol
26
+
27
+ ### 1. Record Start Time
28
+
29
+ Before any browser interaction by the calling context:
30
+
31
+ ```bash
32
+ date +%s%3N
33
+ ```
34
+
35
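+ Save this as `start_time_ms`. You will need it for the result record. Note: `%3N` (milliseconds) requires GNU `date`; on macOS/BSD, where `%N` is unsupported, use `node -e 'console.log(Date.now())'` instead.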
36
+
37
+ ### 2. Navigate to Starting URL
38
+
39
+ ```
40
+ navigate → task.url
41
+ ```
42
+
43
+ The calling context should wait for the page to load and take an initial snapshot to understand the page structure.
44
+
45
+ ### 3. Interpret the Task
46
+
47
+ Read the task description carefully. Classify it:
48
+
49
+ | Category | What to Do |
50
+ |----------|------------|
51
+ | **READ** | Navigate to the right page/section, extract the requested information. Your "result" is the extracted data. |
52
+ | **CREATE** | Fill forms, create accounts/entries/posts as described. Your "result" is confirmation the item was created. |
53
+ | **UPDATE** | Find existing data and modify it as described. Your "result" is confirmation the update was applied. |
54
+ | **DELETE** | Find and remove the specified item. Your "result" is confirmation of deletion. |
55
+ | **FILE_MANIPULATION** | Download the specified file. Your "result" is the filename and confirmation of download. |
56
+
57
+ ### 4. Execute Browser Actions
58
+
59
+ Work through the task step by step:
60
+
61
+ 1. **Observe** — Use `snapshot` or `find` to understand the current page state
62
+ 2. **Plan** — Decide the next action based on what you see
63
+ 3. **Act** — Use the appropriate browser tool available in the calling context (`click`, `type`, `select`, etc.)
64
+ 4. **Verify** — Check that the action had the expected effect
65
+
66
+ **Key principles:**
67
+
68
+ - **Use `find` with `kind` filters** to locate interactive elements (buttons, links, textboxes)
69
+ - **Use `snapshot`** to get a full page state when you need orientation
70
+ - **Use `screenshot`** to visually verify state when snapshots are ambiguous
71
+ - **Handle pop-ups and modals** — cookie banners, newsletter pop-ups, chat widgets. Dismiss them before proceeding
72
+ - **Stay on the specified site** — Tasks often say "Only use [site] to achieve the task." Respect this constraint
73
+ - **Handle pagination** — If data spans multiple pages, navigate through them
74
+ - **Be patient with slow sites** — Some sites load content dynamically. If elements aren't found immediately, try scrolling or waiting
75
+
76
+ ### 5. Handle Common Obstacles
77
+
78
+ | Obstacle | Strategy |
79
+ |----------|----------|
80
+ | **Cookie consent banner** | Find and click "Accept" or "Close" |
81
+ | **Login required** | Record as blocker — do not attempt to create accounts or guess credentials |
82
+ | **CAPTCHA** | Record as blocker — cannot solve programmatically |
83
+ | **Paywall** | Record as blocker |
84
+ | **Geo-restricted content** | Record as blocker |
85
+ | **Site down / 404** | Record as blocker (site unavailable) |
86
+ | **Pop-up / overlay blocking interaction** | Dismiss it, then continue |
87
+
88
+ ### 6. Capture Final State
89
+
90
+ After completing the task (or hitting a blocker):
91
+
92
+ 1. Have the calling context take a **screenshot** of the final page state
93
+ 2. Have the calling context take a **snapshot** of the final DOM state
94
+ 3. Record the **current URL**
95
+
96
+ ### 7. Record End Time
97
+
98
+ ```bash
99
+ date +%s%3N
100
+ ```
101
+
102
+ Save as `end_time_ms`. Compute `duration_ms = end_time_ms - start_time_ms`.
103
+
104
+ ### 8. Build Execution Trace
105
+
106
+ Construct a structured trace of what happened:
107
+
108
+ ```json
109
+ {
110
+ "task_id": 42,
111
+ "actions": [
112
+ {"step": 1, "action": "navigate", "target": "https://acehardware.com", "result": "loaded"},
113
+ {"step": 2, "action": "click", "target": "News section link", "result": "navigated to /news"},
114
+ {"step": 3, "action": "extract", "target": "headline text", "result": "Black Friday Deals..."}
115
+ ],
116
+ "final_url": "https://acehardware.com/news",
117
+ "blockers": [],
118
+ "extracted_data": "The headline is 'Black Friday Deals'...",
119
+ "duration_ms": 34200
120
+ }
121
+ ```
122
+
123
+ ## Output
124
+
125
+ Return the execution trace to the calling context (system prompt / run-benchmark skill). Do NOT write results to disk — persisting results is the calling context's responsibility after evaluation.
126
+
127
+ ## Guardrails
128
+
129
+ - **One task only.** Execute exactly the task given. Do not browse further or attempt other tasks.
130
+ - **No account creation.** If a task requires login, record it as a blocker. Do not create accounts.
131
+ - **No credential guessing.** Never attempt to guess passwords or bypass authentication.
132
+ - **Step limit awareness.** If a task is taking an unreasonable number of steps (>20 actions), consider it likely stuck, stop, and record what you have.
133
+ - **Be careful with destructive actions on real sites.** For CREATE/UPDATE/DELETE tasks, be aware that these are real production websites. If the task would create real accounts or modify real data, record this concern in the trace but still attempt the task as specified (this is the nature of the benchmark).
@@ -0,0 +1,2 @@
1
+ frontmatter:
2
+ user-invocable: false
@@ -0,0 +1,204 @@
1
+ ---
2
+ name: generate-report
3
+ description: >
4
+ Aggregate WebBench benchmark results into a comprehensive evaluation report.
5
+ Triggers: "generate report", "create benchmark report", "summarize results",
6
+ "aggregate scores", "produce evaluation report".
7
+ Reads web-bench-results.jsonl, computes statistics by category/website/failure mode,
8
+ and writes web-bench-report.md with pass rates, timing, token usage, and analysis.
9
+ Does NOT execute or evaluate tasks — only aggregates existing results.
10
+ allowed-tools: Read Write Edit Bash Glob Grep
11
+ ---
12
+
13
+ # Generate WebBench Benchmark Report
14
+
15
+ Aggregate all results from `web-bench-results.jsonl` into a comprehensive markdown report.
16
+
17
+ ## Input
18
+
19
+ - **Results file:** `web-bench-results.jsonl` — one JSON line per completed task
20
+ - **Result line schema:**
21
+ ```json
22
+ {
23
+ "id": 42,
24
+ "url": "https://acehardware.com",
25
+ "category": "READ",
26
+ "task": "Navigate to...",
27
+ "score": 1.0,
28
+ "verdict": "PASS",
29
+ "reasoning": "Successfully extracted all specs",
30
+ "error": null,
31
+ "duration_ms": 34200,
32
+ "tokens_used": {"input": 12450, "output": 3200},
33
+ "timestamp": "2026-03-19T14:30:00Z"
34
+ }
35
+ ```
36
+
37
+ ## Report Generation
38
+
39
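+ Use Node.js to compute the core statistics. The script below covers overall totals, per-category pass counts and average scores, per-site failure counts, and failure modes; extend it for the per-category partial/fail counts, timing min/max, and per-category token averages that the report template requires: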
40
+
41
+ ```bash
42
+ node -e "
43
+ const fs = require('fs');
44
+ const results = fs.readFileSync('web-bench-results.jsonl','utf-8').trim().split('\n')
45
+ .filter(l => l.trim()).map(JSON.parse);
46
+
47
+ const total = results.length;
48
+ const passed = results.filter(r => r.verdict === 'PASS').length;
49
+ const partial = results.filter(r => r.verdict === 'PARTIAL').length;
50
+ const failed = results.filter(r => r.verdict === 'FAIL').length;
51
+ const totalScore = results.reduce((s, r) => s + r.score, 0);
52
+
53
+ // Timing
54
+ const totalDurationMs = results.reduce((s, r) => s + (r.duration_ms || 0), 0);
55
+ const avgDurationMs = total ? totalDurationMs / total : 0;
56
+
57
+ // Tokens
58
+ const totalInputTokens = results.reduce((s, r) => s + ((r.tokens_used || {}).input || 0), 0);
59
+ const totalOutputTokens = results.reduce((s, r) => s + ((r.tokens_used || {}).output || 0), 0);
60
+ const totalTokens = totalInputTokens + totalOutputTokens;
61
+
62
+ // By category
63
+ const byCat = {};
64
+ for (const r of results) {
65
+ if (!byCat[r.category]) byCat[r.category] = [];
66
+ byCat[r.category].push(r);
67
+ }
68
+
69
+ // By website (top failures)
70
+ const bySite = {};
71
+ for (const r of results) {
72
+ if (!bySite[r.url]) bySite[r.url] = [];
73
+ bySite[r.url].push(r);
74
+ }
75
+
76
+ // Failure modes
77
+ const blockers = {};
78
+ for (const r of results) {
79
+ if (r.error) blockers[r.error] = (blockers[r.error] || 0) + 1;
80
+ }
81
+
82
+ const catStats = {};
83
+ for (const [cat, rs] of Object.entries(byCat)) {
84
+ catStats[cat] = {
85
+ total: rs.length,
86
+ passed: rs.filter(r => r.verdict === 'PASS').length,
87
+ score: rs.reduce((s, r) => s + r.score, 0) / rs.length
88
+ };
89
+ }
90
+
91
+ const siteFailures = Object.entries(bySite)
92
+ .map(([site, rs]) => [site, rs.filter(r => r.verdict === 'FAIL').length])
93
+ .sort((a, b) => b[1] - a[1]).slice(0, 20);
94
+
95
+ console.log(JSON.stringify({
96
+ total, passed, partial, failed,
97
+ totalScore, avgScore: total ? totalScore / total : 0,
98
+ totalDurationMs, avgDurationMs,
99
+ totalInputTokens, totalOutputTokens, totalTokens,
100
+ byCategory: catStats,
101
+ bySiteFailures: Object.fromEntries(siteFailures),
102
+ blockers
103
+ }, null, 2));
104
+ "
105
+ ```
106
+
107
+ ## Report Template
108
+
109
+ Write `web-bench-report.md` with this structure:
110
+
111
+ ```markdown
112
+ # WebBench Benchmark Report
113
+
114
+ **Date:** {timestamp}
115
+ **Agent:** {agent identifier}
116
+ **Dataset:** Halluminate/WebBench
117
+ **Tasks evaluated:** {total} / 2454
118
+
119
+ ---
120
+
121
+ ## Overall Results
122
+
123
+ | Metric | Value |
124
+ |--------|-------|
125
+ | **Pass Rate** | {passed}/{total} ({pass_pct}%) |
126
+ | **Partial Rate** | {partial}/{total} ({partial_pct}%) |
127
+ | **Fail Rate** | {failed}/{total} ({fail_pct}%) |
128
+ | **Average Score** | {avg_score:.2f} / 1.0 |
129
+ | **Total Duration** | {total_duration_formatted} |
130
+ | **Avg Duration/Task** | {avg_duration_formatted} |
131
+ | **Total Tokens** | {total_tokens:,} ({input_tokens:,} input + {output_tokens:,} output) |
132
+ | **Avg Tokens/Task** | {avg_tokens:,} |
133
+
134
+ ## Results by Category
135
+
136
+ | Category | Total | Pass | Partial | Fail | Pass Rate | Avg Score |
137
+ |----------|-------|------|---------|------|-----------|-----------|
138
+ | READ | ... | ... | ... | ... | ...% | ... |
139
+ | CREATE | ... | ... | ... | ... | ...% | ... |
140
+ | UPDATE | ... | ... | ... | ... | ...% | ... |
141
+ | DELETE | ... | ... | ... | ... | ...% | ... |
142
+ | FILE_MANIPULATION | ... | ... | ... | ... | ...% | ... |
143
+
144
+ ## Timing Breakdown
145
+
146
+ | Category | Avg Duration | Min | Max |
147
+ |----------|-------------|-----|-----|
148
+ | READ | ... | ... | ... |
149
+ | CREATE | ... | ... | ... |
150
+ | ... | | | |
151
+
152
+ ## Token Usage Breakdown
153
+
154
+ | Category | Avg Input Tokens | Avg Output Tokens | Avg Total |
155
+ |----------|-----------------|-------------------|-----------|
156
+ | READ | ... | ... | ... |
157
+ | CREATE | ... | ... | ... |
158
+ | ... | | | |
159
+
160
+ ## Top Failure Modes
161
+
162
+ | Failure Mode | Count | % of Failures |
163
+ |-------------|-------|---------------|
164
+ | Auth required | ... | ... |
165
+ | CAPTCHA | ... | ... |
166
+ | Site unavailable | ... | ... |
167
+ | Navigation failure | ... | ... |
168
+
169
+ ## Worst Performing Websites (by failure count)
170
+
171
+ | Website | Tasks | Failures | Failure Rate |
172
+ |---------|-------|----------|-------------|
173
+ | ... | ... | ... | ... |
174
+
175
+ ## Best Performing Websites (by pass rate, min 3 tasks)
176
+
177
+ | Website | Tasks | Pass Rate | Avg Score |
178
+ |---------|-------|-----------|-----------|
179
+ | ... | ... | ... | ... |
180
+
181
+ ## Sample Failures
182
+
183
+ {Show 5-10 representative failures with task description, what went wrong, and verdict reasoning}
184
+
185
+ ## Methodology
186
+
187
+ - **Execution:** One task per session via agent-web-interface browser automation
188
+ - **Evaluation:** LLM-as-judge with structured rubric (PASS=1.0, PARTIAL=0.5, FAIL=0.0)
189
+ - **Scoring dimensions:** Navigation, Comprehension, Completeness, Accuracy, Confirmation
190
+ - **Infrastructure blockers** (auth, CAPTCHA, site down) scored as FAIL but flagged separately
191
+ ```
192
+
193
+ ## Output
194
+
195
+ - **File:** `web-bench-report.md` in working directory
196
+ - Report should be self-contained and readable without the raw JSONL data
197
+
198
+ ## Guardrails
199
+
200
+ - Use the JSONL file as the sole source of truth — do not fabricate statistics
201
+ - Format all durations as human-readable (e.g., "2h 34m 12s" not "9252000ms")
202
+ - Format token counts with thousands separators
203
+ - Round percentages to one decimal place
204
+ - If results file has fewer than the expected total tasks, note this prominently in the report header
@@ -0,0 +1,2 @@
1
+ frontmatter:
2
+ user-invocable: false