npm - @athenaflow/plugin-web-bench - Versions diffs - 1.0.5 - Mend

@athenaflow/plugin-web-bench 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

package/dist/1.0.5/codex/plugin/skills/generate-report/SKILL.md ADDED Viewed

@@ -0,0 +1,204 @@
+---
+name: generate-report
+description: >
+  Aggregate WebBench benchmark results into a comprehensive evaluation report.
+  Triggers: "generate report", "create benchmark report", "summarize results",
+  "aggregate scores", "produce evaluation report".
+  Reads web-bench-results.jsonl, computes statistics by category/website/failure mode,
+  and writes web-bench-report.md with pass rates, timing, token usage, and analysis.
+  Does NOT execute or evaluate tasks — only aggregates existing results.
+allowed-tools: Read Write Edit Bash Glob Grep
+---
+# Generate WebBench Benchmark Report
+Aggregate all results from `web-bench-results.jsonl` into a comprehensive markdown report.
+## Input
+- **Results file:** `web-bench-results.jsonl` — one JSON line per completed task
+- **Result line schema:**
+  ```json
+  {
+    "id": 42,
+    "url": "https://acehardware.com",
+    "category": "READ",
+    "task": "Navigate to...",
+    "score": 1.0,
+    "verdict": "PASS",
+    "reasoning": "Successfully extracted all specs",
+    "error": null,
+    "duration_ms": 34200,
+    "tokens_used": {"input": 12450, "output": 3200},
+    "timestamp": "2026-03-19T14:30:00Z"
+  }
+  ```
+## Report Generation
+Use Node.js to compute all statistics:
+```bash
+node -e "
+// Aggregate web-bench-results.jsonl into every statistic the report template asks for.
+// This script lives inside a double-quoted shell string, so it must not contain
+// backticks, dollar signs or unescaped double quotes.
+const fs = require('fs');
+// Parse line by line so that one truncated record (e.g. from an interrupted run)
+// is reported and skipped instead of aborting the whole report.
+const results = [];
+let skippedLines = 0;
+fs.readFileSync('web-bench-results.jsonl', 'utf-8').split('\n').forEach((line, idx) => {
+  if (line.trim() === '') return;
+  try {
+    const rec = JSON.parse(line);
+    if (rec === null || typeof rec !== 'object') throw new Error('not a JSON object');
+    results.push(rec);
+  } catch (err) {
+    skippedLines++;
+    console.error('Skipping malformed line ' + (idx + 1) + ': ' + err.message);
+  }
+});
+// Missing or non-numeric values count as 0 so a single bad record cannot turn a total into NaN.
+const num = v => (typeof v === 'number' && Number.isFinite(v) ? v : 0);
+const sum = (rs, f) => rs.reduce((s, r) => s + f(r), 0);
+const avg = (rs, f) => (rs.length ? sum(rs, f) / rs.length : 0);
+const countVerdict = (rs, v) => rs.filter(r => r.verdict === v).length;
+const score = r => num(r.score);
+const duration = r => num(r.duration_ms);
+const inputTokens = r => num((r.tokens_used || {}).input);
+const outputTokens = r => num((r.tokens_used || {}).output);
+// Group records by a field. Keys come from the data, so use a null-prototype
+// object to avoid clashes with names inherited from Object.prototype.
+const groupBy = (rs, key) => {
+  const groups = Object.create(null);
+  for (const r of rs) (groups[r[key]] = groups[r[key]] || []).push(r);
+  return groups;
+};
+// Overall totals
+const total = results.length;
+const passed = countVerdict(results, 'PASS');
+const partial = countVerdict(results, 'PARTIAL');
+const failed = countVerdict(results, 'FAIL');
+const totalScore = sum(results, score);
+const totalDurationMs = sum(results, duration);
+const totalInputTokens = sum(results, inputTokens);
+const totalOutputTokens = sum(results, outputTokens);
+const totalTokens = totalInputTokens + totalOutputTokens;
+// By category: verdict counts, average score, timing and token usage
+const catStats = Object.fromEntries(
+  Object.entries(groupBy(results, 'category')).map(([cat, rs]) => {
+    const durations = rs.map(duration);
+    return [cat, {
+      total: rs.length,
+      passed: countVerdict(rs, 'PASS'),
+      partial: countVerdict(rs, 'PARTIAL'),
+      failed: countVerdict(rs, 'FAIL'),
+      score: avg(rs, score),
+      avgDurationMs: avg(rs, duration),
+      minDurationMs: Math.min(...durations),
+      maxDurationMs: Math.max(...durations),
+      avgInputTokens: avg(rs, inputTokens),
+      avgOutputTokens: avg(rs, outputTokens)
+    }];
+  })
+);
+// By website
+const siteStats = Object.entries(groupBy(results, 'url')).map(([site, rs]) => ({
+  site,
+  total: rs.length,
+  passed: countVerdict(rs, 'PASS'),
+  failed: countVerdict(rs, 'FAIL'),
+  avgScore: avg(rs, score)
+}));
+// Same shape as before: the 20 sites with the most failures, as site -> failure count.
+const siteFailures = siteStats.map(s => [s.site, s.failed])
+  .sort((a, b) => b[1] - a[1]).slice(0, 20);
+// Worst sites (at least one failure) with task counts, for the failure-rate column.
+const worstSites = siteStats.filter(s => s.failed > 0)
+  .sort((a, b) => b.failed - a.failed).slice(0, 20);
+// Best sites by pass rate, minimum 3 tasks; ties go to the site with more tasks.
+const bestSites = siteStats.filter(s => s.total >= 3)
+  .sort((a, b) => (b.passed / b.total - a.passed / a.total) || (b.total - a.total))
+  .slice(0, 20);
+// Failure modes: number of records per distinct error value
+const blockers = Object.create(null);
+for (const r of results) {
+  if (r.error) blockers[r.error] = (blockers[r.error] || 0) + 1;
+}
+console.log(JSON.stringify({
+  total, passed, partial, failed,
+  totalScore, avgScore: total ? totalScore / total : 0,
+  totalDurationMs, avgDurationMs: total ? totalDurationMs / total : 0,
+  totalInputTokens, totalOutputTokens, totalTokens,
+  avgTokens: total ? totalTokens / total : 0,
+  byCategory: catStats,
+  bySiteFailures: Object.fromEntries(siteFailures),
+  worstSites, bestSites,
+  blockers,
+  skippedLines
+}, null, 2));
+"
+```
+## Report Template
+Write `web-bench-report.md` with this structure:
+```markdown
+# WebBench Benchmark Report
+**Date:** {timestamp}
+**Agent:** {agent identifier}
+**Dataset:** Halluminate/WebBench
+**Tasks evaluated:** {total} / 2454
+---
+## Overall Results
+| Metric | Value |
+|--------|-------|
+| **Pass Rate** | {passed}/{total} ({pass_pct}%) |
+| **Partial Rate** | {partial}/{total} ({partial_pct}%) |
+| **Fail Rate** | {failed}/{total} ({fail_pct}%) |
+| **Average Score** | {avg_score:.2f} / 1.0 |
+| **Total Duration** | {total_duration_formatted} |
+| **Avg Duration/Task** | {avg_duration_formatted} |
+| **Total Tokens** | {total_tokens:,} ({input_tokens:,} input + {output_tokens:,} output) |
+| **Avg Tokens/Task** | {avg_tokens:,} |
+## Results by Category
+| Category | Total | Pass | Partial | Fail | Pass Rate | Avg Score |
+|----------|-------|------|---------|------|-----------|-----------|
+| READ | ... | ... | ... | ... | ...% | ... |
+| CREATE | ... | ... | ... | ... | ...% | ... |
+| UPDATE | ... | ... | ... | ... | ...% | ... |
+| DELETE | ... | ... | ... | ... | ...% | ... |
+| FILE_MANIPULATION | ... | ... | ... | ... | ...% | ... |
+## Timing Breakdown
+| Category | Avg Duration | Min | Max |
+|----------|-------------|-----|-----|
+| READ | ... | ... | ... |
+| CREATE | ... | ... | ... |
+| ... | | | |
+## Token Usage Breakdown
+| Category | Avg Input Tokens | Avg Output Tokens | Avg Total |
+|----------|-----------------|-------------------|-----------|
+| READ | ... | ... | ... |
+| CREATE | ... | ... | ... |
+| ... | | | |
+## Top Failure Modes
+| Failure Mode | Count | % of Failures |
+|-------------|-------|---------------|
+| Auth required | ... | ... |
+| CAPTCHA | ... | ... |
+| Site unavailable | ... | ... |
+| Navigation failure | ... | ... |
+## Worst Performing Websites (by failure count)
+| Website | Tasks | Failures | Failure Rate |
+|---------|-------|----------|-------------|
+| ... | ... | ... | ... |
+## Best Performing Websites (by pass rate, min 3 tasks)
+| Website | Tasks | Pass Rate | Avg Score |
+|---------|-------|-----------|-----------|
+| ... | ... | ... | ... |
+## Sample Failures
+{Show 5-10 representative failures with task description, what went wrong, and verdict reasoning}
+## Methodology
+- **Execution:** One task per session via agent-web-interface browser automation
+- **Evaluation:** LLM-as-judge with structured rubric (PASS=1.0, PARTIAL=0.5, FAIL=0.0)
+- **Scoring dimensions:** Navigation, Comprehension, Completeness, Accuracy, Confirmation
+- **Infrastructure blockers** (auth, CAPTCHA, site down) scored as FAIL but flagged separately
+```
+## Output
+- **File:** `web-bench-report.md` in working directory
+- Report should be self-contained and readable without the raw JSONL data
+## Guardrails
+- Use the JSONL file as the sole source of truth — do not fabricate statistics
+- Format all durations as human-readable (e.g., "2h 34m 12s" not "9252000ms")
+- Format token counts with thousands separators
+- Round percentages to one decimal place
+- If the results file contains fewer tasks than the expected total, note this prominently in the report header

package/dist/1.0.5/codex/plugin/skills/generate-report/agents/claude.yaml ADDED Viewed

@@ -0,0 +1,2 @@
+frontmatter:
+  user-invocable: false

package/dist/1.0.5/codex/plugin/skills/load-dataset/SKILL.md ADDED Viewed

@@ -0,0 +1,209 @@
+---
+name: load-dataset
+description: >
+  Download and prepare the Halluminate/WebBench dataset from HuggingFace for benchmarking.
+  Triggers: "load dataset", "download WebBench", "prepare benchmark data", "fetch tasks".
+  Downloads the CSV dataset via curl, converts to JSONL with Node.js, applies optional filters
+  (category, sample size, website allowlist/blocklist), and writes web-bench-tasks.jsonl to the
+  working directory. Zero Python dependencies — uses only curl and Node.js.
+  Does NOT execute tasks — use execute-task for that.
+allowed-tools: Bash Read Write Edit Glob
+---
+# Load WebBench Dataset
+Download the Halluminate/WebBench dataset from HuggingFace and prepare it for benchmark execution.
+## Dataset Source
+- **HuggingFace:** `Halluminate/WebBench`
+- **Source file:** `webbenchfinal.csv` (CSV format)
+- **Size:** ~2,454 tasks across 452 websites
+- **Fields per row:** `ID` (int), `Starting_URL` (string), `Category` (enum), `Task` (string)
+## Pre-check: Skip Download if Dataset Exists
+Before downloading, check if `web-bench-tasks.jsonl` already exists in the working directory:
+```bash
+# -s is true only when the file exists AND is non-empty, which is the skip condition stated below.
+if [ -s web-bench-tasks.jsonl ]; then
+  echo "Dataset already exists: $(wc -l < web-bench-tasks.jsonl) tasks"
+  head -1 web-bench-tasks.jsonl
+else
+  echo "No usable dataset found: download required"
+fi
+```
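+**If `web-bench-tasks.jsonl` exists and is non-empty, skip the download and conversion entirely.** Jump straight to [Applying Filters](#applying-filters) if filters need to be applied, or report the existing dataset to the tracker. Note that the filters rewrite `web-bench-tasks.jsonl` in place, so an existing file may already be a filtered subset; if the full dataset is needed, delete it and download again.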
+Only proceed with download if the file does not exist or is empty.
+## Download Method
+Download the CSV directly with `curl`, then convert to JSONL with Node.js. No Python dependencies required.
+### Step 1: Download the CSV
+```bash
+curl -fSL -o web-bench-dataset.csv \
+  "https://huggingface.co/datasets/Halluminate/WebBench/resolve/main/webbenchfinal.csv"
+```
+If the above URL fails (HuggingFace sometimes changes paths), try:
+```bash
+curl -fSL -o web-bench-dataset.csv \
+  "https://huggingface.co/datasets/Halluminate/WebBench/raw/main/webbenchfinal.csv"
+```
+### Step 2: Convert CSV to JSONL
+```bash
+node -e "
+// Convert web-bench-dataset.csv to web-bench-tasks.jsonl (one JSON object per line).
+// This script lives inside a double-quoted shell string: the one double quote it
+// needs is backslash-escaped, and it avoids backticks and dollar signs.
+const fs = require('fs');
+let csv = fs.readFileSync('web-bench-dataset.csv', 'utf-8');
+// Drop a byte-order mark, which would otherwise become part of the first header name.
+if (csv.charCodeAt(0) === 0xFEFF) csv = csv.slice(1);
+const QUOTE = '\"';
+// Parse the whole file in one pass instead of splitting on newlines first, so that
+// quoted fields may contain commas, line breaks and doubled quotes.
+const rows = [];
+let row = [];
+let field = '';
+let inQuotes = false;
+const endField = () => { row.push(field.trim()); field = ''; };
+const endRow = () => { endField(); rows.push(row); row = []; };
+for (let i = 0; i < csv.length; i++) {
+  const ch = csv[i];
+  if (inQuotes) {
+    if (ch !== QUOTE) {
+      field += ch;
+    } else if (csv[i + 1] === QUOTE) {
+      // A doubled quote inside a quoted field is one literal quote character.
+      field += QUOTE;
+      i++;
+    } else {
+      inQuotes = false;
+    }
+  } else if (ch === QUOTE) {
+    inQuotes = true;
+  } else if (ch === ',') {
+    endField();
+  } else if (ch === '\n') {
+    endRow();
+  } else if (ch !== '\r') {
+    // Carriage returns outside quotes are dropped so CRLF files parse like LF files.
+    field += ch;
+  }
+}
+// Flush the last record when the file does not end with a newline.
+if (field !== '' || row.length > 0) endRow();
+if (inQuotes) console.error('Warning: unterminated quoted field at end of file');
+// Find column indices and fail loudly if the header is not what we expect;
+// an index of -1 would otherwise silently write empty tasks.
+const header = rows.shift() || [];
+const columns = { id: 'ID', url: 'Starting_URL', category: 'Category', task: 'Task' };
+const idx = {};
+for (const [key, name] of Object.entries(columns)) idx[key] = header.indexOf(name);
+const missing = Object.keys(columns).filter(k => idx[k] < 0).map(k => columns[k]);
+if (missing.length > 0) {
+  console.error('Missing CSV column(s): ' + missing.join(', ') + '. Header was: ' + header.join(', '));
+  process.exit(1);
+}
+const lastIdx = Math.max(...Object.values(idx));
+const out = [];
+let skipped = 0;
+for (const fields of rows) {
+  if (fields.every(f => f === '')) continue; // blank line
+  const id = parseInt(fields[idx.id], 10);
+  if (fields.length <= lastIdx || Number.isNaN(id)) {
+    skipped++;
+    continue;
+  }
+  out.push(JSON.stringify({
+    id,
+    url: fields[idx.url],
+    category: fields[idx.category],
+    task: fields[idx.task]
+  }));
+}
+// Written synchronously in one call; an empty result produces an empty file.
+fs.writeFileSync('web-bench-tasks.jsonl', out.map(l => l + '\n').join(''));
+if (skipped > 0) console.error('Skipped ' + skipped + ' malformed row(s)');
+console.log('Wrote ' + out.length + ' tasks to web-bench-tasks.jsonl');
+"
+```
+### Step 3: Verify the output
+```bash
+wc -l web-bench-tasks.jsonl
+head -1 web-bench-tasks.jsonl
+node -e "
+// Sanity-check the converted dataset: print per-category counts and overall totals.
+const fs = require('fs');
+// Blank lines are ignored so an empty or blank-padded file reports 0 tasks
+// instead of crashing inside JSON.parse.
+const tasks = fs.readFileSync('web-bench-tasks.jsonl', 'utf-8').split('\n')
+  .filter(l => l.trim()).map(l => JSON.parse(l));
+const cats = {};
+const sites = new Set();
+for (const t of tasks) {
+  cats[t.category] = (cats[t.category] || 0) + 1;
+  // Distinct starting URLs are reported as the website count.
+  sites.add(t.url);
+}
+for (const [c, n] of Object.entries(cats).sort()) console.log('  ' + c + ': ' + n);
+console.log('Total: ' + tasks.length + ' tasks across ' + sites.size + ' websites');
+"
+```
+## Applying Filters
+After downloading, apply filters based on tracker configuration. All filters use Node.js.
+### Category Filter
+If the tracker specifies a category filter (e.g., `READ`, `CREATE`):
+```bash
+node -e "
+// Keep only the tasks of one category; rewrites web-bench-tasks.jsonl in place.
+const fs = require('fs');
+// With node -e, the first extra command-line argument arrives as process.argv[1].
+const category = process.argv[1];
+if (category === undefined || category === '') {
+  // Without this guard a missing argument would match nothing and empty the dataset.
+  console.error('Usage: pass the category to keep (e.g. READ) as the first argument');
+  process.exit(1);
+}
+// Blank lines are ignored so an empty or blank-padded file does not crash JSON.parse.
+const tasks = fs.readFileSync('web-bench-tasks.jsonl', 'utf-8').split('\n')
+  .filter(l => l.trim()).map(l => JSON.parse(l));
+const filtered = tasks.filter(t => t.category === category);
+// Each record carries its own newline, so an empty result is an empty file rather
+// than a lone blank line that later steps cannot parse.
+fs.writeFileSync('web-bench-tasks.jsonl', filtered.map(t => JSON.stringify(t) + '\n').join(''));
+console.log('Filtered to ' + filtered.length + ' ' + category + ' tasks');
+" "READ"
+```
+### Sample Size
+If the tracker specifies a sample size (e.g., `--sample 50`):
+```bash
+node -e "
+// Take a reproducible random sample of N tasks; rewrites web-bench-tasks.jsonl in place.
+const fs = require('fs');
+// With node -e, the first extra command-line argument arrives as process.argv[1].
+const n = parseInt(process.argv[1], 10);
+if (Number.isNaN(n) || n <= 0) {
+  // A missing or negative size would otherwise silently empty or truncate the dataset.
+  console.error('Usage: pass a positive integer sample size as the first argument');
+  process.exit(1);
+}
+// Blank lines are ignored so an empty or blank-padded file does not crash JSON.parse.
+const tasks = fs.readFileSync('web-bench-tasks.jsonl', 'utf-8').split('\n')
+  .filter(l => l.trim()).map(l => JSON.parse(l));
+// Deterministic shuffle (seed-based) for reproducibility: returns a shuffled copy
+// of arr and leaves arr untouched. The same seed always gives the same order.
+function seededShuffle(arr, seed) {
+  const a = [...arr];
+  let s = seed;
+  // Walk from the end, swapping each slot with a pseudo-randomly chosen earlier-or-equal slot.
+  for (let i = a.length - 1; i > 0; i--) {
+    // Advance the generator state, keeping only the low 32 bits.
+    s = (s * 1664525 + 1013904223) & 0xffffffff;
+    // The unsigned shift reinterprets the state as non-negative before the modulo.
+    const j = ((s >>> 0) % (i + 1));
+    [a[i], a[j]] = [a[j], a[i]];
+  }
+  return a;
+}
+const sample = seededShuffle(tasks, 42).slice(0, Math.min(n, tasks.length));
+// Each record carries its own newline, so an empty result is an empty file rather
+// than a lone blank line that later steps cannot parse.
+fs.writeFileSync('web-bench-tasks.jsonl', sample.map(t => JSON.stringify(t) + '\n').join(''));
+console.log('Sampled ' + sample.length + ' tasks');
+" "50"
+```
+### Website Blocklist
+```bash
+node -e "
+// Remove tasks for blocked websites; rewrites web-bench-tasks.jsonl in place.
+// The first argument is a comma-separated blocklist. An entry matches a task when
+// it equals the task URL exactly, or when it names a whole site (bare host such as
+// airbnb.com, or an origin such as https://airbnb.com) whose host equals the task
+// URL host, ignoring a leading www.
+const fs = require('fs');
+// Host of a URL-like string without a leading www., or null if it cannot be parsed.
+const hostOf = s => {
+  try {
+    const u = new URL(s.includes('://') ? s : 'https://' + s);
+    return u.hostname.replace(/^www\./, '');
+  } catch (err) {
+    return null;
+  }
+};
+// Host for entries that name a whole site; entries with a path or query string
+// return null and therefore only ever match by exact URL.
+const siteOf = entry => {
+  try {
+    const u = new URL(entry.includes('://') ? entry : 'https://' + entry);
+    return (u.pathname === '/' && u.search === '') ? u.hostname.replace(/^www\./, '') : null;
+  } catch (err) {
+    return null;
+  }
+};
+// With node -e, the first extra command-line argument arrives as process.argv[1].
+const entries = (process.argv[1] || '').split(',').map(e => e.trim()).filter(e => e !== '');
+const blockedUrls = new Set(entries);
+const blockedHosts = new Set(entries.map(siteOf).filter(h => h !== null));
+const isBlocked = t => blockedUrls.has(t.url) || blockedHosts.has(hostOf(String(t.url)));
+// Blank lines are ignored so an empty or blank-padded file does not crash JSON.parse.
+const tasks = fs.readFileSync('web-bench-tasks.jsonl', 'utf-8').split('\n')
+  .filter(l => l.trim()).map(l => JSON.parse(l));
+const filtered = tasks.filter(t => isBlocked(t) === false);
+// Each record carries its own newline, so an empty result is an empty file rather
+// than a lone blank line that later steps cannot parse.
+fs.writeFileSync('web-bench-tasks.jsonl', filtered.map(t => JSON.stringify(t) + '\n').join(''));
+console.log(filtered.length + ' tasks after blocklist filter');
+" ""
+```
+## Output
+- **File:** `web-bench-tasks.jsonl` in working directory
+- **Intermediate file:** `web-bench-dataset.csv` (can be deleted after conversion)
+- **Format:** One JSON object per line
+- **Schema:**
+  ```json
+  {"id": 42, "url": "https://acehardware.com", "category": "READ", "task": "Navigate to..."}
+  ```
+## Cleanup
+After successful conversion, remove the intermediate CSV:
+```bash
+rm -f web-bench-dataset.csv
+```
+Report the total count and category breakdown to the tracker.

package/dist/1.0.5/codex/plugin/skills/load-dataset/agents/claude.yaml ADDED Viewed

@@ -0,0 +1,2 @@
+frontmatter:
+  user-invocable: false

package/dist/1.0.5/codex/plugin/skills/run-benchmark/SKILL.md ADDED Viewed

@@ -0,0 +1,92 @@
+---
+name: run-benchmark
+description: >
+  Run the WebBench browser agent benchmark — main entry point and orchestrator.
+  Triggers: "run benchmark", "run WebBench", "start benchmark", "benchmark browser agent",
+  "web bench", "execute WebBench", "run web-bench".
+  Parses user configuration (category filter, sample size, resume), delegates to
+  load-dataset, execute-task, evaluate-task, and generate-report skills.
+  This is the user-invocable orchestrator that ties the full benchmark pipeline together.
+allowed-tools: Read Write Edit Glob Grep Bash Task
+---
+# Run WebBench Benchmark
+Main entry point for running the WebBench browser agent benchmark. This skill is used in interactive (single-session) mode. For multi-session workflow execution, see the system prompt.
+## Input
+Parse configuration from: `$ARGUMENTS`
+Supported flags:
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--category <CAT>` | Filter tasks by category (READ, CREATE, UPDATE, DELETE, FILE_MANIPULATION) | All categories |
+| `--sample <N>` | Random sample of N tasks (deterministic seed=42) | Full dataset |
+| `--resume` | Resume from existing web-bench-results.jsonl, skip completed task IDs | Fresh run |
+| `--report-only` | Skip execution, just generate report from existing results | Full run |
+Examples:
+- `run-benchmark --category READ --sample 50` — 50 random READ tasks
+- `run-benchmark --resume` — continue from where last run stopped
+- `run-benchmark --report-only` — just aggregate existing results
+## Interactive Execution Protocol
+When run interactively (not via the workflow loop), this skill executes the full pipeline in a single session:
+### 1. Setup
+1. Parse arguments
+2. Check for existing state (`web-bench-tasks.jsonl`, `web-bench-results.jsonl`)
+3. If `--resume` and results exist: determine completed task IDs, skip them
+4. If not resuming: load the `load-dataset` skill to download and prepare the dataset
+5. Report configuration and task count
+### 2. Execute Tasks
+For each task in `web-bench-tasks.jsonl` (skipping completed if resuming):
+1. Read the task line
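+2. Record start time in epoch milliseconds: `date +%s%3N` (GNU `date` only; BSD/macOS `date` does not support `%N`, so use `node -e 'console.log(Date.now())'` there)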
+3. Load `execute-task` methodology and have the browser-capable calling context perform the browser automation
+4. Load `evaluate-task` methodology and score the result
+5. Record end time with the same command as step 2 and compute `duration_ms` as end minus start
+6. Append result to `web-bench-results.jsonl`:
+   ```json
+   {"id": 42, "url": "...", "category": "READ", "task": "...", "score": 1.0, "verdict": "PASS", "reasoning": "...", "error": null, "duration_ms": 34200, "tokens_used": {"input": 12450, "output": 3200}, "timestamp": "2026-03-19T14:30:00Z"}
+   ```
+7. Print a progress line in the format shown under Progress Display, e.g. `[1/50] PASS  (1.0)  acehardware.com  READ  34.2s  15,650 tokens`
+### 3. Generate Report
+After all tasks are processed (or if `--report-only`):
+1. Load `generate-report` methodology
+2. Aggregate `web-bench-results.jsonl` into `web-bench-report.md`
+3. Print summary statistics to console
+## Token Tracking
+Token usage should be tracked per task. The agent should estimate tokens consumed during task execution by recording:
+- **Input tokens:** Approximate from the size of prompts, page snapshots, and tool responses received during execution
+- **Output tokens:** Approximate from the size of responses and tool calls generated
+If exact token counts are available from the session metadata, prefer those over estimates.
+## Progress Display
+After each task, print a status line:
+```
+[1/50] PASS  (1.0)  acehardware.com          READ    34.2s   15,650 tokens
+[2/50] FAIL  (0.0)  airbnb.com               CREATE  12.1s    8,200 tokens  [auth_required]
+[3/50] PARTIAL(0.5) amazon.com               READ    45.8s   22,100 tokens
+```
+## Guardrails
+- **Always append, never overwrite** results. The JSONL file is append-only.
+- **Respect the dataset.** Do not modify task descriptions or skip tasks without recording a FAIL.

package/dist/1.0.5/codex/plugin/skills/run-benchmark/agents/claude.yaml ADDED Viewed

@@ -0,0 +1,3 @@
+frontmatter:
+  argument-hint: "[--category READ|CREATE|UPDATE|DELETE|FILE_MANIPULATION] [--sample N] [--resume] [--report-only]"
+  user-invocable: true

package/dist/1.0.5/release.json ADDED Viewed

@@ -0,0 +1,18 @@
+{
+  "schemaVersion": 1,
+  "pluginRef": "web-bench@athena-workflow-marketplace",
+  "pluginName": "web-bench",
+  "marketplaceName": "athena-workflow-marketplace",
+  "version": "1.0.5",
+  "artifacts": {
+    "claude": {
+      "type": "directory",
+      "path": "./claude/plugin"
+    },
+    "codex": {
+      "type": "marketplace",
+      "marketplacePath": "./.agents/plugins/marketplace.json",
+      "pluginPath": "./codex/plugin"
+    }
+  }
+}

package/package.json ADDED Viewed

@@ -0,0 +1,13 @@
+{
+  "name": "@athenaflow/plugin-web-bench",
+  "version": "1.0.5",
+  "description": "WebBench benchmark runner \u2014 executes real-world browser tasks from the Halluminate/WebBench dataset, scores via LLM-as-judge, and produces evaluation reports",
+  "license": "MIT",
+  "publishConfig": {
+    "access": "public"
+  },
+  "scripts": {
+    "build:artifacts": "node ../../scripts/build-plugin-artifacts.mjs .",
+    "prepack": "npm run build:artifacts"
+  }
+}