npm - devlyn-cli - Versions diffs - 1.14.0 → 2.0.0 - Mend

devlyn-cli 1.14.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json ADDED Viewed

@@ -0,0 +1,162 @@
+{
+  "fixture_id": "F3-backend-contract-risk",
+  "generated_at": "2026-04-29T09:57:53Z",
+  "generated_from": {
+    "expected_path": "benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json",
+    "expected_sha256": "c0925ee948179fbc1c76836d98fba0c14c7eba56f462f2922903951923cb22e6",
+    "metadata_path": "benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json",
+    "metadata_sha256": "c54530db26dbb04ce50b698fed2608206eae6f9a5dc2f666f127695e15d3fa30",
+    "oracle_script_shas": {
+      "scope-tier-a": "baaf21ed4a67f35d2a8af825e72869ef9737b5dfe08d65dd1a11c26fafe297ae",
+      "scope-tier-b": "9349d00a5c7456a4df9142923334e7004407d53f2443f2e210945bb771971e25",
+      "test-fidelity": "401184da51ae500cecfc75a6c5819b0d28acb63a397f788fb628c2913562f903"
+    }
+  },
+  "required_invariants": [
+    {
+      "authority": "expected.json/forbidden_patterns",
+      "id": "forbidden_pattern__eslint_disable_without_explicit_per_issue_justification__server_index_js",
+      "operational_check": "variant arm output MUST NOT contain regex pattern '/\\\\*\\\\s*eslint-disable' in files ['server/index.js']; rationale: eslint-disable without explicit per-issue justification",
+      "severity": "disqualifier",
+      "source_field": "expected.json/forbidden_patterns/1",
+      "source_ref": "expected.json:forbidden_patterns[1]"
+    },
+    {
+      "authority": "expected.json/forbidden_patterns",
+      "id": "forbidden_pattern__silent_catch_returning_fallback__server_index_js",
+      "operational_check": "variant arm output MUST NOT contain regex pattern \"catch\\\\s*\\\\([^)]*\\\\)\\\\s*\\\\{[^}]*return\\\\s+(null|undefined|'')\" in files ['server/index.js']; rationale: silent catch returning fallback",
+      "severity": "disqualifier",
+      "source_field": "expected.json/forbidden_patterns/0",
+      "source_ref": "expected.json:forbidden_patterns[0]"
+    },
+    {
+      "authority": "expected.json/max_deps_added",
+      "id": "max_deps_added__0",
+      "operational_check": "variant arm MUST NOT add more than 0 new npm dependencies (count delta of package.json:dependencies + devDependencies)",
+      "severity": "hard",
+      "source_field": "expected.json/max_deps_added",
+      "source_ref": "expected.json:max_deps_added"
+    },
+    {
+      "authority": "expected.json/required_files",
+      "id": "required_file__server_index_js",
+      "operational_check": "variant arm output MUST contain file 'server/index.js' (created or preserved)",
+      "severity": "hard",
+      "source_field": "expected.json/required_files",
+      "source_ref": "expected.json:required_files[server/index.js]"
+    },
+    {
+      "authority": "expected.json/required_files",
+      "id": "required_file__tests_server_test_js",
+      "operational_check": "variant arm output MUST contain file 'tests/server.test.js' (created or preserved)",
+      "severity": "hard",
+      "source_field": "expected.json/required_files",
+      "source_ref": "expected.json:required_files[tests/server.test.js]"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "scope-tier-a:lockfile-deletion",
+      "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
+      "severity": "hard",
+      "source_field": "oracle/scope-tier-a/scope-tier-a:lockfile-deletion",
+      "source_ref": "oracle-scope-tier-a.py"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "scope-tier-a:tier-a-violation",
+      "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
+      "severity": "hard",
+      "source_field": "oracle/scope-tier-a/scope-tier-a:tier-a-violation",
+      "source_ref": "oracle-scope-tier-a.py"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "scope-tier-b:scope-unmatched",
+      "operational_check": "every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
+      "severity": "warn",
+      "source_field": "oracle/scope-tier-b/scope-tier-b:scope-unmatched",
+      "source_ref": "oracle-scope-tier-b.py"
+    },
+    {
+      "authority": "expected.json/spec_output_files",
+      "id": "spec_output_file__server_index_js",
+      "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'server/index.js' is one Tier C seed",
+      "severity": "warn",
+      "source_field": "expected.json/spec_output_files",
+      "source_ref": "expected.json:spec_output_files[server/index.js]"
+    },
+    {
+      "authority": "expected.json/spec_output_files",
+      "id": "spec_output_file__tests_server_test_js",
+      "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'tests/server.test.js' is one Tier C seed",
+      "severity": "warn",
+      "source_field": "expected.json/spec_output_files",
+      "source_ref": "expected.json:spec_output_files[tests/server.test.js]"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "test-fidelity:assertion-regression",
+      "operational_check": "effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
+      "severity": "warn",
+      "source_field": "oracle/test-fidelity/test-fidelity:assertion-regression",
+      "source_ref": "oracle-test-fidelity.py"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "test-fidelity:mock-swap",
+      "operational_check": "post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
+      "severity": "flag",
+      "source_field": "oracle/test-fidelity/test-fidelity:mock-swap",
+      "source_ref": "oracle-test-fidelity.py"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "test-fidelity:test-file-deleted",
+      "operational_check": "no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
+      "severity": "flag",
+      "source_field": "oracle/test-fidelity/test-fidelity:test-file-deleted",
+      "source_ref": "oracle-test-fidelity.py"
+    },
+    {
+      "authority": "metadata/oracle-allowlist",
+      "id": "test-fidelity:test-file-renamed",
+      "operational_check": "rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
+      "severity": "warn",
+      "source_field": "oracle/test-fidelity/test-fidelity:test-file-renamed",
+      "source_ref": "oracle-test-fidelity.py"
+    },
+    {
+      "authority": "expected.json/verification_commands",
+      "id": "verification__6001efe2",
+      "operational_check": "running `node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?per_page=abc`, r => { console.log(r.statusCode); s.close(); }); });'` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['400']; stdout MUST NOT contain any of []",
+      "severity": "hard",
+      "source_field": "expected.json/verification_commands/3",
+      "source_ref": "expected.json:verification_commands[3]"
+    },
+    {
+      "authority": "expected.json/verification_commands",
+      "id": "verification__6517d995",
+      "operational_check": "running `node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(JSON.stringify({ total: d.total, page: d.page, per_page: d.per_page, items_len: d.items.length })); s.close(); }); }); });'` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['\"total\":2', '\"page\":1']; stdout MUST NOT contain any of []",
+      "severity": "hard",
+      "source_field": "expected.json/verification_commands/1",
+      "source_ref": "expected.json:verification_commands[1]"
+    },
+    {
+      "authority": "expected.json/verification_commands",
+      "id": "verification__73df5e81",
+      "operational_check": "running `node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?page=2&per_page=1`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(d.items[0] && d.items[0].name); s.close(); }); }); });'` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['beta']; stdout MUST NOT contain any of []",
+      "severity": "hard",
+      "source_field": "expected.json/verification_commands/2",
+      "source_ref": "expected.json:verification_commands[2]"
+    },
+    {
+      "authority": "expected.json/verification_commands",
+      "id": "verification__7c5f3637",
+      "operational_check": "running `node --test tests/server.test.js` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of []; stdout MUST NOT contain any of ['not ok ']",
+      "severity": "hard",
+      "source_field": "expected.json/verification_commands/0",
+      "source_ref": "expected.json:verification_commands[0]"
+    }
+  ],
+  "schema_version": "1"
+}

package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json ADDED Viewed

@@ -0,0 +1,65 @@
+{
+  "verification_commands": [
+    {
+      "cmd": "node --test tests/server.test.js",
+      "exit_code": 0,
+      "stdout_contains": [],
+      "stdout_not_contains": [
+        "not ok "
+      ]
+    },
+    {
+      "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(JSON.stringify({ total: d.total, page: d.page, per_page: d.per_page, items_len: d.items.length })); s.close(); }); }); });'",
+      "exit_code": 0,
+      "stdout_contains": [
+        "\"total\":2",
+        "\"page\":1"
+      ],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?page=2&per_page=1`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(d.items[0] && d.items[0].name); s.close(); }); }); });'",
+      "exit_code": 0,
+      "stdout_contains": [
+        "beta"
+      ],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?per_page=abc`, r => { console.log(r.statusCode); s.close(); }); });'",
+      "exit_code": 0,
+      "stdout_contains": [
+        "400"
+      ],
+      "stdout_not_contains": []
+    }
+  ],
+  "forbidden_patterns": [
+    {
+      "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|'')",
+      "description": "silent catch returning fallback",
+      "files": [
+        "server/index.js"
+      ],
+      "severity": "disqualifier"
+    },
+    {
+      "pattern": "/\\*\\s*eslint-disable",
+      "description": "eslint-disable without explicit per-issue justification",
+      "files": [
+        "server/index.js"
+      ],
+      "severity": "disqualifier"
+    }
+  ],
+  "required_files": [
+    "server/index.js",
+    "tests/server.test.js"
+  ],
+  "forbidden_files": [],
+  "max_deps_added": 0,
+  "spec_output_files": [
+    "server/index.js",
+    "tests/server.test.js"
+  ]
+}

package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json ADDED Viewed

@@ -0,0 +1,19 @@
+{
+  "id": "F3-backend-contract-risk",
+  "category": "high-risk",
+  "difficulty": "high",
+  "timeout_seconds": 1500,
+  "required_tools": ["node"],
+  "browser": false,
+  "deps_change_expected": false,
+  "intent": "Change the GET /items response shape from { items } to a paginated { items, total, page, per_page } while keeping the existing 1-based id semantics and updating tests. A lazy implementation will leave tests broken or drop the items array — the spec requires both.",
+  "pair_plan_oracle_categories": [
+    "scope-tier-a:lockfile-deletion",
+    "scope-tier-a:tier-a-violation",
+    "scope-tier-b:scope-unmatched",
+    "test-fidelity:assertion-regression",
+    "test-fidelity:mock-swap",
+    "test-fidelity:test-file-deleted",
+    "test-fidelity:test-file-renamed"
+  ]
+}

package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# F3 setup — no changes to base test-repo. Task modifies existing server/index.js.
+set -e
+exit 0

package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md ADDED Viewed

@@ -0,0 +1,56 @@
+---
+id: "F3-backend-contract-risk"
+title: "Paginate GET /items response"
+status: planned
+complexity: high
+depends-on: []
+---
+# F3 Paginate `GET /items`
+## Context
+`server/index.js` currently returns `{ items: [...] }` for `GET /items` with
+no pagination metadata. As the dataset grows, clients need a `total` and
+paging information. The task: wrap the existing response in a pagination
+envelope, accept `?page` and `?per_page` query parameters, and update tests
+so existing assertions continue to pass alongside new paging assertions.
+## Requirements
+- [ ] `GET /items` (no query) returns `{ items, total, page, per_page }` where:
+  - `items` is the full list (baseline repo has 2 items).
+  - `total` is the full item count.
+  - `page` is `1`.
+  - `per_page` is the full item count when no pagination was requested.
+- [ ] `GET /items?page=1&per_page=1` returns the first item wrapped in the envelope with `items.length === 1`, `total === 2`, `page === 1`, `per_page === 1`.
+- [ ] `GET /items?page=2&per_page=1` returns the second item similarly.
+- [ ] `GET /items?page=99&per_page=1` returns `items: []`, `total === 2`, `page === 99`, `per_page === 1` (out-of-range page is allowed — bare empty array, never a 404).
+- [ ] `GET /items/:id` behavior unchanged (the per-item route does NOT get paginated).
+- [ ] `tests/server.test.js` is updated so every existing assertion still holds (semantically) AND the new paging behavior is covered by at least two new tests.
+- [ ] `GET /health` continues to return `{ status: 'ok' }` unchanged.
+## Constraints
+- **No new npm dependencies.** Use only Express + built-ins already in the repo.
+- **No silent catches.** Invalid `page` or `per_page` (non-numeric, zero, negative) must respond 400 with `{ error: 'invalid_query', field }`.
+- **No breaking change to `/items/:id`.** The per-item route must keep its current contract (the fixture explicitly does NOT paginate single-item lookups).
+- **Backward-compat note**: clients that previously read `response.items` MUST still get the array at the same key inside the new envelope.
+- **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
+## Out of Scope
+- Caching, rate limiting, authentication.
+- Converting `items` to a database-backed list.
+- Touching `bin/cli.js`, `web/`, or `tests/cli.test.js`.
+- Adding a new route.
+## Verification
+- Server start: `node server/index.js` listens on port 3000 (exit via SIGINT).
+- `curl -s http://127.0.0.1:3000/items | jq '.total'` returns `2`.
+- `curl -s 'http://127.0.0.1:3000/items?per_page=1&page=2' | jq '.items[0].name'` returns `"beta"`.
+- `curl -s 'http://127.0.0.1:3000/items?per_page=abc' -o /dev/null -w '%{http_code}'` returns `400`.
+- `node --test tests/server.test.js` passes; must include ≥ 2 new paging tests.
+- `git diff --stat` shows only `server/index.js` and `tests/server.test.js` touched.

package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt ADDED Viewed

@@ -0,0 +1,9 @@
+The `GET /items` endpoint in `server/index.js` currently returns `{ items: [...] }`. Paginate it: the response should be `{ items, total, page, per_page }`. Accept `?page` and `?per_page` query params. When no params are given, return everything on page 1 with `per_page` equal to the full count.
+Keep `GET /items/:id` unchanged (no pagination on single-item lookup). `GET /health` stays as-is.
+Invalid `page` or `per_page` (non-numeric, zero, negative) → respond 400 with `{ error: 'invalid_query', field: '<name>' }`. Out-of-range page (beyond the last item) returns an empty `items` array, NOT a 404.
+Update `tests/server.test.js` so existing behavior is still covered AND you add at least two new tests for the paging behavior.
+No new npm dependencies. Only touch `server/index.js` and `tests/server.test.js`.

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md ADDED Viewed

@@ -0,0 +1,40 @@
+# F4 — Notes
+## Purpose
+Exercises the browser-validate phase of the pipeline (Phase 1.5). Catches
+web-UI-only regressions that unit tests can't see and that server/integration
+tests won't surface.
+## Failure modes detected
+- **Italic via Unicode.** Arms may reach for Unicode italic characters
+  (`𝑖𝑡𝑎𝑙𝑖𝑐`) instead of CSS. Spec explicitly forbids this because it breaks
+  screen readers.
+- **CDN link.** Linking to Google Fonts or external CSS is out of scope for
+  the bench and breaks offline / air-gapped runs — disqualifier.
+- **Breaking Greet.** Careless refactors rewire the Greet button's handler
+  by mistake. Pipeline's Phase 1.5 browser-validate + dedicated spec test
+  catches it.
+- **Accessibility drift.** Missing/incorrect `aria-label` on button.
+## Pipeline exercise
+- Phase 1.5 BROWSER VALIDATE is the primary gate (web file changes trigger it).
+- Phase 3 CRITIC design checks the DOM structure and event-handler wiring.
+## Caveats
+- Playwright requires browser binaries installed locally. If the runner
+  machine lacks them, the browser test commands will fail. The suite
+  runner can still score the diff + grep checks, but the Playwright
+  command will show exit ≠ 0.
+- The bench runner sets `BROWSER_METADATA` so future versions can wire
+  stricter browser-required gating; today the fixture only checks file
+  presence in verification.
+## Rotation trigger
+When both arms consistently produce correct output AND include accessible
+markup without pipeline intervention, rotate to a harder UI task (e.g., a
+form with validation states).

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json ADDED Viewed

@@ -0,0 +1,57 @@
+{
+  "verification_commands": [
+    {
+      "cmd": "grep -q 'id=\"whisper\"' web/index.html && echo OK",
+      "exit_code": 0,
+      "stdout_contains": [
+        "OK"
+      ],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "grep -q 'hello from bench-test-repo' web/index.html && echo OK",
+      "exit_code": 0,
+      "stdout_contains": [
+        "OK"
+      ],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "grep -qE '(italic|font-style)' web/index.html && echo OK",
+      "exit_code": 0,
+      "stdout_contains": [
+        "OK"
+      ],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "bash -c 'shopt -s nullglob; files=(tests/e2e/*.spec.*); [ ${#files[@]} -gt 0 ] && echo FOUND || { echo MISSING; exit 1; }'",
+      "exit_code": 0,
+      "stdout_contains": [
+        "FOUND"
+      ],
+      "stdout_not_contains": [
+        "MISSING"
+      ]
+    }
+  ],
+  "forbidden_patterns": [
+    {
+      "pattern": "(cdnjs|unpkg|jsdelivr|fonts\\.googleapis)",
+      "description": "external CDN reference \u2014 out-of-scope / offline brittleness",
+      "files": [
+        "web/index.html"
+      ],
+      "severity": "disqualifier"
+    }
+  ],
+  "required_files": [
+    "web/index.html"
+  ],
+  "forbidden_files": [],
+  "max_deps_added": 0,
+  "spec_output_files": [
+    "web/index.html",
+    "tests/e2e/**"
+  ]
+}

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json ADDED Viewed

@@ -0,0 +1,10 @@
+{
+  "id": "F4-web-browser-design",
+  "category": "stress",
+  "difficulty": "medium",
+  "timeout_seconds": 1800,
+  "required_tools": ["node", "npx"],
+  "browser": true,
+  "deps_change_expected": false,
+  "intent": "Add a second button labelled 'Whisper' to web/index.html that, when clicked, replaces the #output text with 'hello from bench-test-repo' rendered in lowercase italic. The existing 'Greet' button continues to work unchanged. Tests exercise both buttons via the static page (no server)."
+}

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh ADDED Viewed

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# F4 setup — no base changes needed. The task extends web/index.html and
+# creates a Playwright test file.
+set -e
+mkdir -p tests/e2e
+exit 0

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md ADDED Viewed

@@ -0,0 +1,49 @@
+---
+id: "F4-web-browser-design"
+title: "Add a Whisper button with italic lowercase output"
+status: planned
+complexity: medium
+depends-on: []
+---
+# F4 Add Whisper button
+## Context
+`web/index.html` currently has one button ("Greet") that fills `#output`
+with `Hello from bench-test-repo`. Add a second button beside it labelled
+`Whisper` that fills `#output` with `hello from bench-test-repo` — lowercase
+and italicized — using only the page's own CSS/JS.
+## Requirements
+- [ ] A new `<button id="whisper">Whisper</button>` renders beside the existing `#greet` button.
+- [ ] Clicking `#whisper` sets `#output` textContent to `hello from bench-test-repo` (lowercase, no exclamation).
+- [ ] `#output`'s rendering of the whisper text is italic. Use CSS (inline, a class, or toggling a class). Do not rely on Unicode italic characters.
+- [ ] Clicking `#greet` continues to set `#output` to `Hello from bench-test-repo` as before (no italic styling).
+- [ ] A text node in `#output` is readable by Playwright via `data-testid="output"` (already present in the baseline).
+- [ ] Minimal diff: only `web/index.html` and any new files directly needed for the test harness (e.g., `tests/e2e/whisper.spec.js` per the existing Playwright config).
+## Constraints
+- **No new npm dependencies.** Playwright is already scripted via `npx serve` and the repo's `playwright.config.js`.
+- **No external resources.** Don't link to CDN fonts, external CSS, or remote images.
+- **No inline JS frameworks.** Stick to the vanilla pattern already in `index.html`.
+- **Accessibility.** Both buttons must have accessible names equal to their visible labels; `#whisper` adds `aria-label="whisper"` only if its visible text differs (it doesn't, so leave it off).
+- **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
+## Out of Scope
+- Animations / transitions.
+- Theme toggle / dark mode.
+- Any change to `bin/cli.js`, `server/`, or CLI tests.
+- Moving styles into a separate .css file.
+## Verification
+- Page loads: `npx serve -l 5173 web &` + `curl -s http://127.0.0.1:5173/` returns HTML containing `<button id="whisper"`.
+- Clicking whisper produces `hello from bench-test-repo` in `#output` — verifiable via Playwright:
+  `npx playwright test tests/e2e/` passes the whisper spec.
+- Clicking greet still produces `Hello from bench-test-repo` (test stays green).
+- `git diff --stat` shows only `web/index.html` and the added Playwright test file.

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt ADDED Viewed

@@ -0,0 +1,9 @@
+Add a second button next to the existing "Greet" button in `web/index.html`, labelled "Whisper". When clicked, it should set `#output` to `hello from bench-test-repo` (lowercase, no exclamation mark) rendered in italic.
+The existing "Greet" button must continue to set `#output` to `Hello from bench-test-repo` as before — no italic, no change.
+Keep everything self-contained in the page: no CDN fonts, no new npm dependencies, no external resources. Use the same vanilla JS pattern that's already there.
+Write a Playwright test under `tests/e2e/` that exercises both buttons. The repo already has `playwright.config.js` and serves `web/` via `npx serve -l 5173`.
+Only touch `web/index.html` and the new Playwright test file.

package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md ADDED Viewed

@@ -0,0 +1,38 @@
+# F5 — Notes
+## Purpose
+The suite's FIX LOOP stress test. The tests are intentionally constructed so
+the obvious first-pass implementation (simple `input.split(' ').filter(w => w === word).length`) passes the basic count case but fails on:
+- Case insensitivity (`Cat` should match `cat`).
+- Whole-word boundaries (`cat` should NOT match inside `category`).
+- Empty-stdin edge (returning `undefined` instead of `0`).
+The variant arm's pipeline is expected to proceed as follows:
+1. BUILD produces a first implementation.
+2. BUILD GATE runs `node --test`; some tests fail.
+3. EVAL emits findings with `criterion_ref` pointing at specific failing cases.
+4. FIX LOOP round 1 targets those findings and converges.
+The bare arm, without a forcing mechanism, often ships the first
+implementation and calls it done. Verification catches that.
+## Failure modes detected
+- **Partial implementation.** Naive token split without regex word boundaries.
+- **Case handling.** Missing `.toLowerCase()` on both sides of the comparison.
+- **Async stdin.** Using `process.stdin.on('data')` without handling `end` properly → program hangs on test invocation.
+- **Forgotten empty case.** `stdin.read()` returning `null` → `null.length` or `undefined` output.
+## Pipeline exercise
+- **Phase 2 EVAL** is the star: it must identify each failing test case with file:line evidence.
+- **Phase 2.5 FIX LOOP** runs at least once. A fixture passing with 0 fix rounds is a smoke signal that the test-trap design is too lenient; inspect.
+- **Phase 1.4 BUILD GATE** uses `node --test` which exits non-zero on any failure, forcing route to 2.5.
+## Rotation trigger
+When fix rounds consistently = 0 across two shipped versions, the trap is too
+easy. Stiffen by adding a fourth test edge (e.g., Unicode folding, hyphenated
+words).

package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json ADDED Viewed

@@ -0,0 +1,65 @@
+{
+  "verification_commands": [
+    {
+      "cmd": "node --test tests/count.test.js",
+      "exit_code": 0,
+      "stdout_contains": [],
+      "stdout_not_contains": [
+        "not ok "
+      ]
+    },
+    {
+      "cmd": "echo 'cat hat CAT category' | node bin/cli.js count cat",
+      "exit_code": 0,
+      "stdout_contains": [
+        "2"
+      ],
+      "stdout_not_contains": [
+        "3",
+        "4"
+      ]
+    },
+    {
+      "cmd": "echo '' | node bin/cli.js count cat",
+      "exit_code": 0,
+      "stdout_contains": [
+        "0"
+      ],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "node bin/cli.js count",
+      "exit_code": 1,
+      "stdout_contains": [],
+      "stdout_not_contains": []
+    },
+    {
+      "cmd": "node bin/cli.js hello",
+      "exit_code": 0,
+      "stdout_contains": [
+        "Hello, world!"
+      ],
+      "stdout_not_contains": []
+    }
+  ],
+  "forbidden_patterns": [
+    {
+      "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
+      "description": "empty catch block \u2014 silent error suppression",
+      "files": [
+        "bin/cli.js"
+      ],
+      "severity": "disqualifier"
+    }
+  ],
+  "required_files": [
+    "bin/cli.js",
+    "tests/count.test.js"
+  ],
+  "forbidden_files": [],
+  "max_deps_added": 0,
+  "spec_output_files": [
+    "bin/cli.js",
+    "tests/**/count.test.js"
+  ]
+}

package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json ADDED Viewed

@@ -0,0 +1,10 @@
+{
+  "id": "F5-fix-loop-red-green",
+  "category": "stress",
+  "difficulty": "medium",
+  "timeout_seconds": 1500,
+  "required_tools": ["node"],
+  "browser": false,
+  "deps_change_expected": false,
+  "intent": "Make the pre-installed failing tests for a new `count` subcommand pass. The tests require case-insensitive whole-word counting of stdin input against a provided word argument. A naive first implementation satisfies basic counts but misses case-insensitivity or whole-word boundaries — EVAL catches it and FIX LOOP drives the correct second pass."
+}