npm - pi-chrome - Versions diffs - 0.15.25 → 0.15.26 - Mend

pi-chrome 0.15.25 → 0.15.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

package/CHANGELOG.md +5 -0
package/README.md +13 -4
package/docs/COMPARISON.md +2 -2
package/docs/EXAMPLES.md +3 -1
package/docs/FAQ.md +6 -2
package/extensions/chrome-profile-bridge/browser-extension/manifest.json +1 -1
package/package.json +2 -1
package/test-suite/README.md +193 -0
package/test-suite/_lib.js +130 -0
package/test-suite/_style.css +31 -0
package/test-suite/baseline-dashboard.png +0 -0
package/test-suite/browsergym-action-space.json +44 -0
package/test-suite/challenges/01-is-trusted-click.html +28 -0
package/test-suite/challenges/02-is-trusted-keyboard.html +50 -0
package/test-suite/challenges/03-webdriver-flag.html +51 -0
package/test-suite/challenges/04-mouse-entropy.html +34 -0
package/test-suite/challenges/05-event-timing.html +34 -0
package/test-suite/challenges/06-click-coordinates.html +29 -0
package/test-suite/challenges/07-pointer-properties.html +29 -0
package/test-suite/challenges/08-keyboard-cadence.html +37 -0
package/test-suite/challenges/09-composition-input.html +45 -0
package/test-suite/challenges/10-user-activation.html +40 -0
package/test-suite/challenges/11-honeypot.html +36 -0
package/test-suite/challenges/12-fingerprint.html +59 -0
package/test-suite/challenges/13-focus-order.html +31 -0
package/test-suite/challenges/14-wheel-scroll.html +28 -0
package/test-suite/challenges/15-drag-drop-datatransfer.html +73 -0
package/test-suite/challenges/16-contenteditable-selection.html +54 -0
package/test-suite/challenges/17-paste-clipboard.html +48 -0
package/test-suite/challenges/18-native-select.html +56 -0
package/test-suite/challenges/19-hover-dwell.html +50 -0
package/test-suite/challenges/20-react-value-tracker.html +78 -0
package/test-suite/challenges/21-keyboard-modifiers.html +65 -0
package/test-suite/challenges/22-touch-events.html +66 -0
package/test-suite/challenges/23-stack-trace-fingerprint.html +76 -0
package/test-suite/challenges/24-viewport-edge-clicks.html +51 -0
package/test-suite/challenges/25-pointer-continuity.html +62 -0
package/test-suite/challenges/26-mousemove-rate.html +57 -0
package/test-suite/challenges/27-scroll-momentum.html +66 -0
package/test-suite/challenges/28-intersection-visibility.html +72 -0
package/test-suite/challenges/29-shadow-dom-controls.html +44 -0
package/test-suite/challenges/30-iframe-targeting.html +44 -0
package/test-suite/challenges/31-file-upload.html +30 -0
package/test-suite/challenges/32-keyboard-tab-navigation.html +61 -0
package/test-suite/challenges/33-network-console-capture.html +33 -0
package/test-suite/challenges/34-dialog-handling.html +28 -0
package/test-suite/challenges/35-target-blank-popup.html +35 -0
package/test-suite/challenges/36-modal-focus-trap.html +49 -0
package/test-suite/challenges/37-autocomplete-combobox.html +42 -0
package/test-suite/challenges/38-spa-route-change.html +40 -0
package/test-suite/challenges/39-strict-csp-fallback.html +14 -0
package/test-suite/challenges/39-strict-csp-fallback.js +27 -0
package/test-suite/challenges/40-dynamic-wait-readiness.html +41 -0
package/test-suite/challenges/41-tab-lifecycle.html +44 -0
package/test-suite/fixtures/pi-chrome-upload.txt +1 -0
package/test-suite/fixtures/sites/mini-shop/cheats.js +12 -0
package/test-suite/fixtures/sites/mini-shop/grader.js +29 -0
package/test-suite/fixtures/sites/mini-shop/index.html +55 -0
package/test-suite/fixtures/sites/mini-shop/tasks.json +9 -0
package/test-suite/index.html +193 -0
package/test-suite/manifest.json +1630 -0
package/test-suite/manifest.schema.json +73 -0
package/test-suite/notes/browsergym-compat.md +70 -0
package/test-suite/notes/bypass-ideas.md +79 -0
package/test-suite/notes/profiles.md +22 -0
package/test-suite/notes/runner-spec.md +29 -0
package/test-suite/notes/scoring.md +44 -0
package/test-suite/scenarios/choredesk/cheats.js +49 -0
package/test-suite/scenarios/choredesk/index.html +239 -0
package/test-suite/serve.sh +6 -0
package/test-suite/task-manifest.json +416 -0
package/test-suite/task-manifest.schema.json +59 -0

package/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,11 @@
 All notable user-facing changes to `pi-chrome`.
+## 0.15.26 — 2026-05-16
+- **Documentation accuracy.** README, FAQ, examples, comparison, and test-suite docs now describe the 41-challenge suite, gate buckets, strict-CSP fallback, and current human-vs-extension limitations.
+- **Published benchmark assets.** The npm package now includes `test-suite/` so the documented benchmark pages are available from installed packages, not only from the repository checkout.
 ## 0.15.25 — 2026-05-16
 - **Reload after older installs.** `/reload` now recovers from stale singleton state left by pi-chrome 0.15.19 and earlier instead of skipping the freshly loaded extension.

package/README.md CHANGED Viewed

@@ -149,6 +149,10 @@ Agents can verify page state immediately instead of blindly retrying.
 Each tool is documented inline in Pi — agents see the parameters and gotchas (Chrome input, CSP limits, file upload behavior) without trial-and-error.
+### Known limits vs. human Chrome use
+pi-chrome is strongest on web-page workflows exposed through DOM, screenshots, tabs, and Chrome input. It is not a full human/OS substitute. Current limitations include native Chrome/OS surfaces (print/save dialogs, permission bubbles, password-manager prompts), cross-origin iframe DOM access, rich multitouch/pinch/stylus gestures, visual CAPTCHA/bot challenges, hardware-backed auth (passkeys/security keys/biometrics), and arbitrary OS app interaction. For strict-CSP pages, use screenshot + coordinate input when `chrome_snapshot`/`chrome_evaluate` are blocked.
 ---
 ## Click & input behavior
@@ -213,9 +217,13 @@ Multiple Pi sessions (planner / worker / audit) can all drive the same Chrome at
 ## Built-in benchmark suite
-[`test-suite/`](./test-suite) is a benchmark for **any** browser-control agent (not just pi-chrome). It includes **38 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks**.
+[`test-suite/`](./test-suite) is a benchmark for **any** browser-control agent (not just pi-chrome). It includes **41 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks**.
+Scoring tracks expected outcomes per challenge rather than raw PASS count, so tools are judged against their declared browser-control capability. Unit challenges are split into gate buckets:
-Scoring tracks expected outcomes per challenge rather than raw PASS count, so tools are judged against their declared browser-control capability.
+- `core` — expected release blockers for normal trusted-mode browser control.
+- `conditional` — capability/environment gated (clipboard, touch, dialogs, native UI, etc.).
+- `quality` — adversarial humanization/fingerprint signals; report trends, don't block general release by default.
 Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events` and a manifest entry with expected results per mode.
@@ -224,7 +232,7 @@ cd test-suite && python3 -m http.server 8765
 # open http://127.0.0.1:8765/ in the Chrome window pi-chrome controls
 ```
-Categories: `real-input`, `pointer-humanization`, `keyboard`, `activation-gates`, `scroll`, `drag-drop`, `clipboard`, `native-controls`, `frameworks`, `editing`, `dom-complexity`, `frames`, `files`, `observability`, `fingerprint`, `agent-safety`.
+Categories include: `trusted-input`, `pointer-humanization`, `keyboard`, `focus-keyboard`, `activation-gates`, `scroll`, `drag-drop`, `clipboard`, `native-controls`, `frameworks`, `editing`, `dom-complexity`, `frames`, `files`, `observability`, `csp`, `lazy-loading`, `dialogs`, `popups`, `spa-routing`, `fingerprint`, and `agent-safety`.
 If you build a competing tool, please open a PR with your scores. We benchmark in public.
@@ -247,7 +255,8 @@ There is no network exposure in the default configuration; the bridge binds to l
 `pi-chrome` is actively shipped. Things on the near roadmap:
 - More observability tools (DOM mutation streams, performance traces)
-- First-class iframe + Shadow-DOM uid stability across snapshots
+- First-class cross-origin iframe + Shadow-DOM uid stability across snapshots
+- Native-browser surface coverage where extension APIs allow it (downloads, permissions, context menus)
 - Web Push & service worker introspection
 - Recorder mode that emits agent prompts from your own clicks

package/docs/COMPARISON.md CHANGED Viewed

@@ -134,7 +134,7 @@ If your threat model excludes extensions with broad permissions, neither approac
 ## Public benchmarks worth knowing (for axis 2 / axis 3 comparison)
-Pi-chrome itself ships a benchmark suite ([`../test-suite/`](../test-suite)) of **38 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks** covering real input, pointer humanization, keyboard fidelity, drag/drop, Shadow DOM, file uploads, network observability, fingerprint leaks, and agent-safety honeypots. Scoring tracks expected outcomes per challenge instead of raw PASS count. That's **driver-level** grading.
+Pi-chrome itself ships a benchmark suite ([`../test-suite/`](../test-suite)) of **41 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks** covering trusted input, pointer humanization, keyboard fidelity, drag/drop, Shadow DOM, iframes, file uploads, strict-CSP screenshot fallback, dynamic waits, tab lifecycle, network observability, fingerprint leaks, and agent-safety honeypots. Scoring tracks expected outcomes per challenge instead of raw PASS count, with `core`, `conditional`, and `quality` gate buckets. That's **driver-level** grading.
 For **agent-level** comparison (axis 2), the public benchmarks worth citing:
@@ -156,7 +156,7 @@ Cite live leaderboards rather than hard-coded numbers; agent scores shift monthl
 ## Reproducing pi-chrome's driver-level claims
-Run [`../test-suite/`](../test-suite) against any browser-control tool. Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events`, so any tool (Playwright, Puppeteer, Selenium, Stagehand, pi-chrome) can grade itself deterministically.
+Run [`../test-suite/`](../test-suite) against any browser-control tool. Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events`, so any tool (Playwright, Puppeteer, Selenium, Stagehand, pi-chrome) can grade itself deterministically. Headline release scoring should use the `core` gate; `conditional` depends on declared environment capabilities, and `quality` tracks adversarial/humanization regressions.
 ```bash
 cd test-suite && python3 -m http.server 8765

package/docs/EXAMPLES.md CHANGED Viewed

@@ -159,6 +159,8 @@ Interactive tools use Chrome's real input layer by default: clicks, typing, fill
 - guarded buttons
 - audio/video controls
 - fullscreen and other user-activation checks
-- pages with strict CSP or user-activation checks
+- pages where DOM injection/evaluate is limited, if the agent can use screenshots + coordinates
+Strict CSP note: `chrome_snapshot`/`chrome_evaluate` may be blocked on pages that disallow `unsafe-eval`; `chrome_screenshot`, tab/navigation tools, and real input still work.
 Chrome may show its debugger banner while pi-chrome is attached.

package/docs/FAQ.md CHANGED Viewed

@@ -14,9 +14,9 @@ By default no — extensions need explicit "Allow in incognito" permission. Togg
 ## Will sites detect that I'm automating?
-Interactive controls use Chrome's real input layer via CDP: pointer paths are humanized, key cadence has variance, and normal user-activation gates are satisfied. Some detectors check for the `chrome.debugger` API attached and Chrome will show the "Chrome is being debugged" banner.
+Interactive controls use Chrome's real input layer via CDP, so normal user-activation gates are satisfied and input is closer to real browser use than DOM-dispatched events. pi-chrome also shapes pointer/keyboard/scroll behavior, but this is not a guarantee of undetectability. Some detectors check whether the `chrome.debugger` API is attached, and Chrome will show the "Chrome is being debugged" banner.
-The [`test-suite/`](../test-suite) grades browser-control behavior against common detection signals.
+The [`test-suite/`](../test-suite) grades browser-control behavior against common detection signals. Its `quality` bucket is adversarial signal, not a blanket promise that every site will treat automation as human.
 ## Why do I see a banner saying "Pi Chrome Connector started debugging this browser"?
@@ -49,6 +49,10 @@ pi-chrome ships as an unpacked extension so the source and broad browser permiss
 The Pi-facing tools are thin wrappers around an HTTP bridge at `127.0.0.1:17318`. You could call it directly from any process, but the API is internal and may change. If you need a stable scripting interface, file an issue and we'll consider stabilizing.
+## What can humans do that pi-chrome cannot?
+pi-chrome controls web pages through Chrome extension APIs, page inspection, screenshots, and browser input. It is not full OS-level human control. Known gaps include native Chrome/OS dialogs (print/save-as, some permission bubbles, password-manager prompts), arbitrary OS app interaction, visual CAPTCHA challenges, hardware-backed auth (passkeys/security keys/biometrics), rich multi-touch/pinch/stylus gestures, and DOM inspection inside cross-origin iframes. Some of these can still be handled with screenshot + coordinate input or user assistance, but they are not first-class deterministic workflows.
 ## Does `chrome_evaluate` work on strict-CSP pages?
 Not always. `chrome_evaluate` and `chrome_snapshot` run in the page's MAIN world through the Function constructor, so pages whose CSP blocks `'unsafe-eval'` can reject them. `chrome_screenshot`, `chrome_navigate`, tab tools, and real Chrome input still work because they use extension/browser APIs rather than page JavaScript.

package/extensions/chrome-profile-bridge/browser-extension/manifest.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "manifest_version": 3,
   "name": "Pi Chrome Connector",
-  "version": "0.15.25",
+  "version": "0.15.26",
   "description": "Lets Pi control tabs in Chrome via a local connector at 127.0.0.1.",
   "permissions": [
     "tabs",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "pi-chrome",
-	"version": "0.15.25",
+	"version": "0.15.26",
 	"scripts": {
 		"version": "node scripts/sync-manifest-version.js",
 		"prepublishOnly": "node scripts/sync-manifest-version.js"
@@ -52,6 +52,7 @@
 	"files": [
 		"extensions",
 		"docs",
+		"test-suite",
 		"README.md",
 		"CHANGELOG.md",
 		"CONTRIBUTING.md",

package/test-suite/README.md ADDED Viewed

@@ -0,0 +1,193 @@
+# pi-chrome browser-control benchmark
+Static benchmark pages for evaluating tools that let agents control Chrome. The suite has two layers:
+1. **Unit challenges** (`manifest.json`) — MiniWoB-style capability probes for
+   forms, scroll containers, contenteditable, files, frames, Shadow DOM,
+   network/console inspection, `isTrusted`, user activation, pointer paths, key
+   cadence, native controls, drag/drop, touch, paste, and scroll momentum.
+2. **Long-horizon hermetic tasks** (`task-manifest.json`) — WebArena /
+   BrowserGym-inspired multi-step tasks with fresh run IDs and deterministic
+   programmatic graders.
+## Run
+```bash
+cd test-suite
+python3 -m http.server 8765
+# open http://127.0.0.1:8765/ in the Chrome window pi-chrome controls
+```
+Each challenge page exposes:
+- `window.__challenge` — id
+- `window.__verdict` — `"PENDING" | "PASS" | "FAIL" | "SKIP" | "WARN"`
+- `window.__reason` — array of reasons
+- `window.__events` — raw event log for forensics
+`manifest.json` is the source of truth for unit-challenge metadata: category,
+gate bucket, goal, expected result per mode, prerequisites, flake risk, manual
+baseline status, and canonical tool recipe. `manifest.schema.json` documents the
+manifest shape. Recipes express tool intent; runners may need to adapt
+descriptive selectors (e.g. shadow/iframe notation) and dynamic tab ids, and to
+expand path placeholders like `$PWD`.
+`task-manifest.json` is the source of truth for long-horizon tasks: BrowserGym-style
+`taskId`, seed, viewport, goal object, difficulty tier, max steps, declared
+action subsets, reset/setup URL, validate hook, optional cheat recipe, and
+programmatic grader expression. `task-manifest.schema.json` documents this shape.
+`browsergym-action-space.json` records BrowserGym-compatible action subsets.
+## Modes / expected outcomes
+The same page can have different expected results depending on tool capability:
+- `synthetic` — DOM-dispatched events / framework-aware setters. Fast and quiet.
+- `trusted` — browser-trusted input, usually via `chrome.debugger`/CDP. Can show
+  Chrome's debugging banner.
+- `manual` — human baseline in same browser/profile.
+Expected values in `manifest.json`:
+- `PASS` / `FAIL` — deterministic target for that mode.
+- `CONDITIONAL` — depends on browser policy, OS, device capability, permissions,
+  or an unreleased tool primitive. Inspect `prerequisites`, `notes`, and
+  `flakeRisk`.
+Manual baselines are tracked separately with `manualBaseline`. `unverified`
+means the manual expectation is a target, not a recorded contract.
+## Gate buckets
+Each unit challenge has a `gate` field:
+- `core` — required release blocker for normal trusted-mode pi-chrome shipping.
+- `conditional` — blocks only when declared prerequisites/capabilities are present
+  (clipboard, touch, dialogs, native UI, etc.).
+- `quality` — adversarial humanization/fingerprint signal. Track regressions, but
+  do not block general ship without an explicit product decision.
+## Recommended unit-challenge agent flow
+1. Navigate to dashboard:
+   `http://127.0.0.1:8765/`.
+2. Pick mode (`synthetic`, `trusted`, or `manual`) and clear local verdicts.
+3. For each manifest row:
+   - `chrome_navigate` to `http://127.0.0.1:8765/<file>`.
+   - `chrome_snapshot` before acting; prefer snapshot `uid` over raw selector.
+   - Execute the listed `recipe`, adapting descriptive frame/shadow selectors to
+     whatever selectors/uids the tool exposes.
+   - Read:
+     ```js
+     JSON.stringify({
+       v: window.__verdict,
+       r: window.__reason,
+       e: window.__events?.slice(-20)
+     })
+     ```
+4. Return to dashboard and compare actual verdicts with expected values.
+5. Copy JSON report from dashboard for PRs or regression notes.
+## Recommended long-horizon task flow
+1. Load `task-manifest.json`.
+2. Replace `$RUN_ID` in `startUrl` with a fresh value.
+3. Navigate to the start URL and read the visible task instruction.
+4. Solve using normal browser tools only; avoid direct state mutation unless the
+   benchmark mode explicitly allows evaluate-based actions.
+5. Click **Grade now** or evaluate the task grader expression:
+   ```js
+   JSON.stringify({ v: window.__taskVerdict, r: window.__taskReason })
+   ```
+6. Record action count, observations used, tools used, verdict, and reason.
+## Design principles copied from browser-agent benchmarks
+- Prefer hermetic sites and deterministic graders over live sites and LLM judges.
+- Report action API and observation format; these strongly affect scores.
+- Use difficulty tiers: L1 atomic, L2 compositional, L3 cross-page/context-rich.
+- Include tedious cross-page memory and exact-value transfer tasks; short unit
+  probes hide these failures.
+- Keep synthetic-event-gated tests because extension bridges face failures that
+  CDP/Playwright-style benchmarks usually do not measure.
+## Challenge categories
+- `trusted-input` — browser-trusted click/key events.
+- `pointer-humanization` — paths, coordinates, movement continuity/rate.
+- `keyboard` / `focus-keyboard` — typing fidelity, modifiers, Tab flows.
+- `activation-gates` — clipboard/fullscreen/user activation.
+- `scroll` / `scroll-visibility` — wheel events, momentum, IntersectionObserver.
+- `drag-drop` — HTML5 drag/drop + `DataTransfer`.
+- `clipboard` — OS/browser paste path.
+- `native-controls` — controls that should use browser UI/keyboard semantics.
+- `frameworks` / `editing` — React-style value tracking, contenteditable.
+- `dom-complexity` / `frames` — Shadow DOM and iframe targeting.
+- `files` — file attachment to `<input type=file>`.
+- `observability` — console/network capture tools.
+- `csp` — strict Content Security Policy fallback where eval/snapshot may fail.
+- `lazy-loading` — dynamic DOM readiness and wait behavior.
+- `fingerprint` — environment and stack fingerprint probes.
+- `agent-safety` — hidden honeypots and safe target selection.
+## Current challenge inventory
+The dashboard renders this from `manifest.json`. In brief:
+1. trusted click
+2. trusted keyboard
+3. webdriver/runtime flags
+4. mouse entropy before click
+5. click timing
+6. click coordinate variation
+7. pointer event properties
+8. keyboard cadence
+9. beforeinput/input order
+10. user activation gates
+11. honeypot safety
+12. fingerprint consistency
+13. focus order
+14. wheel scroll
+15. drag/drop `DataTransfer`
+16. contenteditable selection
+17. paste clipboard
+18. native select
+19. hover dwell
+20. React value tracker
+21. keyboard modifiers
+22. touch events
+23. stack trace fingerprint
+24. viewport click coordinates
+25. pointer continuity
+26. mousemove rate
+27. scroll momentum
+28. intersection visibility
+29. Shadow DOM controls
+30. iframe targeting
+31. file upload
+32. keyboard Tab navigation
+33. network/console capture
+34. dialog handling
+35. target blank popup
+36. modal focus trap
+37. autocomplete combobox
+38. SPA route change
+39. strict CSP screenshot/coordinate fallback
+40. dynamic wait/readiness
+41. explicit tab lifecycle
+## Design notes
+- A failure is useful only when compared to expected mode. Example: synthetic
+  `isTrusted` failing is expected and validates that the test detects quiet DOM
+  events.
+- Some tests are capability-gated. Example: touch tests should be `SKIP`/manual
+  conditional on non-touch hardware.
+- Fingerprint tests should warn before blocking. Real Chrome profiles can use
+  software WebGL in VMs, remote desktops, or policy-constrained environments.
+- `notes/bypass-ideas.md` is historical guidance for older synthetic-only
+  versions. Prefer `manifest.json` for current expected outcomes.
+- `notes/browsergym-compat.md` defines the reset/step/validate/observation/BID
+  contract for external BrowserGym-style agents.
+- `notes/runner-spec.md`, `notes/scoring.md`, and `notes/profiles.md` define
+  runner output, scoring, retry policy, and environment metadata.

package/test-suite/_lib.js ADDED Viewed

@@ -0,0 +1,130 @@
+// Tiny shared harness for challenge pages.
+// Each page calls Challenge.init({id, instructions}) then Challenge.pass()/fail()
+// based on its own listeners.
+(function () {
+  const events = [];
+  const state = {
+    id: null,
+    verdict: "PENDING",
+    reason: [],
+    events,
+    details: [],
+    thresholds: {},
+  };
+  function render() {
+    const el = document.getElementById("__verdict");
+    if (!el) return;
+    el.textContent = state.verdict;
+    el.dataset.verdict = state.verdict;
+    el.style.background =
+      state.verdict === "PASS" ? "#1f7a1f" :
+      state.verdict === "FAIL" ? "#a11" :
+      state.verdict === "SKIP" ? "#76520b" :
+      state.verdict === "WARN" ? "#6b5d00" : "#444";
+    el.style.color = "#fff";
+    const r = document.getElementById("__reason");
+    if (r) r.textContent = state.reason.join("\n");
+  }
+  function log(name, detail) {
+    events.push({ t: performance.now(), name, ...detail });
+    if (events.length > 500) events.shift();
+  }
+  const Challenge = {
+    init({ id, instructions, thresholds = {} }) {
+      state.id = id;
+      state.thresholds = parseThresholds(thresholds);
+      document.title = `[${state.verdict}] ${id}`;
+      const root = document.body;
+      const bar = document.createElement("div");
+      bar.style.cssText =
+        "position:sticky;top:0;background:#111;color:#eee;padding:8px 12px;font:13px monospace;border-bottom:1px solid #333;z-index:9999";
+      bar.innerHTML = `
+        <b>${id}</b>
+        <span id="__verdict" style="margin-left:8px;padding:2px 8px;border-radius:4px;background:#444">PENDING</span>
+        <span style="margin-left:12px;opacity:.7">${instructions}</span>
+        <pre id="__reason" style="white-space:pre-wrap;margin:6px 0 0;color:#bbb;font:12px monospace"></pre>
+      `;
+      root.insertBefore(bar, root.firstChild);
+      window.__challenge = id;
+      window.__verdict = state.verdict;
+      window.__reason = state.reason;
+      window.__events = state.events;
+      render();
+    },
+    pass(...reasons) {
+      if (state.verdict === "FAIL") return; // sticky
+      state.verdict = "PASS";
+      state.reason.push(...reasons.map((r) => "✓ " + r));
+      window.__verdict = state.verdict;
+      document.title = `[PASS] ${state.id}`;
+      persist(); render();
+    },
+    fail(...reasons) {
+      state.verdict = "FAIL";
+      state.reason.push(...reasons.map((r) => "✗ " + r));
+      window.__verdict = state.verdict;
+      document.title = `[FAIL] ${state.id}`;
+      persist(); render();
+    },
+    skip(...reasons) {
+      if (state.verdict === "FAIL" || state.verdict === "PASS") return;
+      state.verdict = "SKIP";
+      state.reason.push(...reasons.map((r) => "↷ " + r));
+      window.__verdict = state.verdict;
+      document.title = `[SKIP] ${state.id}`;
+      persist(); render();
+    },
+    warn(...reasons) {
+      if (state.verdict === "FAIL" || state.verdict === "PASS") return;
+      state.verdict = "WARN";
+      state.reason.push(...reasons.map((r) => "! " + r));
+      window.__verdict = state.verdict;
+      document.title = `[WARN] ${state.id}`;
+      persist(); render();
+    },
+    partial({ name, pass, reason, data }) {
+      state.details.push({ name, pass: !!pass, reason: reason || "", data });
+      log("partial", { name, pass: !!pass, reason, data });
+      persist(); render();
+      return !!pass;
+    },
+    finishPartials() {
+      const failed = state.details.filter(d => !d.pass);
+      if (failed.length) Challenge.fail(...failed.map(d => `${d.name}: ${d.reason || "failed"}`));
+      else Challenge.pass(...state.details.map(d => `${d.name}: ok`));
+    },
+    getThreshold(name, fallback) {
+      return Object.prototype.hasOwnProperty.call(state.thresholds, name) ? state.thresholds[name] : fallback;
+    },
+    log,
+    state,
+  };
+  function persist() {
+    try {
+      localStorage.setItem(
+        "pi-chrome-suite:" + state.id,
+        JSON.stringify({ id: state.id, verdict: state.verdict, reason: state.reason, details: state.details, thresholds: state.thresholds, events: state.events.slice(-50), ts: Date.now() })
+      );
+    } catch {}
+  }
+  function parseThresholds(defaults) {
+    const out = { ...defaults };
+    try {
+      const qs = new URLSearchParams(location.search);
+      for (const [k, v] of qs) {
+        if (!k.startsWith("threshold.")) continue;
+        const key = k.slice("threshold.".length);
+        const num = Number(v);
+        out[key] = Number.isFinite(num) ? num : v;
+      }
+    } catch {}
+    return out;
+  }
+  window.Challenge = Challenge;
+})();

package/test-suite/_style.css ADDED Viewed

@@ -0,0 +1,31 @@
+body { font: 14px system-ui, sans-serif; margin: 0; background: #1a1a1a; color: #eee; } /* dark page base shared by all suite pages that link this sheet */
+main { padding: 24px; max-width: 720px; }
+code { background:#222;padding:1px 4px;border-radius:3px }
+a { color: #6cf; }
+button, input, select { font: 14px system-ui, sans-serif; } /* form controls do not inherit the body font by default */
+button { padding: 7px 11px; border-radius: 6px; border: 1px solid #555; background: #262626; color: #eee; cursor:pointer; }
+button:hover { background:#333; }
+input, select { padding: 6px 8px; border-radius: 6px; border: 1px solid #555; background:#111; color:#eee; }
+table { border-collapse: collapse; width: 100%; font: 13px ui-monospace, SFMono-Regular, Menlo, monospace; }
+th, td { border-bottom: 1px solid #333; padding: 8px; vertical-align: top; text-align:left; }
+th { position: sticky; top: 0; background:#151515; z-index:1; } /* header row stays pinned while the table scrolls */
+tr.ok { background: rgba(31,122,31,.08); } /* row tints: translucent versions of the .pass / .fail / .skip pill colors below */
+tr.mismatch { background: rgba(170,17,17,.12); }
+tr.conditional { background: rgba(118,82,11,.12); }
+.panel { background:#202020; border:1px solid #333; border-radius:10px; padding:12px; margin:16px 0; }
+.controls { display:flex; flex-wrap:wrap; gap:10px; align-items:center; }
+.hint, .notes, .reason { color:#aaa; font-size:12px; margin-top:4px; }
+.summary { display:flex; flex-wrap:wrap; gap:8px; margin: 14px 0; }
+.pill { display:inline-block; border-radius:999px; padding:2px 8px; font-size:12px; font-weight:700; background:#444; color:#fff; } /* neutral badge; the modifier classes below only swap the background */
+.pass { background:#1f7a1f; }
+.fail { background:#a11; }
+.skip, .conditional { background:#76520b; }
+.warn { background:#6b5d00; }
+.pending { background:#444; }
+.expected { background:#2b4b6b; }
+.risk { color:#ffd479; }
+.code, pre { background:#111; color:#eee; padding:12px; border-radius:6px; overflow:auto; }
+.copy-fallback { display:none; width:100%; min-height:180px; margin-top:10px; font:12px ui-monospace, SFMono-Regular, Menlo, monospace; background:#111; color:#eee; border:1px solid #555; border-radius:6px; padding:10px; } /* hidden by default; this sheet never shows it, so presumably a script toggles display */
+.copy-fallback[data-copied="true"] { border-color:#1f7a1f; } /* border reflects the data-copied attribute: green for "true", red for "false" */
+.copy-fallback[data-copied="false"] { border-color:#a11; }
+details summary { cursor:pointer; color:#9cf; }

package/test-suite/baseline-dashboard.png ADDED Viewed

Binary file

package/test-suite/browsergym-action-space.json ADDED Viewed

@@ -0,0 +1,44 @@
+{
+  "subsets": {
+    "chat": ["send_msg_to_user"],
+    "infeas": ["report_infeasible"],
+    "bid": ["click", "dblclick", "hover", "fill", "clear", "press", "focus", "select_option", "scroll", "drag_and_drop", "upload_file"],
+    "coord": ["mouse_move", "mouse_up", "mouse_down", "mouse_click", "mouse_dblclick", "mouse_drag_and_drop", "mouse_upload_file", "scroll_at", "keyboard_down", "keyboard_up", "keyboard_press", "keyboard_type", "keyboard_insert_text"],
+    "nav": ["go_back", "go_forward", "goto"],
+    "tab": ["tab_close", "tab_focus", "new_tab"]
+  },
+  "signatures": {
+    "fill": "fill(bid, value, enable_autocomplete_menu=false)",
+    "click": "click(bid, button='left', modifiers=[])",
+    "dblclick": "dblclick(bid, button='left', modifiers=[])",
+    "hover": "hover(bid)",
+    "press": "press(bid, key_comb)",
+    "focus": "focus(bid)",
+    "clear": "clear(bid)",
+    "select_option": "select_option(bid, options)",
+    "scroll": "scroll(delta_x, delta_y)",
+    "drag_and_drop": "drag_and_drop(from_bid, to_bid)",
+    "upload_file": "upload_file(bid, file)",
+    "mouse_move": "mouse_move(x, y)",
+    "mouse_click": "mouse_click(x, y, button='left')",
+    "mouse_dblclick": "mouse_dblclick(x, y, button='left')",
+    "mouse_up": "mouse_up(x, y, button='left')",
+    "mouse_down": "mouse_down(x, y, button='left')",
+    "mouse_drag_and_drop": "mouse_drag_and_drop(from_x, from_y, to_x, to_y)",
+    "scroll_at": "scroll_at(x, y, dx, dy)",
+    "keyboard_press": "keyboard_press(key)",
+    "keyboard_down": "keyboard_down(key)",
+    "keyboard_up": "keyboard_up(key)",
+    "keyboard_type": "keyboard_type(text)",
+    "keyboard_insert_text": "keyboard_insert_text(text)",
+    "goto": "goto(url)",
+    "go_back": "go_back()",
+    "go_forward": "go_forward()",
+    "tab_close": "tab_close()",
+    "tab_focus": "tab_focus(index)",
+    "new_tab": "new_tab()",
+    "send_msg_to_user": "send_msg_to_user(text)",
+    "report_infeasible": "report_infeasible(reason)",
+    "noop": "noop(wait_ms=1000)"
+  }
+}

package/test-suite/challenges/01-is-trusted-click.html ADDED Viewed

@@ -0,0 +1,28 @@
+<!doctype html>
+<meta charset="utf-8">
+<title>01 isTrusted click</title>
+<link rel="stylesheet" href="../_style.css">
+<script src="../_lib.js"></script>
+<body>
+<main>
+  <p>Goal: click the green button. Page only accepts <code>event.isTrusted === true</code>.</p>
+  <button id="go" style="padding:20px 32px;font-size:18px;background:#1f7a1f;color:#fff;border:0;border-radius:6px">Click me</button>
+</main>
+<script>
+// Challenge 01: the verdict is decided purely by event.isTrusted on the button.
+Challenge.init({
+  id: "is-trusted-click",
+  instructions: "click the green button; isTrusted must be true",
+});
+const btn = document.getElementById("go");
+// Grade the click itself: trusted => PASS, untrusted => FAIL.
+const gradeClick = (ev) => {
+  Challenge.log("click", { isTrusted: ev.isTrusted, x: ev.clientX, y: ev.clientY });
+  if (!ev.isTrusted) {
+    Challenge.fail("click.isTrusted === false (synthetic dispatchEvent)");
+    return;
+  }
+  Challenge.pass("click.isTrusted === true");
+};
+// Extra probe on pointerdown so an untrusted press is flagged as well; it can
+// only fail, never pass.
+const probePointerDown = (ev) => {
+  const { isTrusted, pressure, pointerType } = ev;
+  Challenge.log("pointerdown", { isTrusted, pressure, pointerType });
+  if (isTrusted) return;
+  Challenge.fail("pointerdown.isTrusted === false");
+};
+btn.addEventListener("click", gradeClick, { capture: true });
+btn.addEventListener("pointerdown", probePointerDown);
+</script>
+</body>

package/test-suite/challenges/02-is-trusted-keyboard.html ADDED Viewed

@@ -0,0 +1,50 @@
+<!doctype html>
+<meta charset="utf-8">
+<title>02 isTrusted keyboard</title>
+<link rel="stylesheet" href="../_style.css">
+<script src="../_lib.js"></script>
+<body>
+<main>
+  <p>Goal: type <code>hello</code> into the box. Each character must arrive via a trusted keydown/keypress/input.</p>
+  <input id="t" placeholder="type hello" style="font-size:18px;padding:8px 12px;width:240px">
+</main>
+<script>
+// Challenge 02: counts keydown/input events by isTrusted and grades on keyup
+// once the field reads exactly "hello".
+Challenge.init({
+  id: "is-trusted-keyboard",
+  instructions: "type 'hello'; every keydown.isTrusted must be true",
+});
+const t = document.getElementById("t");
+// Running tallies, split by event.isTrusted.
+let trustedKeys = 0;
+let untrustedKeys = 0;
+let inputs = 0;
+let untrustedInputs = 0;
+t.addEventListener("keydown", (ev) => {
+  Challenge.log("keydown", { key: ev.key, isTrusted: ev.isTrusted });
+  if (!ev.isTrusted) {
+    untrustedKeys += 1;
+    return;
+  }
+  trustedKeys += 1;
+});
+t.addEventListener("input", (ev) => {
+  Challenge.log("input", { isTrusted: ev.isTrusted, inputType: ev.inputType, data: ev.data });
+  inputs += 1;
+  if (!ev.isTrusted) untrustedInputs += 1;
+});
+// Grading runs on every keyup, but only once the target text is present.
+t.addEventListener("keyup", () => {
+  if (t.value !== "hello") return;
+  if (untrustedKeys > 0 || untrustedInputs > 0) {
+    Challenge.fail(`untrusted keydowns=${untrustedKeys}, untrusted inputs=${untrustedInputs}`);
+    return;
+  }
+  if (trustedKeys < 5) {
+    Challenge.fail(`only ${trustedKeys} trusted keydowns (need >=5)`);
+    return;
+  }
+  Challenge.pass(`${trustedKeys} trusted keydowns, ${inputs} trusted inputs`);
+});
+// Programmatic value-set detection: shadow the prototype's `value` accessor on
+// this one element so script writes are logged, and fail if a write arrives
+// before any keydown has been seen. Reads and writes still go to the native accessor.
+const nativeValue = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, "value");
+Object.defineProperty(t, "value", {
+  get() { return nativeValue.get.call(this); },
+  set(v) {
+    Challenge.log("value-set", { v });
+    const noKeysSeen = untrustedKeys === 0 && trustedKeys === 0;
+    if (noKeysSeen) Challenge.fail("value set programmatically without any keydown");
+    return nativeValue.set.call(this, v);
+  },
+});
+</script>
+</body>