pi-chrome 0.15.25 → 0.15.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/CHANGELOG.md +5 -0
  2. package/README.md +13 -4
  3. package/docs/COMPARISON.md +2 -2
  4. package/docs/EXAMPLES.md +3 -1
  5. package/docs/FAQ.md +6 -2
  6. package/extensions/chrome-profile-bridge/browser-extension/manifest.json +1 -1
  7. package/package.json +2 -1
  8. package/test-suite/README.md +193 -0
  9. package/test-suite/_lib.js +130 -0
  10. package/test-suite/_style.css +31 -0
  11. package/test-suite/baseline-dashboard.png +0 -0
  12. package/test-suite/browsergym-action-space.json +44 -0
  13. package/test-suite/challenges/01-is-trusted-click.html +28 -0
  14. package/test-suite/challenges/02-is-trusted-keyboard.html +50 -0
  15. package/test-suite/challenges/03-webdriver-flag.html +51 -0
  16. package/test-suite/challenges/04-mouse-entropy.html +34 -0
  17. package/test-suite/challenges/05-event-timing.html +34 -0
  18. package/test-suite/challenges/06-click-coordinates.html +29 -0
  19. package/test-suite/challenges/07-pointer-properties.html +29 -0
  20. package/test-suite/challenges/08-keyboard-cadence.html +37 -0
  21. package/test-suite/challenges/09-composition-input.html +45 -0
  22. package/test-suite/challenges/10-user-activation.html +40 -0
  23. package/test-suite/challenges/11-honeypot.html +36 -0
  24. package/test-suite/challenges/12-fingerprint.html +59 -0
  25. package/test-suite/challenges/13-focus-order.html +31 -0
  26. package/test-suite/challenges/14-wheel-scroll.html +28 -0
  27. package/test-suite/challenges/15-drag-drop-datatransfer.html +73 -0
  28. package/test-suite/challenges/16-contenteditable-selection.html +54 -0
  29. package/test-suite/challenges/17-paste-clipboard.html +48 -0
  30. package/test-suite/challenges/18-native-select.html +56 -0
  31. package/test-suite/challenges/19-hover-dwell.html +50 -0
  32. package/test-suite/challenges/20-react-value-tracker.html +78 -0
  33. package/test-suite/challenges/21-keyboard-modifiers.html +65 -0
  34. package/test-suite/challenges/22-touch-events.html +66 -0
  35. package/test-suite/challenges/23-stack-trace-fingerprint.html +76 -0
  36. package/test-suite/challenges/24-viewport-edge-clicks.html +51 -0
  37. package/test-suite/challenges/25-pointer-continuity.html +62 -0
  38. package/test-suite/challenges/26-mousemove-rate.html +57 -0
  39. package/test-suite/challenges/27-scroll-momentum.html +66 -0
  40. package/test-suite/challenges/28-intersection-visibility.html +72 -0
  41. package/test-suite/challenges/29-shadow-dom-controls.html +44 -0
  42. package/test-suite/challenges/30-iframe-targeting.html +44 -0
  43. package/test-suite/challenges/31-file-upload.html +30 -0
  44. package/test-suite/challenges/32-keyboard-tab-navigation.html +61 -0
  45. package/test-suite/challenges/33-network-console-capture.html +33 -0
  46. package/test-suite/challenges/34-dialog-handling.html +28 -0
  47. package/test-suite/challenges/35-target-blank-popup.html +35 -0
  48. package/test-suite/challenges/36-modal-focus-trap.html +49 -0
  49. package/test-suite/challenges/37-autocomplete-combobox.html +42 -0
  50. package/test-suite/challenges/38-spa-route-change.html +40 -0
  51. package/test-suite/challenges/39-strict-csp-fallback.html +14 -0
  52. package/test-suite/challenges/39-strict-csp-fallback.js +27 -0
  53. package/test-suite/challenges/40-dynamic-wait-readiness.html +41 -0
  54. package/test-suite/challenges/41-tab-lifecycle.html +44 -0
  55. package/test-suite/fixtures/pi-chrome-upload.txt +1 -0
  56. package/test-suite/fixtures/sites/mini-shop/cheats.js +12 -0
  57. package/test-suite/fixtures/sites/mini-shop/grader.js +29 -0
  58. package/test-suite/fixtures/sites/mini-shop/index.html +55 -0
  59. package/test-suite/fixtures/sites/mini-shop/tasks.json +9 -0
  60. package/test-suite/index.html +193 -0
  61. package/test-suite/manifest.json +1630 -0
  62. package/test-suite/manifest.schema.json +73 -0
  63. package/test-suite/notes/browsergym-compat.md +70 -0
  64. package/test-suite/notes/bypass-ideas.md +79 -0
  65. package/test-suite/notes/profiles.md +22 -0
  66. package/test-suite/notes/runner-spec.md +29 -0
  67. package/test-suite/notes/scoring.md +44 -0
  68. package/test-suite/scenarios/choredesk/cheats.js +49 -0
  69. package/test-suite/scenarios/choredesk/index.html +239 -0
  70. package/test-suite/serve.sh +6 -0
  71. package/test-suite/task-manifest.json +416 -0
  72. package/test-suite/task-manifest.schema.json +59 -0
package/CHANGELOG.md CHANGED
@@ -2,6 +2,11 @@
2
2
 
3
3
  All notable user-facing changes to `pi-chrome`.
4
4
 
5
+ ## 0.15.26 — 2026-05-16
6
+
7
+ - **Documentation accuracy.** README, FAQ, examples, comparison, and test-suite docs now describe the 41-challenge suite, gate buckets, strict-CSP fallback, and current human-vs-extension limitations.
8
+ - **Published benchmark assets.** The npm package now includes `test-suite/` so the documented benchmark pages are available from installed packages, not only from the repository checkout.
9
+
5
10
  ## 0.15.25 — 2026-05-16
6
11
 
7
12
  - **Reload after older installs.** `/reload` now recovers from stale singleton state left by pi-chrome 0.15.19 and earlier instead of skipping the freshly loaded extension.
package/README.md CHANGED
@@ -149,6 +149,10 @@ Agents can verify page state immediately instead of blindly retrying.
149
149
 
150
150
  Each tool is documented inline in Pi — agents see the parameters and gotchas (Chrome input, CSP limits, file upload behavior) without trial-and-error.
151
151
 
152
+ ### Known limits vs. human Chrome use
153
+
154
+ pi-chrome is strongest on web-page workflows exposed through DOM, screenshots, tabs, and Chrome input. It is not a full human/OS substitute. Current limitations include native Chrome/OS surfaces (print/save dialogs, permission bubbles, password-manager prompts), cross-origin iframe DOM access, rich multitouch/pinch/stylus gestures, visual CAPTCHA/bot challenges, hardware-backed auth (passkeys/security keys/biometrics), and arbitrary OS app interaction. For strict-CSP pages, use screenshot + coordinate input when `chrome_snapshot`/`chrome_evaluate` are blocked.
155
+
152
156
  ---
153
157
 
154
158
  ## Click & input behavior
@@ -213,9 +217,13 @@ Multiple Pi sessions (planner / worker / audit) can all drive the same Chrome at
213
217
 
214
218
  ## Built-in benchmark suite
215
219
 
216
- [`test-suite/`](./test-suite) is a benchmark for **any** browser-control agent (not just pi-chrome). It includes **38 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks**.
220
+ [`test-suite/`](./test-suite) is a benchmark for **any** browser-control agent (not just pi-chrome). It includes **41 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks**.
221
+
222
+ Scoring tracks expected outcomes per challenge rather than raw PASS count, so tools are judged against their declared browser-control capability. Unit challenges are split into gate buckets:
217
223
 
218
- Scoring tracks expected outcomes per challenge rather than raw PASS count, so tools are judged against their declared browser-control capability.
224
+ - `core` — expected release blockers for normal trusted-mode browser control.
225
+ - `conditional` — capability/environment gated (clipboard, touch, dialogs, native UI, etc.).
226
+ - `quality` — adversarial humanization/fingerprint signals; report trends, don't block general release by default.
219
227
 
220
228
  Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events` and a manifest entry with expected results per mode.
221
229
 
@@ -224,7 +232,7 @@ cd test-suite && python3 -m http.server 8765
224
232
  # open http://127.0.0.1:8765/ in the Chrome window pi-chrome controls
225
233
  ```
226
234
 
227
- Categories: `real-input`, `pointer-humanization`, `keyboard`, `activation-gates`, `scroll`, `drag-drop`, `clipboard`, `native-controls`, `frameworks`, `editing`, `dom-complexity`, `frames`, `files`, `observability`, `fingerprint`, `agent-safety`.
235
+ Categories include: `trusted-input`, `pointer-humanization`, `keyboard`, `focus-keyboard`, `activation-gates`, `scroll`, `drag-drop`, `clipboard`, `native-controls`, `frameworks`, `editing`, `dom-complexity`, `frames`, `files`, `observability`, `csp`, `lazy-loading`, `dialogs`, `popups`, `spa-routing`, `fingerprint`, and `agent-safety`.
228
236
 
229
237
  If you build a competing tool, please open a PR with your scores. We benchmark in public.
230
238
 
@@ -247,7 +255,8 @@ There is no network exposure in the default configuration; the bridge binds to l
247
255
  `pi-chrome` is actively shipped. Things on the near roadmap:
248
256
 
249
257
  - More observability tools (DOM mutation streams, performance traces)
250
- - First-class iframe + Shadow-DOM uid stability across snapshots
258
+ - First-class cross-origin iframe + Shadow-DOM uid stability across snapshots
259
+ - Native-browser surface coverage where extension APIs allow it (downloads, permissions, context menus)
251
260
  - Web Push & service worker introspection
252
261
  - Recorder mode that emits agent prompts from your own clicks
253
262
 
@@ -134,7 +134,7 @@ If your threat model excludes extensions with broad permissions, neither approac
134
134
 
135
135
  ## Public benchmarks worth knowing (for axis 2 / axis 3 comparison)
136
136
 
137
- Pi-chrome itself ships a benchmark suite ([`../test-suite/`](../test-suite)) of **38 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks** covering real input, pointer humanization, keyboard fidelity, drag/drop, Shadow DOM, file uploads, network observability, fingerprint leaks, and agent-safety honeypots. Scoring tracks expected outcomes per challenge instead of raw PASS count. That's **driver-level** grading.
137
+ Pi-chrome itself ships a benchmark suite ([`../test-suite/`](../test-suite)) of **41 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks** covering trusted input, pointer humanization, keyboard fidelity, drag/drop, Shadow DOM, iframes, file uploads, strict-CSP screenshot fallback, dynamic waits, tab lifecycle, network observability, fingerprint leaks, and agent-safety honeypots. Scoring tracks expected outcomes per challenge instead of raw PASS count, with `core`, `conditional`, and `quality` gate buckets. That's **driver-level** grading.
138
138
 
139
139
  For **agent-level** comparison (axis 2), the public benchmarks worth citing:
140
140
 
@@ -156,7 +156,7 @@ Cite live leaderboards rather than hard-coded numbers; agent scores shift monthl
156
156
 
157
157
  ## Reproducing pi-chrome's driver-level claims
158
158
 
159
- Run [`../test-suite/`](../test-suite) against any browser-control tool. Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events`, so any tool (Playwright, Puppeteer, Selenium, Stagehand, pi-chrome) can grade itself deterministically.
159
+ Run [`../test-suite/`](../test-suite) against any browser-control tool. Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events`, so any tool (Playwright, Puppeteer, Selenium, Stagehand, pi-chrome) can grade itself deterministically. Headline release scoring should use the `core` gate; `conditional` depends on declared environment capabilities, and `quality` tracks adversarial/humanization regressions.
160
160
 
161
161
  ```bash
162
162
  cd test-suite && python3 -m http.server 8765
package/docs/EXAMPLES.md CHANGED
@@ -159,6 +159,8 @@ Interactive tools use Chrome's real input layer by default: clicks, typing, fill
159
159
  - guarded buttons
160
160
  - audio/video controls
161
161
  - fullscreen and other user-activation checks
162
- - pages with strict CSP or user-activation checks
162
+ - pages where DOM injection/evaluate is limited, if the agent can use screenshots + coordinates
163
+
164
+ Strict CSP note: `chrome_snapshot`/`chrome_evaluate` may be blocked on pages that disallow `unsafe-eval`; `chrome_screenshot`, tab/navigation tools, and real input still work.
163
165
 
164
166
  Chrome may show its debugger banner while pi-chrome is attached.
package/docs/FAQ.md CHANGED
@@ -14,9 +14,9 @@ By default no — extensions need explicit "Allow in incognito" permission. Togg
14
14
 
15
15
  ## Will sites detect that I'm automating?
16
16
 
17
- Interactive controls use Chrome's real input layer via CDP: pointer paths are humanized, key cadence has variance, and normal user-activation gates are satisfied. Some detectors check for the `chrome.debugger` API attached and Chrome will show the "Chrome is being debugged" banner.
17
+ Interactive controls use Chrome's real input layer via CDP, so normal user-activation gates are satisfied and input is closer to real browser use than DOM-dispatched events. pi-chrome also shapes pointer/keyboard/scroll behavior, but this is not a guarantee of undetectability. Some detectors check whether the `chrome.debugger` API is attached, and Chrome will show the "Chrome is being debugged" banner.
18
18
 
19
- The [`test-suite/`](../test-suite) grades browser-control behavior against common detection signals.
19
+ The [`test-suite/`](../test-suite) grades browser-control behavior against common detection signals. Its `quality` bucket is an adversarial signal, not a blanket promise that every site will treat automation as human.
20
20
 
21
21
  ## Why do I see a banner saying "Pi Chrome Connector started debugging this browser"?
22
22
 
@@ -49,6 +49,10 @@ pi-chrome ships as an unpacked extension so the source and broad browser permiss
49
49
 
50
50
  The Pi-facing tools are thin wrappers around an HTTP bridge at `127.0.0.1:17318`. You could call it directly from any process, but the API is internal and may change. If you need a stable scripting interface, file an issue and we'll consider stabilizing.
51
51
 
52
+ ## What can humans do that pi-chrome cannot?
53
+
54
+ pi-chrome controls web pages through Chrome extension APIs, page inspection, screenshots, and browser input. It is not full OS-level human control. Known gaps include native Chrome/OS dialogs (print/save-as, some permission bubbles, password-manager prompts), arbitrary OS app interaction, visual CAPTCHA challenges, hardware-backed auth (passkeys/security keys/biometrics), rich multi-touch/pinch/stylus gestures, and DOM inspection inside cross-origin iframes. Some of these can still be handled with screenshot + coordinate input or user assistance, but they are not first-class deterministic workflows.
55
+
52
56
  ## Does `chrome_evaluate` work on strict-CSP pages?
53
57
 
54
58
  Not always. `chrome_evaluate` and `chrome_snapshot` run in the page's MAIN world through the Function constructor, so pages whose CSP blocks `'unsafe-eval'` can reject them. `chrome_screenshot`, `chrome_navigate`, tab tools, and real Chrome input still work because they use extension/browser APIs rather than page JavaScript.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "manifest_version": 3,
3
3
  "name": "Pi Chrome Connector",
4
- "version": "0.15.25",
4
+ "version": "0.15.27",
5
5
  "description": "Lets Pi control tabs in Chrome via a local connector at 127.0.0.1.",
6
6
  "permissions": [
7
7
  "tabs",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-chrome",
3
- "version": "0.15.25",
3
+ "version": "0.15.27",
4
4
  "scripts": {
5
5
  "version": "node scripts/sync-manifest-version.js",
6
6
  "prepublishOnly": "node scripts/sync-manifest-version.js"
@@ -52,6 +52,7 @@
52
52
  "files": [
53
53
  "extensions",
54
54
  "docs",
55
+ "test-suite",
55
56
  "README.md",
56
57
  "CHANGELOG.md",
57
58
  "CONTRIBUTING.md",
@@ -0,0 +1,193 @@
1
+ # pi-chrome browser-control benchmark
2
+
3
+ Static benchmark pages for evaluating tools that let agents control Chrome. The suite has two layers:
4
+
5
+ 1. **Unit challenges** (`manifest.json`) — MiniWoB-style capability probes for
6
+ forms, scroll containers, contenteditable, files, frames, Shadow DOM,
7
+ network/console inspection, `isTrusted`, user activation, pointer paths, key
8
+ cadence, native controls, drag/drop, touch, paste, and scroll momentum.
9
+ 2. **Long-horizon hermetic tasks** (`task-manifest.json`) — WebArena /
10
+ BrowserGym-inspired multi-step tasks with fresh run IDs and deterministic
11
+ programmatic graders.
12
+
13
+ ## Run
14
+
15
+ ```bash
16
+ cd test-suite
17
+ python3 -m http.server 8765
18
+ # open http://127.0.0.1:8765/ in the Chrome window pi-chrome controls
19
+ ```
20
+
21
+ Each challenge page exposes:
22
+
23
+ - `window.__challenge` — id
24
+ - `window.__verdict` — `"PENDING" | "PASS" | "FAIL" | "SKIP" | "WARN"`
25
+ - `window.__reason` — array of reasons
26
+ - `window.__events` — raw event log for forensics
27
+
28
+ `manifest.json` is the source of truth for unit-challenge metadata: category,
29
+ gate bucket, goal, expected result per mode, prerequisites, flake risk, manual
30
+ baseline status, and canonical tool recipe. `manifest.schema.json` documents the
31
+ manifest shape. Recipes express tool intent; runners may need to adapt
32
+ descriptive selectors (e.g. shadow/iframe notation), dynamic tab ids, and expand
33
+ path placeholders like `$PWD`.
34
+
35
+ `task-manifest.json` is the source of truth for long-horizon tasks: BrowserGym-style
36
+ `taskId`, seed, viewport, goal object, difficulty tier, max steps, declared
37
+ action subsets, reset/setup URL, validate hook, optional cheat recipe, and
38
+ programmatic grader expression. `task-manifest.schema.json` documents this shape.
39
+ `browsergym-action-space.json` records BrowserGym-compatible action subsets.
40
+
41
+ ## Modes / expected outcomes
42
+
43
+ The same page can have different expected results depending on tool capability:
44
+
45
+ - `synthetic` — DOM-dispatched events / framework-aware setters. Fast and quiet.
46
+ - `trusted` — browser-trusted input, usually via `chrome.debugger`/CDP. Can show
47
+ Chrome's debugging banner.
48
+ - `manual` — human baseline in same browser/profile.
49
+
50
+ Expected values in `manifest.json`:
51
+
52
+ - `PASS` / `FAIL` — deterministic target for that mode.
53
+ - `CONDITIONAL` — depends on browser policy, OS, device capability, permissions,
54
+ or an unreleased tool primitive. Inspect `prerequisites`, `notes`, and
55
+ `flakeRisk`.
56
+
57
+ Manual baselines are tracked separately with `manualBaseline`. `unverified`
58
+ means the manual expectation is a target, not a recorded contract.
59
+
60
+ ## Gate buckets
61
+
62
+ Each unit challenge has a `gate` field:
63
+
64
+ - `core` — required release blocker for normal trusted-mode pi-chrome shipping.
65
+ - `conditional` — blocks only when declared prerequisites/capabilities are present
66
+ (clipboard, touch, dialogs, native UI, etc.).
67
+ - `quality` — adversarial humanization/fingerprint signal. Track regressions, but
68
+ do not block general ship without an explicit product decision.
69
+
70
+ ## Recommended unit-challenge agent flow
71
+
72
+ 1. Navigate to dashboard:
73
+ `http://127.0.0.1:8765/`.
74
+ 2. Pick mode (`synthetic`, `trusted`, or `manual`) and clear local verdicts.
75
+ 3. For each manifest row:
76
+ - `chrome_navigate` to `http://127.0.0.1:8765/<file>`.
77
+ - `chrome_snapshot` before acting; prefer snapshot `uid` over raw selector.
78
+ - Execute the listed `recipe`, adapting descriptive frame/shadow selectors to
79
+ whatever selectors/uids the tool exposes.
80
+ - Read:
81
+ ```js
82
+ JSON.stringify({
83
+ v: window.__verdict,
84
+ r: window.__reason,
85
+ e: window.__events?.slice(-20)
86
+ })
87
+ ```
88
+ 4. Return to dashboard and compare actual verdicts with expected values.
89
+ 5. Copy JSON report from dashboard for PRs or regression notes.
90
+
91
+ ## Recommended long-horizon task flow
92
+
93
+ 1. Load `task-manifest.json`.
94
+ 2. Replace `$RUN_ID` in `startUrl` with a fresh value.
95
+ 3. Navigate to the start URL and read the visible task instruction.
96
+ 4. Solve using normal browser tools only; avoid direct state mutation unless the
97
+ benchmark mode explicitly allows evaluate-based actions.
98
+ 5. Click **Grade now** or evaluate the task grader expression:
99
+ ```js
100
+ JSON.stringify({ v: window.__taskVerdict, r: window.__taskReason })
101
+ ```
102
+ 6. Record action count, observations used, tools used, verdict, and reason.
103
+
104
+ ## Design principles copied from browser-agent benchmarks
105
+
106
+ - Prefer hermetic sites and deterministic graders over live sites and LLM judges.
107
+ - Report action API and observation format; these strongly affect scores.
108
+ - Use difficulty tiers: L1 atomic, L2 compositional, L3 cross-page/context-rich.
109
+ - Include tedious cross-page memory and exact-value transfer tasks; short unit
110
+ probes hide these failures.
111
+ - Keep synthetic-event-gated tests because extension bridges face failures that
112
+ CDP/Playwright-style benchmarks usually do not measure.
113
+
114
+ ## Challenge categories
115
+
116
+ - `trusted-input` — browser-trusted click/key events.
117
+ - `pointer-humanization` — paths, coordinates, movement continuity/rate.
118
+ - `keyboard` / `focus-keyboard` — typing fidelity, modifiers, Tab flows.
119
+ - `activation-gates` — clipboard/fullscreen/user activation.
120
+ - `scroll` / `scroll-visibility` — wheel events, momentum, IntersectionObserver.
121
+ - `drag-drop` — HTML5 drag/drop + `DataTransfer`.
122
+ - `clipboard` — OS/browser paste path.
123
+ - `native-controls` — controls that should use browser UI/keyboard semantics.
124
+ - `frameworks` / `editing` — React-style value tracking, contenteditable.
125
+ - `dom-complexity` / `frames` — Shadow DOM and iframe targeting.
126
+ - `files` — file attachment to `<input type=file>`.
127
+ - `observability` — console/network capture tools.
128
+ - `csp` — strict Content Security Policy fallback where eval/snapshot may fail.
129
+ - `lazy-loading` — dynamic DOM readiness and wait behavior.
130
+ - `fingerprint` — environment and stack fingerprint probes.
131
+ - `agent-safety` — hidden honeypots and safe target selection.
132
+
133
+ ## Current challenge inventory
134
+
135
+ The dashboard renders this from `manifest.json`. In brief:
136
+
137
+ 1. trusted click
138
+ 2. trusted keyboard
139
+ 3. webdriver/runtime flags
140
+ 4. mouse entropy before click
141
+ 5. click timing
142
+ 6. click coordinate variation
143
+ 7. pointer event properties
144
+ 8. keyboard cadence
145
+ 9. beforeinput/input order
146
+ 10. user activation gates
147
+ 11. honeypot safety
148
+ 12. fingerprint consistency
149
+ 13. focus order
150
+ 14. wheel scroll
151
+ 15. drag/drop `DataTransfer`
152
+ 16. contenteditable selection
153
+ 17. paste clipboard
154
+ 18. native select
155
+ 19. hover dwell
156
+ 20. React value tracker
157
+ 21. keyboard modifiers
158
+ 22. touch events
159
+ 23. stack trace fingerprint
160
+ 24. viewport click coordinates
161
+ 25. pointer continuity
162
+ 26. mousemove rate
163
+ 27. scroll momentum
164
+ 28. intersection visibility
165
+ 29. Shadow DOM controls
166
+ 30. iframe targeting
167
+ 31. file upload
168
+ 32. keyboard Tab navigation
169
+ 33. network/console capture
170
+ 34. dialog handling
171
+ 35. target blank popup
172
+ 36. modal focus trap
173
+ 37. autocomplete combobox
174
+ 38. SPA route change
175
+ 39. strict CSP screenshot/coordinate fallback
176
+ 40. dynamic wait/readiness
177
+ 41. explicit tab lifecycle
178
+
179
+ ## Design notes
180
+
181
+ - A failure is useful only when compared to expected mode. Example: synthetic
182
+ `isTrusted` failing is expected and validates that the test detects quiet DOM
183
+ events.
184
+ - Some tests are capability-gated. Example: touch tests should be `SKIP`/manual
185
+ conditional on non-touch hardware.
186
+ - Fingerprint tests should warn before blocking. Real Chrome profiles can use
187
+ software WebGL in VMs, remote desktops, or policy-constrained environments.
188
+ - `notes/bypass-ideas.md` is historical guidance for older synthetic-only
189
+ versions. Prefer `manifest.json` for current expected outcomes.
190
+ - `notes/browsergym-compat.md` defines the reset/step/validate/observation/BID
191
+ contract for external BrowserGym-style agents.
192
+ - `notes/runner-spec.md`, `notes/scoring.md`, and `notes/profiles.md` define
193
+ runner output, scoring, retry policy, and environment metadata.
@@ -0,0 +1,130 @@
1
+ // Tiny shared harness for challenge pages.
2
+ // Each page calls Challenge.init({id, instructions}) then Challenge.pass()/fail()
3
+ // based on its own listeners.
4
+ (function () {
5
+ const events = [];
6
+ const state = {
7
+ id: null,
8
+ verdict: "PENDING",
9
+ reason: [],
10
+ events,
11
+ details: [],
12
+ thresholds: {},
13
+ };
14
+
15
// Paint the current verdict and reasons into the banner elements, if present.
// Does nothing when the `__verdict` badge is not in the document.
function render() {
  const badge = document.getElementById("__verdict");
  if (!badge) return;

  // Badge background per verdict; anything else (e.g. "PENDING") uses grey.
  const palette = new Map([
    ["PASS", "#1f7a1f"],
    ["FAIL", "#a11"],
    ["SKIP", "#76520b"],
    ["WARN", "#6b5d00"],
  ]);
  const verdict = state.verdict;

  badge.textContent = verdict;
  badge.dataset.verdict = verdict;
  badge.style.background = palette.has(verdict) ? palette.get(verdict) : "#444";
  badge.style.color = "#fff";

  const reasonBox = document.getElementById("__reason");
  if (reasonBox) {
    reasonBox.textContent = state.reason.join("\n");
  }
}
29
+
30
// Append a timestamped entry to the shared event log, keeping at most 500
// entries by dropping the oldest one. Fields of `detail` are spread last, so
// they take precedence over the `t` and `name` fields set here.
function log(name, detail) {
  const MAX_EVENTS = 500;
  const entry = { t: performance.now(), name, ...detail };
  // Array.prototype.push returns the new length.
  if (events.push(entry) > MAX_EVENTS) events.shift();
}
34
+
35
// Shared verdict transition for pass/fail/skip/warn: records the verdict,
// appends the reasons prefixed with `marker`, mirrors the verdict onto
// `window.__verdict` and the document title, then persists and re-renders.
function settleVerdict(verdict, marker, reasons) {
  state.verdict = verdict;
  state.reason.push(...reasons.map((r) => marker + r));
  window.__verdict = state.verdict;
  document.title = `[${verdict}] ${state.id}`;
  persist();
  render();
}

// Public harness API used by every challenge page.
const Challenge = {
  // Set up the page: store the id and thresholds, insert the sticky status
  // banner at the top of <body>, and publish the `window.__*` handles.
  init({ id, instructions, thresholds = {} }) {
    state.id = id;
    state.thresholds = parseThresholds(thresholds);
    document.title = `[${state.verdict}] ${id}`;

    const banner = document.createElement("div");
    banner.style.cssText =
      "position:sticky;top:0;background:#111;color:#eee;padding:8px 12px;font:13px monospace;border-bottom:1px solid #333;z-index:9999";
    // `id` and `instructions` are interpolated as raw HTML.
    banner.innerHTML = `
<b>${id}</b>
<span id="__verdict" style="margin-left:8px;padding:2px 8px;border-radius:4px;background:#444">PENDING</span>
<span style="margin-left:12px;opacity:.7">${instructions}</span>
<pre id="__reason" style="white-space:pre-wrap;margin:6px 0 0;color:#bbb;font:12px monospace"></pre>
`;
    const host = document.body;
    host.insertBefore(banner, host.firstChild);

    window.__challenge = id;
    window.__verdict = state.verdict;
    window.__reason = state.reason;
    window.__events = state.events;
    render();
  },

  // Mark the challenge as passed. A recorded FAIL is sticky and is kept.
  pass(...reasons) {
    if (state.verdict === "FAIL") return;
    settleVerdict("PASS", "✓ ", reasons);
  },

  // Mark the challenge as failed; this always takes effect.
  fail(...reasons) {
    settleVerdict("FAIL", "✗ ", reasons);
  },

  // Mark the challenge as skipped unless it has already passed or failed.
  skip(...reasons) {
    if (["FAIL", "PASS"].includes(state.verdict)) return;
    settleVerdict("SKIP", "↷ ", reasons);
  },

  // Mark the challenge as a warning unless it has already passed or failed.
  warn(...reasons) {
    if (["FAIL", "PASS"].includes(state.verdict)) return;
    settleVerdict("WARN", "! ", reasons);
  },

  // Record one named sub-check without changing the verdict; returns whether
  // the sub-check passed (coerced to a boolean).
  partial({ name, pass, reason, data }) {
    const ok = !!pass;
    state.details.push({ name, pass: ok, reason: reason || "", data });
    // NOTE(review): log() spreads its detail after `name`, so the logged
    // entry's `name` ends up being this sub-check's name, not "partial".
    log("partial", { name, pass: ok, reason, data });
    persist();
    render();
    return ok;
  },

  // Fold the recorded sub-checks into a final verdict: FAIL listing every
  // failed sub-check, otherwise PASS listing every sub-check as ok.
  finishPartials() {
    const failures = state.details.filter((d) => !d.pass);
    if (failures.length > 0) {
      Challenge.fail(...failures.map((d) => `${d.name}: ${d.reason || "failed"}`));
      return;
    }
    Challenge.pass(...state.details.map((d) => `${d.name}: ok`));
  },

  // Look up a threshold by name, returning `fallback` when it is not set.
  getThreshold(name, fallback) {
    if (Object.prototype.hasOwnProperty.call(state.thresholds, name)) {
      return state.thresholds[name];
    }
    return fallback;
  },

  log,
  state,
};
105
+
106
// Save a snapshot of the current challenge state (with only the last 50
// events) to localStorage under "pi-chrome-suite:<id>". Best effort: any
// error while building or storing the snapshot is deliberately ignored.
function persist() {
  try {
    const { id, verdict, reason, details, thresholds } = state;
    const snapshot = {
      id,
      verdict,
      reason,
      details,
      thresholds,
      events: state.events.slice(-50),
      ts: Date.now(),
    };
    localStorage.setItem("pi-chrome-suite:" + state.id, JSON.stringify(snapshot));
  } catch {}
}
114
+
115
/**
 * Merge a page's default thresholds with `threshold.<name>=<value>` overrides
 * taken from the URL query string.
 *
 * Values that convert to a finite number are stored as numbers; any other
 * non-blank value is stored as the raw string. Blank values (empty or
 * whitespace only) are ignored so the default stays in place.
 *
 * @param {Object} defaults - Threshold defaults; copied, never mutated.
 * @returns {Object} A new object holding the defaults plus any overrides.
 */
function parseThresholds(defaults) {
  const PREFIX = "threshold.";
  const out = { ...defaults };
  try {
    const qs = new URLSearchParams(location.search);
    for (const [k, v] of qs) {
      if (!k.startsWith(PREFIX)) continue;
      // Number("") and Number("  ") are both 0, so a blank override such as
      // `?threshold.minMoves=` would silently force the threshold to zero.
      if (v.trim() === "") continue;
      const key = k.slice(PREFIX.length);
      const num = Number(v);
      out[key] = Number.isFinite(num) ? num : v;
    }
  } catch {
    // Best effort: if the query string cannot be read (for example when
    // `location` is not defined), fall back to what has been merged so far.
  }
  return out;
}
128
+
129
+ window.Challenge = Challenge;
130
+ })();
@@ -0,0 +1,31 @@
1
+ body { font: 14px system-ui, sans-serif; margin: 0; background: #1a1a1a; color: #eee; }
2
+ main { padding: 24px; max-width: 720px; }
3
+ code { background:#222;padding:1px 4px;border-radius:3px }
4
+ a { color: #6cf; }
5
+ button, input, select { font: 14px system-ui, sans-serif; }
6
+ button { padding: 7px 11px; border-radius: 6px; border: 1px solid #555; background: #262626; color: #eee; cursor:pointer; }
7
+ button:hover { background:#333; }
8
+ input, select { padding: 6px 8px; border-radius: 6px; border: 1px solid #555; background:#111; color:#eee; }
9
+ table { border-collapse: collapse; width: 100%; font: 13px ui-monospace, SFMono-Regular, Menlo, monospace; }
10
+ th, td { border-bottom: 1px solid #333; padding: 8px; vertical-align: top; text-align:left; }
11
+ th { position: sticky; top: 0; background:#151515; z-index:1; }
12
+ tr.ok { background: rgba(31,122,31,.08); }
13
+ tr.mismatch { background: rgba(170,17,17,.12); }
14
+ tr.conditional { background: rgba(118,82,11,.12); }
15
+ .panel { background:#202020; border:1px solid #333; border-radius:10px; padding:12px; margin:16px 0; }
16
+ .controls { display:flex; flex-wrap:wrap; gap:10px; align-items:center; }
17
+ .hint, .notes, .reason { color:#aaa; font-size:12px; margin-top:4px; }
18
+ .summary { display:flex; flex-wrap:wrap; gap:8px; margin: 14px 0; }
19
+ .pill { display:inline-block; border-radius:999px; padding:2px 8px; font-size:12px; font-weight:700; background:#444; color:#fff; }
20
+ .pass { background:#1f7a1f; }
21
+ .fail { background:#a11; }
22
+ .skip, .conditional { background:#76520b; }
23
+ .warn { background:#6b5d00; }
24
+ .pending { background:#444; }
25
+ .expected { background:#2b4b6b; }
26
+ .risk { color:#ffd479; }
27
+ .code, pre { background:#111; color:#eee; padding:12px; border-radius:6px; overflow:auto; }
28
+ .copy-fallback { display:none; width:100%; min-height:180px; margin-top:10px; font:12px ui-monospace, SFMono-Regular, Menlo, monospace; background:#111; color:#eee; border:1px solid #555; border-radius:6px; padding:10px; }
29
+ .copy-fallback[data-copied="true"] { border-color:#1f7a1f; }
30
+ .copy-fallback[data-copied="false"] { border-color:#a11; }
31
+ details summary { cursor:pointer; color:#9cf; }
@@ -0,0 +1,44 @@
1
+ {
2
+ "subsets": {
3
+ "chat": ["send_msg_to_user"],
4
+ "infeas": ["report_infeasible"],
5
+ "bid": ["click", "dblclick", "hover", "fill", "clear", "press", "focus", "select_option", "scroll", "drag_and_drop", "upload_file"],
6
+ "coord": ["mouse_move", "mouse_up", "mouse_down", "mouse_click", "mouse_dblclick", "mouse_drag_and_drop", "mouse_upload_file", "scroll_at", "keyboard_down", "keyboard_up", "keyboard_press", "keyboard_type", "keyboard_insert_text"],
7
+ "nav": ["go_back", "go_forward", "goto"],
8
+ "tab": ["tab_close", "tab_focus", "new_tab"]
9
+ },
10
+ "signatures": {
11
+ "fill": "fill(bid, value, enable_autocomplete_menu=false)",
12
+ "click": "click(bid, button='left', modifiers=[])",
13
+ "dblclick": "dblclick(bid, button='left', modifiers=[])",
14
+ "hover": "hover(bid)",
15
+ "press": "press(bid, key_comb)",
16
+ "focus": "focus(bid)",
17
+ "clear": "clear(bid)",
18
+ "select_option": "select_option(bid, options)",
19
+ "scroll": "scroll(delta_x, delta_y)",
20
+ "drag_and_drop": "drag_and_drop(from_bid, to_bid)",
21
+ "upload_file": "upload_file(bid, file)",
22
+ "mouse_move": "mouse_move(x, y)",
23
+ "mouse_click": "mouse_click(x, y, button='left')",
24
+ "mouse_dblclick": "mouse_dblclick(x, y, button='left')",
25
+ "mouse_up": "mouse_up(x, y, button='left')",
26
+ "mouse_down": "mouse_down(x, y, button='left')",
27
+ "mouse_drag_and_drop": "mouse_drag_and_drop(from_x, from_y, to_x, to_y)",
28
+ "scroll_at": "scroll_at(x, y, dx, dy)",
29
+ "keyboard_press": "keyboard_press(key)",
30
+ "keyboard_down": "keyboard_down(key)",
31
+ "keyboard_up": "keyboard_up(key)",
32
+ "keyboard_type": "keyboard_type(text)",
33
+ "keyboard_insert_text": "keyboard_insert_text(text)",
34
+ "goto": "goto(url)",
35
+ "go_back": "go_back()",
36
+ "go_forward": "go_forward()",
37
+ "tab_close": "tab_close()",
38
+ "tab_focus": "tab_focus(index)",
39
+ "new_tab": "new_tab()",
40
+ "send_msg_to_user": "send_msg_to_user(text)",
41
+ "report_infeasible": "report_infeasible(reason)",
42
+ "noop": "noop(wait_ms=1000)"
43
+ }
44
+ }
@@ -0,0 +1,28 @@
1
+ <!doctype html>
2
+ <meta charset="utf-8">
3
+ <title>01 isTrusted click</title>
4
+ <link rel="stylesheet" href="../_style.css">
5
+ <script src="../_lib.js"></script>
6
+ <body>
7
+ <main>
8
+ <p>Goal: click the green button. Page only accepts <code>event.isTrusted === true</code>.</p>
9
+ <button id="go" style="padding:20px 32px;font-size:18px;background:#1f7a1f;color:#fff;border:0;border-radius:6px">Click me</button>
10
+ </main>
11
+ <script>
12
+ Challenge.init({ // Challenge is presumably provided by ../_lib.js (loaded above); its API is not visible in this file.
13
+ id: "is-trusted-click",
14
+ instructions: "click the green button; isTrusted must be true",
15
+ });
16
+ const btn = document.getElementById("go"); // the green "Click me" button declared in <main>
17
+ btn.addEventListener("click", (e) => { // verdict handler: logs every click, then reports pass or fail from e.isTrusted
18
+ Challenge.log("click", { isTrusted: e.isTrusted, x: e.clientX, y: e.clientY });
19
+ if (e.isTrusted) Challenge.pass("click.isTrusted === true"); // isTrusted is true only for events generated by the user agent
20
+ else Challenge.fail("click.isTrusted === false (synthetic dispatchEvent)"); // events created and dispatched by script report false
21
+ }, { capture: true }); // listener is registered with the capture option
22
+ // Also watch pointerdown so an untrusted (synthetic) press is flagged before the click event arrives.
23
+ btn.addEventListener("pointerdown", (e) => {
24
+ Challenge.log("pointerdown", { isTrusted: e.isTrusted, pressure: e.pressure, pointerType: e.pointerType });
25
+ if (!e.isTrusted) Challenge.fail("pointerdown.isTrusted === false"); // this handler can only fail; passing is decided by the click handler
26
+ });
27
+ </script>
28
+ </body>
@@ -0,0 +1,50 @@
1
+ <!doctype html>
2
+ <meta charset="utf-8">
3
+ <title>02 isTrusted keyboard</title>
4
+ <link rel="stylesheet" href="../_style.css">
5
+ <script src="../_lib.js"></script>
6
+ <body>
7
+ <main>
8
+ <p>Goal: type <code>hello</code> into the box. Each character must arrive via a trusted keydown/keypress/input.</p>
9
+ <input id="t" placeholder="type hello" style="font-size:18px;padding:8px 12px;width:240px">
10
+ </main>
11
+ <script>
12
+ Challenge.init({ // Challenge is presumably provided by ../_lib.js (loaded above); its API is not visible in this file.
13
+ id: "is-trusted-keyboard",
14
+ instructions: "type 'hello'; every keydown.isTrusted must be true",
15
+ });
16
+ const t = document.getElementById("t"); // the text input declared in <main>
17
+ let trustedKeys = 0, untrustedKeys = 0, inputs = 0, untrustedInputs = 0; // running tallies shared by all handlers below
18
+ t.addEventListener("keydown", (e) => { // log each keydown and count it as trusted or untrusted
19
+ Challenge.log("keydown", { key: e.key, isTrusted: e.isTrusted });
20
+ if (e.isTrusted) trustedKeys++; else untrustedKeys++;
21
+ });
22
+ t.addEventListener("input", (e) => { // log each input event; `inputs` counts all of them, `untrustedInputs` only the untrusted ones
23
+ Challenge.log("input", { isTrusted: e.isTrusted, inputType: e.inputType, data: e.data });
24
+ inputs++; if (!e.isTrusted) untrustedInputs++;
25
+ });
26
+ t.addEventListener("keyup", () => { // verdict is evaluated on every keyup, so it can be reported more than once
27
+ if (t.value === "hello") { // nothing is reported until the field reads exactly "hello"
28
+ if (untrustedKeys || untrustedInputs) { // any untrusted keydown or input event fails the challenge
29
+ Challenge.fail(`untrusted keydowns=${untrustedKeys}, untrusted inputs=${untrustedInputs}`);
30
+ } else if (trustedKeys < 5) { // "hello" has five characters, so fewer than five trusted keydowns cannot account for it
31
+ Challenge.fail(`only ${trustedKeys} trusted keydowns (need >=5)`);
32
+ } else {
33
+ Challenge.pass(`${trustedKeys} trusted keydowns, ${inputs} trusted inputs`); // untrustedInputs is 0 on this branch, so every counted input was trusted
34
+ }
35
+ }
36
+ });
37
+ // Programmatic value-set detection: shadow the `value` accessor on this one element so script assignments to t.value are observed.
38
+ const desc = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, "value"); // native accessor pair, used for delegation below
39
+ Object.defineProperty(t, "value", {
40
+ get() { return desc.get.call(this); }, // reads (including the keyup check above) delegate to the native getter
41
+ set(v) {
42
+ Challenge.log("value-set", { v });
43
+ if (untrustedKeys === 0 && trustedKeys === 0) { // only flagged when no keydown of either kind has been seen yet; later assignments are logged but not failed
44
+ Challenge.fail("value set programmatically without any keydown");
45
+ }
46
+ return desc.set.call(this, v); // forward to the native setter so the field still receives the value
47
+ },
48
+ });
49
+ </script>
50
+ </body>