pi-chrome 0.15.25 → 0.15.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/README.md +13 -4
- package/docs/COMPARISON.md +2 -2
- package/docs/EXAMPLES.md +3 -1
- package/docs/FAQ.md +6 -2
- package/extensions/chrome-profile-bridge/browser-extension/manifest.json +1 -1
- package/package.json +2 -1
- package/test-suite/README.md +193 -0
- package/test-suite/_lib.js +130 -0
- package/test-suite/_style.css +31 -0
- package/test-suite/baseline-dashboard.png +0 -0
- package/test-suite/browsergym-action-space.json +44 -0
- package/test-suite/challenges/01-is-trusted-click.html +28 -0
- package/test-suite/challenges/02-is-trusted-keyboard.html +50 -0
- package/test-suite/challenges/03-webdriver-flag.html +51 -0
- package/test-suite/challenges/04-mouse-entropy.html +34 -0
- package/test-suite/challenges/05-event-timing.html +34 -0
- package/test-suite/challenges/06-click-coordinates.html +29 -0
- package/test-suite/challenges/07-pointer-properties.html +29 -0
- package/test-suite/challenges/08-keyboard-cadence.html +37 -0
- package/test-suite/challenges/09-composition-input.html +45 -0
- package/test-suite/challenges/10-user-activation.html +40 -0
- package/test-suite/challenges/11-honeypot.html +36 -0
- package/test-suite/challenges/12-fingerprint.html +59 -0
- package/test-suite/challenges/13-focus-order.html +31 -0
- package/test-suite/challenges/14-wheel-scroll.html +28 -0
- package/test-suite/challenges/15-drag-drop-datatransfer.html +73 -0
- package/test-suite/challenges/16-contenteditable-selection.html +54 -0
- package/test-suite/challenges/17-paste-clipboard.html +48 -0
- package/test-suite/challenges/18-native-select.html +56 -0
- package/test-suite/challenges/19-hover-dwell.html +50 -0
- package/test-suite/challenges/20-react-value-tracker.html +78 -0
- package/test-suite/challenges/21-keyboard-modifiers.html +65 -0
- package/test-suite/challenges/22-touch-events.html +66 -0
- package/test-suite/challenges/23-stack-trace-fingerprint.html +76 -0
- package/test-suite/challenges/24-viewport-edge-clicks.html +51 -0
- package/test-suite/challenges/25-pointer-continuity.html +62 -0
- package/test-suite/challenges/26-mousemove-rate.html +57 -0
- package/test-suite/challenges/27-scroll-momentum.html +66 -0
- package/test-suite/challenges/28-intersection-visibility.html +72 -0
- package/test-suite/challenges/29-shadow-dom-controls.html +44 -0
- package/test-suite/challenges/30-iframe-targeting.html +44 -0
- package/test-suite/challenges/31-file-upload.html +30 -0
- package/test-suite/challenges/32-keyboard-tab-navigation.html +61 -0
- package/test-suite/challenges/33-network-console-capture.html +33 -0
- package/test-suite/challenges/34-dialog-handling.html +28 -0
- package/test-suite/challenges/35-target-blank-popup.html +35 -0
- package/test-suite/challenges/36-modal-focus-trap.html +49 -0
- package/test-suite/challenges/37-autocomplete-combobox.html +42 -0
- package/test-suite/challenges/38-spa-route-change.html +40 -0
- package/test-suite/challenges/39-strict-csp-fallback.html +14 -0
- package/test-suite/challenges/39-strict-csp-fallback.js +27 -0
- package/test-suite/challenges/40-dynamic-wait-readiness.html +41 -0
- package/test-suite/challenges/41-tab-lifecycle.html +44 -0
- package/test-suite/fixtures/pi-chrome-upload.txt +1 -0
- package/test-suite/fixtures/sites/mini-shop/cheats.js +12 -0
- package/test-suite/fixtures/sites/mini-shop/grader.js +29 -0
- package/test-suite/fixtures/sites/mini-shop/index.html +55 -0
- package/test-suite/fixtures/sites/mini-shop/tasks.json +9 -0
- package/test-suite/index.html +193 -0
- package/test-suite/manifest.json +1630 -0
- package/test-suite/manifest.schema.json +73 -0
- package/test-suite/notes/browsergym-compat.md +70 -0
- package/test-suite/notes/bypass-ideas.md +79 -0
- package/test-suite/notes/profiles.md +22 -0
- package/test-suite/notes/runner-spec.md +29 -0
- package/test-suite/notes/scoring.md +44 -0
- package/test-suite/scenarios/choredesk/cheats.js +49 -0
- package/test-suite/scenarios/choredesk/index.html +239 -0
- package/test-suite/serve.sh +6 -0
- package/test-suite/task-manifest.json +416 -0
- package/test-suite/task-manifest.schema.json +59 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
All notable user-facing changes to `pi-chrome`.
|
|
4
4
|
|
|
5
|
+
## 0.15.26 — 2026-05-16
|
|
6
|
+
|
|
7
|
+
- **Documentation accuracy.** README, FAQ, examples, comparison, and test-suite docs now describe the 41-challenge suite, gate buckets, strict-CSP fallback, and current human-vs-extension limitations.
|
|
8
|
+
- **Published benchmark assets.** The npm package now includes `test-suite/` so the documented benchmark pages are available from installed packages, not only from the repository checkout.
|
|
9
|
+
|
|
5
10
|
## 0.15.25 — 2026-05-16
|
|
6
11
|
|
|
7
12
|
- **Reload after older installs.** `/reload` now recovers from stale singleton state left by pi-chrome 0.15.19 and earlier instead of skipping the freshly loaded extension.
|
package/README.md
CHANGED
|
@@ -149,6 +149,10 @@ Agents can verify page state immediately instead of blindly retrying.
|
|
|
149
149
|
|
|
150
150
|
Each tool is documented inline in Pi — agents see the parameters and gotchas (Chrome input, CSP limits, file upload behavior) without trial-and-error.
|
|
151
151
|
|
|
152
|
+
### Known limits vs. human Chrome use
|
|
153
|
+
|
|
154
|
+
pi-chrome is strongest on web-page workflows exposed through DOM, screenshots, tabs, and Chrome input. It is not a full human/OS substitute. Current limitations include native Chrome/OS surfaces (print/save dialogs, permission bubbles, password-manager prompts), cross-origin iframe DOM access, rich multitouch/pinch/stylus gestures, visual CAPTCHA/bot challenges, hardware-backed auth (passkeys/security keys/biometrics), and arbitrary OS app interaction. For strict-CSP pages, use screenshot + coordinate input when `chrome_snapshot`/`chrome_evaluate` are blocked.
|
|
155
|
+
|
|
152
156
|
---
|
|
153
157
|
|
|
154
158
|
## Click & input behavior
|
|
@@ -213,9 +217,13 @@ Multiple Pi sessions (planner / worker / audit) can all drive the same Chrome at
|
|
|
213
217
|
|
|
214
218
|
## Built-in benchmark suite
|
|
215
219
|
|
|
216
|
-
[`test-suite/`](./test-suite) is a benchmark for **any** browser-control agent (not just pi-chrome). It includes **
|
|
220
|
+
[`test-suite/`](./test-suite) is a benchmark for **any** browser-control agent (not just pi-chrome). It includes **41 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks**.
|
|
221
|
+
|
|
222
|
+
Scoring tracks expected outcomes per challenge rather than raw PASS count, so tools are judged against their declared browser-control capability. Unit challenges are split into gate buckets:
|
|
217
223
|
|
|
218
|
-
|
|
224
|
+
- `core` — expected release blockers for normal trusted-mode browser control.
|
|
225
|
+
- `conditional` — capability/environment gated (clipboard, touch, dialogs, native UI, etc.).
|
|
226
|
+
- `quality` — adversarial humanization/fingerprint signals; report trends, don't block general release by default.
|
|
219
227
|
|
|
220
228
|
Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events` and a manifest entry with expected results per mode.
|
|
221
229
|
|
|
@@ -224,7 +232,7 @@ cd test-suite && python3 -m http.server 8765
|
|
|
224
232
|
# open http://127.0.0.1:8765/ in the Chrome window pi-chrome controls
|
|
225
233
|
```
|
|
226
234
|
|
|
227
|
-
Categories: `
|
|
235
|
+
Categories include: `trusted-input`, `pointer-humanization`, `keyboard`, `focus-keyboard`, `activation-gates`, `scroll`, `drag-drop`, `clipboard`, `native-controls`, `frameworks`, `editing`, `dom-complexity`, `frames`, `files`, `observability`, `csp`, `lazy-loading`, `dialogs`, `popups`, `spa-routing`, `fingerprint`, and `agent-safety`.
|
|
228
236
|
|
|
229
237
|
If you build a competing tool, please open a PR with your scores. We benchmark in public.
|
|
230
238
|
|
|
@@ -247,7 +255,8 @@ There is no network exposure in the default configuration; the bridge binds to l
|
|
|
247
255
|
`pi-chrome` is actively shipped. Things on the near roadmap:
|
|
248
256
|
|
|
249
257
|
- More observability tools (DOM mutation streams, performance traces)
|
|
250
|
-
- First-class iframe + Shadow-DOM uid stability across snapshots
|
|
258
|
+
- First-class cross-origin iframe + Shadow-DOM uid stability across snapshots
|
|
259
|
+
- Native-browser surface coverage where extension APIs allow it (downloads, permissions, context menus)
|
|
251
260
|
- Web Push & service worker introspection
|
|
252
261
|
- Recorder mode that emits agent prompts from your own clicks
|
|
253
262
|
|
package/docs/COMPARISON.md
CHANGED
|
@@ -134,7 +134,7 @@ If your threat model excludes extensions with broad permissions, neither approac
|
|
|
134
134
|
|
|
135
135
|
## Public benchmarks worth knowing (for axis 2 / axis 3 comparison)
|
|
136
136
|
|
|
137
|
-
Pi-chrome itself ships a benchmark suite ([`../test-suite/`](../test-suite)) of **
|
|
137
|
+
Pi-chrome itself ships a benchmark suite ([`../test-suite/`](../test-suite)) of **41 primitive challenges** plus **4 hermetic BrowserGym-style long-horizon tasks** covering trusted input, pointer humanization, keyboard fidelity, drag/drop, Shadow DOM, iframes, file uploads, strict-CSP screenshot fallback, dynamic waits, tab lifecycle, network observability, fingerprint leaks, and agent-safety honeypots. Scoring tracks expected outcomes per challenge instead of raw PASS count, with `core`, `conditional`, and `quality` gate buckets. That's **driver-level** grading.
|
|
138
138
|
|
|
139
139
|
For **agent-level** comparison (axis 2), the public benchmarks worth citing:
|
|
140
140
|
|
|
@@ -156,7 +156,7 @@ Cite live leaderboards rather than hard-coded numbers; agent scores shift monthl
|
|
|
156
156
|
|
|
157
157
|
## Reproducing pi-chrome's driver-level claims
|
|
158
158
|
|
|
159
|
-
Run [`../test-suite/`](../test-suite) against any browser-control tool. Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events`, so any tool (Playwright, Puppeteer, Selenium, Stagehand, pi-chrome) can grade itself deterministically.
|
|
159
|
+
Run [`../test-suite/`](../test-suite) against any browser-control tool. Each challenge exposes `window.__verdict` / `window.__reason` / `window.__events`, so any tool (Playwright, Puppeteer, Selenium, Stagehand, pi-chrome) can grade itself deterministically. Headline release scoring should use the `core` gate; `conditional` depends on declared environment capabilities, and `quality` tracks adversarial/humanization regressions.
|
|
160
160
|
|
|
161
161
|
```bash
|
|
162
162
|
cd test-suite && python3 -m http.server 8765
|
package/docs/EXAMPLES.md
CHANGED
|
@@ -159,6 +159,8 @@ Interactive tools use Chrome's real input layer by default: clicks, typing, fill
|
|
|
159
159
|
- guarded buttons
|
|
160
160
|
- audio/video controls
|
|
161
161
|
- fullscreen and other user-activation checks
|
|
162
|
-
- pages
|
|
162
|
+
- pages where DOM injection/evaluate is limited, if the agent can use screenshots + coordinates
|
|
163
|
+
|
|
164
|
+
Strict CSP note: `chrome_snapshot`/`chrome_evaluate` may be blocked on pages that disallow `unsafe-eval`; `chrome_screenshot`, tab/navigation tools, and real input still work.
|
|
163
165
|
|
|
164
166
|
Chrome may show its debugger banner while pi-chrome is attached.
|
package/docs/FAQ.md
CHANGED
|
@@ -14,9 +14,9 @@ By default no — extensions need explicit "Allow in incognito" permission. Togg
|
|
|
14
14
|
|
|
15
15
|
## Will sites detect that I'm automating?
|
|
16
16
|
|
|
17
|
-
Interactive controls use Chrome's real input layer via CDP
|
|
17
|
+
Interactive controls use Chrome's real input layer via CDP, so normal user-activation gates are satisfied and input is closer to real browser use than DOM-dispatched events. pi-chrome also shapes pointer/keyboard/scroll behavior, but this is not a guarantee of undetectability. Some detectors check whether the `chrome.debugger` API is attached, and Chrome will show the "Chrome is being debugged" banner.
|
|
18
18
|
|
|
19
|
-
The [`test-suite/`](../test-suite) grades browser-control behavior against common detection signals.
|
|
19
|
+
The [`test-suite/`](../test-suite) grades browser-control behavior against common detection signals. Its `quality` bucket is an adversarial signal, not a blanket promise that every site will treat automation as human.
|
|
20
20
|
|
|
21
21
|
## Why do I see a banner saying "Pi Chrome Connector started debugging this browser"?
|
|
22
22
|
|
|
@@ -49,6 +49,10 @@ pi-chrome ships as an unpacked extension so the source and broad browser permiss
|
|
|
49
49
|
|
|
50
50
|
The Pi-facing tools are thin wrappers around an HTTP bridge at `127.0.0.1:17318`. You could call it directly from any process, but the API is internal and may change. If you need a stable scripting interface, file an issue and we'll consider stabilizing.
|
|
51
51
|
|
|
52
|
+
## What can humans do that pi-chrome cannot?
|
|
53
|
+
|
|
54
|
+
pi-chrome controls web pages through Chrome extension APIs, page inspection, screenshots, and browser input. It is not full OS-level human control. Known gaps include native Chrome/OS dialogs (print/save-as, some permission bubbles, password-manager prompts), arbitrary OS app interaction, visual CAPTCHA challenges, hardware-backed auth (passkeys/security keys/biometrics), rich multi-touch/pinch/stylus gestures, and DOM inspection inside cross-origin iframes. Some of these can still be handled with screenshot + coordinate input or user assistance, but they are not first-class deterministic workflows.
|
|
55
|
+
|
|
52
56
|
## Does `chrome_evaluate` work on strict-CSP pages?
|
|
53
57
|
|
|
54
58
|
Not always. `chrome_evaluate` and `chrome_snapshot` run in the page's MAIN world through the Function constructor, so pages whose CSP blocks `'unsafe-eval'` can reject them. `chrome_screenshot`, `chrome_navigate`, tab tools, and real Chrome input still work because they use extension/browser APIs rather than page JavaScript.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-chrome",
|
|
3
|
-
"version": "0.15.
|
|
3
|
+
"version": "0.15.26",
|
|
4
4
|
"scripts": {
|
|
5
5
|
"version": "node scripts/sync-manifest-version.js",
|
|
6
6
|
"prepublishOnly": "node scripts/sync-manifest-version.js"
|
|
@@ -52,6 +52,7 @@
|
|
|
52
52
|
"files": [
|
|
53
53
|
"extensions",
|
|
54
54
|
"docs",
|
|
55
|
+
"test-suite",
|
|
55
56
|
"README.md",
|
|
56
57
|
"CHANGELOG.md",
|
|
57
58
|
"CONTRIBUTING.md",
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# pi-chrome browser-control benchmark
|
|
2
|
+
|
|
3
|
+
Static benchmark pages for evaluating tools that let agents control Chrome. The suite has two layers:
|
|
4
|
+
|
|
5
|
+
1. **Unit challenges** (`manifest.json`) — MiniWoB-style capability probes for
|
|
6
|
+
forms, scroll containers, contenteditable, files, frames, Shadow DOM,
|
|
7
|
+
network/console inspection, `isTrusted`, user activation, pointer paths, key
|
|
8
|
+
cadence, native controls, drag/drop, touch, paste, and scroll momentum.
|
|
9
|
+
2. **Long-horizon hermetic tasks** (`task-manifest.json`) — WebArena /
|
|
10
|
+
BrowserGym-inspired multi-step tasks with fresh run IDs and deterministic
|
|
11
|
+
programmatic graders.
|
|
12
|
+
|
|
13
|
+
## Run
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
cd test-suite
|
|
17
|
+
python3 -m http.server 8765
|
|
18
|
+
# open http://127.0.0.1:8765/ in the Chrome window pi-chrome controls
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Each challenge page exposes:
|
|
22
|
+
|
|
23
|
+
- `window.__challenge` — id
|
|
24
|
+
- `window.__verdict` — `"PENDING" | "PASS" | "FAIL" | "SKIP" | "WARN"`
|
|
25
|
+
- `window.__reason` — array of reasons
|
|
26
|
+
- `window.__events` — raw event log for forensics
|
|
27
|
+
|
|
28
|
+
`manifest.json` is the source of truth for unit-challenge metadata: category,
|
|
29
|
+
gate bucket, goal, expected result per mode, prerequisites, flake risk, manual
|
|
30
|
+
baseline status, and canonical tool recipe. `manifest.schema.json` documents the
|
|
31
|
+
manifest shape. Recipes express tool intent; runners may need to adapt
|
|
32
|
+
descriptive selectors (e.g. shadow/iframe notation), dynamic tab ids, and expand
|
|
33
|
+
path placeholders like `$PWD`.
|
|
34
|
+
|
|
35
|
+
`task-manifest.json` is the source of truth for long-horizon tasks: BrowserGym-style
|
|
36
|
+
`taskId`, seed, viewport, goal object, difficulty tier, max steps, declared
|
|
37
|
+
action subsets, reset/setup URL, validate hook, optional cheat recipe, and
|
|
38
|
+
programmatic grader expression. `task-manifest.schema.json` documents this shape.
|
|
39
|
+
`browsergym-action-space.json` records BrowserGym-compatible action subsets.
|
|
40
|
+
|
|
41
|
+
## Modes / expected outcomes
|
|
42
|
+
|
|
43
|
+
The same page can have different expected results depending on tool capability:
|
|
44
|
+
|
|
45
|
+
- `synthetic` — DOM-dispatched events / framework-aware setters. Fast and quiet.
|
|
46
|
+
- `trusted` — browser-trusted input, usually via `chrome.debugger`/CDP. Can show
|
|
47
|
+
Chrome's debugging banner.
|
|
48
|
+
- `manual` — human baseline in the same browser/profile.
|
|
49
|
+
|
|
50
|
+
Expected values in `manifest.json`:
|
|
51
|
+
|
|
52
|
+
- `PASS` / `FAIL` — deterministic target for that mode.
|
|
53
|
+
- `CONDITIONAL` — depends on browser policy, OS, device capability, permissions,
|
|
54
|
+
or an unreleased tool primitive. Inspect `prerequisites`, `notes`, and
|
|
55
|
+
`flakeRisk`.
|
|
56
|
+
|
|
57
|
+
Manual baselines are tracked separately with `manualBaseline`. `unverified`
|
|
58
|
+
means the manual expectation is a target, not a recorded contract.
|
|
59
|
+
|
|
60
|
+
## Gate buckets
|
|
61
|
+
|
|
62
|
+
Each unit challenge has a `gate` field:
|
|
63
|
+
|
|
64
|
+
- `core` — required release blocker for normal trusted-mode pi-chrome shipping.
|
|
65
|
+
- `conditional` — blocks only when declared prerequisites/capabilities are present
|
|
66
|
+
(clipboard, touch, dialogs, native UI, etc.).
|
|
67
|
+
- `quality` — adversarial humanization/fingerprint signal. Track regressions, but
|
|
68
|
+
do not block general ship without an explicit product decision.
|
|
69
|
+
|
|
70
|
+
## Recommended unit-challenge agent flow
|
|
71
|
+
|
|
72
|
+
1. Navigate to dashboard:
|
|
73
|
+
`http://127.0.0.1:8765/`.
|
|
74
|
+
2. Pick mode (`synthetic`, `trusted`, or `manual`) and clear local verdicts.
|
|
75
|
+
3. For each manifest row:
|
|
76
|
+
- `chrome_navigate` to `http://127.0.0.1:8765/<file>`.
|
|
77
|
+
- `chrome_snapshot` before acting; prefer snapshot `uid` over raw selector.
|
|
78
|
+
- Execute the listed `recipe`, adapting descriptive frame/shadow selectors to
|
|
79
|
+
whatever selectors/uids the tool exposes.
|
|
80
|
+
- Read:
|
|
81
|
+
```js
|
|
82
|
+
JSON.stringify({
|
|
83
|
+
v: window.__verdict,
|
|
84
|
+
r: window.__reason,
|
|
85
|
+
e: window.__events?.slice(-20)
|
|
86
|
+
})
|
|
87
|
+
```
|
|
88
|
+
4. Return to dashboard and compare actual verdicts with expected values.
|
|
89
|
+
5. Copy JSON report from dashboard for PRs or regression notes.
|
|
90
|
+
|
|
91
|
+
## Recommended long-horizon task flow
|
|
92
|
+
|
|
93
|
+
1. Load `task-manifest.json`.
|
|
94
|
+
2. Replace `$RUN_ID` in `startUrl` with a fresh value.
|
|
95
|
+
3. Navigate to the start URL and read the visible task instruction.
|
|
96
|
+
4. Solve using normal browser tools only; avoid direct state mutation unless the
|
|
97
|
+
benchmark mode explicitly allows evaluate-based actions.
|
|
98
|
+
5. Click **Grade now** or evaluate the task grader expression:
|
|
99
|
+
```js
|
|
100
|
+
JSON.stringify({ v: window.__taskVerdict, r: window.__taskReason })
|
|
101
|
+
```
|
|
102
|
+
6. Record action count, observations used, tools used, verdict, and reason.
|
|
103
|
+
|
|
104
|
+
## Design principles copied from browser-agent benchmarks
|
|
105
|
+
|
|
106
|
+
- Prefer hermetic sites and deterministic graders over live sites and LLM judges.
|
|
107
|
+
- Report action API and observation format; these strongly affect scores.
|
|
108
|
+
- Use difficulty tiers: L1 atomic, L2 compositional, L3 cross-page/context-rich.
|
|
109
|
+
- Include tedious cross-page memory and exact-value transfer tasks; short unit
|
|
110
|
+
probes hide these failures.
|
|
111
|
+
- Keep synthetic-event-gated tests because extension bridges face failures that
|
|
112
|
+
CDP/Playwright-style benchmarks usually do not measure.
|
|
113
|
+
|
|
114
|
+
## Challenge categories
|
|
115
|
+
|
|
116
|
+
- `trusted-input` — browser-trusted click/key events.
|
|
117
|
+
- `pointer-humanization` — paths, coordinates, movement continuity/rate.
|
|
118
|
+
- `keyboard` / `focus-keyboard` — typing fidelity, modifiers, Tab flows.
|
|
119
|
+
- `activation-gates` — clipboard/fullscreen/user activation.
|
|
120
|
+
- `scroll` / `scroll-visibility` — wheel events, momentum, IntersectionObserver.
|
|
121
|
+
- `drag-drop` — HTML5 drag/drop + `DataTransfer`.
|
|
122
|
+
- `clipboard` — OS/browser paste path.
|
|
123
|
+
- `native-controls` — controls that should use browser UI/keyboard semantics.
|
|
124
|
+
- `frameworks` / `editing` — React-style value tracking, contenteditable.
|
|
125
|
+
- `dom-complexity` / `frames` — Shadow DOM and iframe targeting.
|
|
126
|
+
- `files` — file attachment to `<input type=file>`.
|
|
127
|
+
- `observability` — console/network capture tools.
|
|
128
|
+
- `csp` — strict Content Security Policy fallback where eval/snapshot may fail.
|
|
129
|
+
- `lazy-loading` — dynamic DOM readiness and wait behavior.
|
|
130
|
+
- `fingerprint` — environment and stack fingerprint probes.
|
|
131
|
+
- `agent-safety` — hidden honeypots and safe target selection.
|
|
132
|
+
|
|
133
|
+
## Current challenge inventory
|
|
134
|
+
|
|
135
|
+
The dashboard renders this from `manifest.json`. In brief:
|
|
136
|
+
|
|
137
|
+
1. trusted click
|
|
138
|
+
2. trusted keyboard
|
|
139
|
+
3. webdriver/runtime flags
|
|
140
|
+
4. mouse entropy before click
|
|
141
|
+
5. click timing
|
|
142
|
+
6. click coordinate variation
|
|
143
|
+
7. pointer event properties
|
|
144
|
+
8. keyboard cadence
|
|
145
|
+
9. beforeinput/input order
|
|
146
|
+
10. user activation gates
|
|
147
|
+
11. honeypot safety
|
|
148
|
+
12. fingerprint consistency
|
|
149
|
+
13. focus order
|
|
150
|
+
14. wheel scroll
|
|
151
|
+
15. drag/drop `DataTransfer`
|
|
152
|
+
16. contenteditable selection
|
|
153
|
+
17. paste clipboard
|
|
154
|
+
18. native select
|
|
155
|
+
19. hover dwell
|
|
156
|
+
20. React value tracker
|
|
157
|
+
21. keyboard modifiers
|
|
158
|
+
22. touch events
|
|
159
|
+
23. stack trace fingerprint
|
|
160
|
+
24. viewport click coordinates
|
|
161
|
+
25. pointer continuity
|
|
162
|
+
26. mousemove rate
|
|
163
|
+
27. scroll momentum
|
|
164
|
+
28. intersection visibility
|
|
165
|
+
29. Shadow DOM controls
|
|
166
|
+
30. iframe targeting
|
|
167
|
+
31. file upload
|
|
168
|
+
32. keyboard Tab navigation
|
|
169
|
+
33. network/console capture
|
|
170
|
+
34. dialog handling
|
|
171
|
+
35. target blank popup
|
|
172
|
+
36. modal focus trap
|
|
173
|
+
37. autocomplete combobox
|
|
174
|
+
38. SPA route change
|
|
175
|
+
39. strict CSP screenshot/coordinate fallback
|
|
176
|
+
40. dynamic wait/readiness
|
|
177
|
+
41. explicit tab lifecycle
|
|
178
|
+
|
|
179
|
+
## Design notes
|
|
180
|
+
|
|
181
|
+
- A failure is useful only when compared to the expected outcome for the mode in use. Example: synthetic
|
|
182
|
+
`isTrusted` failing is expected and validates that the test detects quiet DOM
|
|
183
|
+
events.
|
|
184
|
+
- Some tests are capability-gated. Example: touch tests should be `SKIP`/manual
|
|
185
|
+
conditional on non-touch hardware.
|
|
186
|
+
- Fingerprint tests should warn before blocking. Real Chrome profiles can use
|
|
187
|
+
software WebGL in VMs, remote desktops, or policy-constrained environments.
|
|
188
|
+
- `notes/bypass-ideas.md` is historical guidance for older synthetic-only
|
|
189
|
+
versions. Prefer `manifest.json` for current expected outcomes.
|
|
190
|
+
- `notes/browsergym-compat.md` defines the reset/step/validate/observation/BID
|
|
191
|
+
contract for external BrowserGym-style agents.
|
|
192
|
+
- `notes/runner-spec.md`, `notes/scoring.md`, and `notes/profiles.md` define
|
|
193
|
+
runner output, scoring, retry policy, and environment metadata.
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
// Tiny shared harness for challenge pages.
|
|
2
|
+
// Each page calls Challenge.init({id, instructions, thresholds}) then Challenge.pass()/fail()/skip()/warn()
|
|
3
|
+
// based on its own listeners.
|
|
4
|
+
(function () {
|
|
5
|
+
const events = [];
|
|
6
|
+
const state = {
|
|
7
|
+
id: null,
|
|
8
|
+
verdict: "PENDING",
|
|
9
|
+
reason: [],
|
|
10
|
+
events,
|
|
11
|
+
details: [],
|
|
12
|
+
thresholds: {},
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
function render() {
|
|
16
|
+
const el = document.getElementById("__verdict");
|
|
17
|
+
if (!el) return;
|
|
18
|
+
el.textContent = state.verdict;
|
|
19
|
+
el.dataset.verdict = state.verdict;
|
|
20
|
+
el.style.background =
|
|
21
|
+
state.verdict === "PASS" ? "#1f7a1f" :
|
|
22
|
+
state.verdict === "FAIL" ? "#a11" :
|
|
23
|
+
state.verdict === "SKIP" ? "#76520b" :
|
|
24
|
+
state.verdict === "WARN" ? "#6b5d00" : "#444";
|
|
25
|
+
el.style.color = "#fff";
|
|
26
|
+
const r = document.getElementById("__reason");
|
|
27
|
+
if (r) r.textContent = state.reason.join("\n");
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function log(name, detail) {
|
|
31
|
+
events.push({ t: performance.now(), name, ...detail });
|
|
32
|
+
if (events.length > 500) events.shift();
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
const Challenge = {
|
|
36
|
+
init({ id, instructions, thresholds = {} }) {
|
|
37
|
+
state.id = id;
|
|
38
|
+
state.thresholds = parseThresholds(thresholds);
|
|
39
|
+
document.title = `[${state.verdict}] ${id}`;
|
|
40
|
+
const root = document.body;
|
|
41
|
+
const bar = document.createElement("div");
|
|
42
|
+
bar.style.cssText =
|
|
43
|
+
"position:sticky;top:0;background:#111;color:#eee;padding:8px 12px;font:13px monospace;border-bottom:1px solid #333;z-index:9999";
|
|
44
|
+
bar.innerHTML = `
|
|
45
|
+
<b>${id}</b>
|
|
46
|
+
<span id="__verdict" style="margin-left:8px;padding:2px 8px;border-radius:4px;background:#444">PENDING</span>
|
|
47
|
+
<span style="margin-left:12px;opacity:.7">${instructions}</span>
|
|
48
|
+
<pre id="__reason" style="white-space:pre-wrap;margin:6px 0 0;color:#bbb;font:12px monospace"></pre>
|
|
49
|
+
`;
|
|
50
|
+
root.insertBefore(bar, root.firstChild);
|
|
51
|
+
window.__challenge = id;
|
|
52
|
+
window.__verdict = state.verdict;
|
|
53
|
+
window.__reason = state.reason;
|
|
54
|
+
window.__events = state.events;
|
|
55
|
+
render();
|
|
56
|
+
},
|
|
57
|
+
pass(...reasons) {
|
|
58
|
+
if (state.verdict === "FAIL") return; // sticky
|
|
59
|
+
state.verdict = "PASS";
|
|
60
|
+
state.reason.push(...reasons.map((r) => "✓ " + r));
|
|
61
|
+
window.__verdict = state.verdict;
|
|
62
|
+
document.title = `[PASS] ${state.id}`;
|
|
63
|
+
persist(); render();
|
|
64
|
+
},
|
|
65
|
+
fail(...reasons) {
|
|
66
|
+
state.verdict = "FAIL";
|
|
67
|
+
state.reason.push(...reasons.map((r) => "✗ " + r));
|
|
68
|
+
window.__verdict = state.verdict;
|
|
69
|
+
document.title = `[FAIL] ${state.id}`;
|
|
70
|
+
persist(); render();
|
|
71
|
+
},
|
|
72
|
+
skip(...reasons) {
|
|
73
|
+
if (state.verdict === "FAIL" || state.verdict === "PASS") return;
|
|
74
|
+
state.verdict = "SKIP";
|
|
75
|
+
state.reason.push(...reasons.map((r) => "↷ " + r));
|
|
76
|
+
window.__verdict = state.verdict;
|
|
77
|
+
document.title = `[SKIP] ${state.id}`;
|
|
78
|
+
persist(); render();
|
|
79
|
+
},
|
|
80
|
+
warn(...reasons) {
|
|
81
|
+
if (state.verdict === "FAIL" || state.verdict === "PASS") return;
|
|
82
|
+
state.verdict = "WARN";
|
|
83
|
+
state.reason.push(...reasons.map((r) => "! " + r));
|
|
84
|
+
window.__verdict = state.verdict;
|
|
85
|
+
document.title = `[WARN] ${state.id}`;
|
|
86
|
+
persist(); render();
|
|
87
|
+
},
|
|
88
|
+
partial({ name, pass, reason, data }) {
|
|
89
|
+
state.details.push({ name, pass: !!pass, reason: reason || "", data });
|
|
90
|
+
log("partial", { name, pass: !!pass, reason, data });
|
|
91
|
+
persist(); render();
|
|
92
|
+
return !!pass;
|
|
93
|
+
},
|
|
94
|
+
finishPartials() {
|
|
95
|
+
const failed = state.details.filter(d => !d.pass);
|
|
96
|
+
if (failed.length) Challenge.fail(...failed.map(d => `${d.name}: ${d.reason || "failed"}`));
|
|
97
|
+
else Challenge.pass(...state.details.map(d => `${d.name}: ok`));
|
|
98
|
+
},
|
|
99
|
+
getThreshold(name, fallback) {
|
|
100
|
+
return Object.prototype.hasOwnProperty.call(state.thresholds, name) ? state.thresholds[name] : fallback;
|
|
101
|
+
},
|
|
102
|
+
log,
|
|
103
|
+
state,
|
|
104
|
+
};
|
|
105
|
+
|
|
106
|
+
function persist() {
|
|
107
|
+
try {
|
|
108
|
+
localStorage.setItem(
|
|
109
|
+
"pi-chrome-suite:" + state.id,
|
|
110
|
+
JSON.stringify({ id: state.id, verdict: state.verdict, reason: state.reason, details: state.details, thresholds: state.thresholds, events: state.events.slice(-50), ts: Date.now() })
|
|
111
|
+
);
|
|
112
|
+
} catch {}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
function parseThresholds(defaults) {
|
|
116
|
+
const out = { ...defaults };
|
|
117
|
+
try {
|
|
118
|
+
const qs = new URLSearchParams(location.search);
|
|
119
|
+
for (const [k, v] of qs) {
|
|
120
|
+
if (!k.startsWith("threshold.")) continue;
|
|
121
|
+
const key = k.slice("threshold.".length);
|
|
122
|
+
const num = Number(v);
|
|
123
|
+
out[key] = Number.isFinite(num) ? num : v;
|
|
124
|
+
}
|
|
125
|
+
} catch {}
|
|
126
|
+
return out;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
window.Challenge = Challenge;
|
|
130
|
+
})();
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
body { font: 14px system-ui, sans-serif; margin: 0; background: #1a1a1a; color: #eee; }
|
|
2
|
+
main { padding: 24px; max-width: 720px; }
|
|
3
|
+
code { background:#222;padding:1px 4px;border-radius:3px }
|
|
4
|
+
a { color: #6cf; }
|
|
5
|
+
button, input, select { font: 14px system-ui, sans-serif; }
|
|
6
|
+
button { padding: 7px 11px; border-radius: 6px; border: 1px solid #555; background: #262626; color: #eee; cursor:pointer; }
|
|
7
|
+
button:hover { background:#333; }
|
|
8
|
+
input, select { padding: 6px 8px; border-radius: 6px; border: 1px solid #555; background:#111; color:#eee; }
|
|
9
|
+
table { border-collapse: collapse; width: 100%; font: 13px ui-monospace, SFMono-Regular, Menlo, monospace; }
|
|
10
|
+
th, td { border-bottom: 1px solid #333; padding: 8px; vertical-align: top; text-align:left; }
|
|
11
|
+
th { position: sticky; top: 0; background:#151515; z-index:1; }
|
|
12
|
+
tr.ok { background: rgba(31,122,31,.08); }
|
|
13
|
+
tr.mismatch { background: rgba(170,17,17,.12); }
|
|
14
|
+
tr.conditional { background: rgba(118,82,11,.12); }
|
|
15
|
+
.panel { background:#202020; border:1px solid #333; border-radius:10px; padding:12px; margin:16px 0; }
|
|
16
|
+
.controls { display:flex; flex-wrap:wrap; gap:10px; align-items:center; }
|
|
17
|
+
.hint, .notes, .reason { color:#aaa; font-size:12px; margin-top:4px; }
|
|
18
|
+
.summary { display:flex; flex-wrap:wrap; gap:8px; margin: 14px 0; }
|
|
19
|
+
.pill { display:inline-block; border-radius:999px; padding:2px 8px; font-size:12px; font-weight:700; background:#444; color:#fff; }
|
|
20
|
+
.pass { background:#1f7a1f; }
|
|
21
|
+
.fail { background:#a11; }
|
|
22
|
+
.skip, .conditional { background:#76520b; }
|
|
23
|
+
.warn { background:#6b5d00; }
|
|
24
|
+
.pending { background:#444; }
|
|
25
|
+
.expected { background:#2b4b6b; }
|
|
26
|
+
.risk { color:#ffd479; }
|
|
27
|
+
.code, pre { background:#111; color:#eee; padding:12px; border-radius:6px; overflow:auto; }
|
|
28
|
+
.copy-fallback { display:none; width:100%; min-height:180px; margin-top:10px; font:12px ui-monospace, SFMono-Regular, Menlo, monospace; background:#111; color:#eee; border:1px solid #555; border-radius:6px; padding:10px; }
|
|
29
|
+
.copy-fallback[data-copied="true"] { border-color:#1f7a1f; }
|
|
30
|
+
.copy-fallback[data-copied="false"] { border-color:#a11; }
|
|
31
|
+
details summary { cursor:pointer; color:#9cf; }
|
|
Binary file
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"subsets": {
|
|
3
|
+
"chat": ["send_msg_to_user"],
|
|
4
|
+
"infeas": ["report_infeasible"],
|
|
5
|
+
"bid": ["click", "dblclick", "hover", "fill", "clear", "press", "focus", "select_option", "scroll", "drag_and_drop", "upload_file"],
|
|
6
|
+
"coord": ["mouse_move", "mouse_up", "mouse_down", "mouse_click", "mouse_dblclick", "mouse_drag_and_drop", "mouse_upload_file", "scroll_at", "keyboard_down", "keyboard_up", "keyboard_press", "keyboard_type", "keyboard_insert_text"],
|
|
7
|
+
"nav": ["go_back", "go_forward", "goto"],
|
|
8
|
+
"tab": ["tab_close", "tab_focus", "new_tab"]
|
|
9
|
+
},
|
|
10
|
+
"signatures": {
|
|
11
|
+
"fill": "fill(bid, value, enable_autocomplete_menu=false)",
|
|
12
|
+
"click": "click(bid, button='left', modifiers=[])",
|
|
13
|
+
"dblclick": "dblclick(bid, button='left', modifiers=[])",
|
|
14
|
+
"hover": "hover(bid)",
|
|
15
|
+
"press": "press(bid, key_comb)",
|
|
16
|
+
"focus": "focus(bid)",
|
|
17
|
+
"clear": "clear(bid)",
|
|
18
|
+
"select_option": "select_option(bid, options)",
|
|
19
|
+
"scroll": "scroll(delta_x, delta_y)",
|
|
20
|
+
"drag_and_drop": "drag_and_drop(from_bid, to_bid)",
|
|
21
|
+
"upload_file": "upload_file(bid, file)",
|
|
22
|
+
"mouse_move": "mouse_move(x, y)",
|
|
23
|
+
"mouse_click": "mouse_click(x, y, button='left')",
|
|
24
|
+
"mouse_dblclick": "mouse_dblclick(x, y, button='left')",
|
|
25
|
+
"mouse_up": "mouse_up(x, y, button='left')",
|
|
26
|
+
"mouse_down": "mouse_down(x, y, button='left')",
|
|
27
|
+
"mouse_drag_and_drop": "mouse_drag_and_drop(from_x, from_y, to_x, to_y)",
|
|
28
|
+
"scroll_at": "scroll_at(x, y, dx, dy)",
|
|
29
|
+
"keyboard_press": "keyboard_press(key)",
|
|
30
|
+
"keyboard_down": "keyboard_down(key)",
|
|
31
|
+
"keyboard_up": "keyboard_up(key)",
|
|
32
|
+
"keyboard_type": "keyboard_type(text)",
|
|
33
|
+
"keyboard_insert_text": "keyboard_insert_text(text)",
|
|
34
|
+
"goto": "goto(url)",
|
|
35
|
+
"go_back": "go_back()",
|
|
36
|
+
"go_forward": "go_forward()",
|
|
37
|
+
"tab_close": "tab_close()",
|
|
38
|
+
"tab_focus": "tab_focus(index)",
|
|
39
|
+
"new_tab": "new_tab()",
|
|
40
|
+
"send_msg_to_user": "send_msg_to_user(text)",
|
|
41
|
+
"report_infeasible": "report_infeasible(reason)",
|
|
42
|
+
"noop": "noop(wait_ms=1000)"
|
|
43
|
+
}
|
|
44
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<meta charset="utf-8">
|
|
3
|
+
<title>01 isTrusted click</title>
|
|
4
|
+
<link rel="stylesheet" href="../_style.css">
|
|
5
|
+
<script src="../_lib.js"></script>
|
|
6
|
+
<body>
|
|
7
|
+
<main>
|
|
8
|
+
<p>Goal: click the green button. Page only accepts <code>event.isTrusted === true</code>.</p>
|
|
9
|
+
<button id="go" style="padding:20px 32px;font-size:18px;background:#1f7a1f;color:#fff;border:0;border-radius:6px">Click me</button>
|
|
10
|
+
</main>
|
|
11
|
+
<script>
|
|
12
|
+
// Challenge 01: the page's verdict depends only on whether the click on #go
// is a trusted (user-agent generated) event.
// `Challenge` is presumably provided by ../_lib.js, which this page loads — confirm there.
Challenge.init({
  id: "is-trusted-click",
  instructions: "click the green button; isTrusted must be true",
});

const btn = document.getElementById("go");

// Click is observed with a capture-phase listener registered on the button.
btn.addEventListener(
  "click",
  (event) => {
    const trusted = event.isTrusted;
    Challenge.log("click", { isTrusted: trusted, x: event.clientX, y: event.clientY });

    if (!trusted) {
      Challenge.fail("click.isTrusted === false (synthetic dispatchEvent)");
      return;
    }
    Challenge.pass("click.isTrusted === true");
  },
  { capture: true }
);

// pointerdown precedes click, so an untrusted pointer event is reported
// as a failure before the click handler runs.
btn.addEventListener("pointerdown", (event) => {
  const { isTrusted, pressure, pointerType } = event;
  Challenge.log("pointerdown", { isTrusted, pressure, pointerType });

  if (!isTrusted) {
    Challenge.fail("pointerdown.isTrusted === false");
  }
});
|
|
27
|
+
</script>
|
|
28
|
+
</body>
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<meta charset="utf-8">
|
|
3
|
+
<title>02 isTrusted keyboard</title>
|
|
4
|
+
<link rel="stylesheet" href="../_style.css">
|
|
5
|
+
<script src="../_lib.js"></script>
|
|
6
|
+
<body>
|
|
7
|
+
<main>
|
|
8
|
+
<p>Goal: type <code>hello</code> into the box. Each character must arrive via trusted keydown and input events.</p>
|
|
9
|
+
<input id="t" placeholder="type hello" style="font-size:18px;padding:8px 12px;width:240px">
|
|
10
|
+
</main>
|
|
11
|
+
<script>
|
|
12
|
+
// Challenge 02: the text "hello" must be typed into #t through trusted
// keyboard events. The verdict is evaluated on keyup once the field holds
// exactly "hello".
// `Challenge` is presumably provided by ../_lib.js, which this page loads — confirm there.
Challenge.init({
  id: "is-trusted-keyboard",
  instructions: "type 'hello'; every keydown.isTrusted must be true",
});
const t = document.getElementById("t");

// Event tallies. `inputs` counts every input event (trusted or not);
// `untrustedInputs` counts only the untrusted subset.
let trustedKeys = 0, untrustedKeys = 0, inputs = 0, untrustedInputs = 0;

t.addEventListener("keydown", (e) => {
  Challenge.log("keydown", { key: e.key, isTrusted: e.isTrusted });
  if (e.isTrusted) trustedKeys++; else untrustedKeys++;
});

t.addEventListener("input", (e) => {
  Challenge.log("input", { isTrusted: e.isTrusted, inputType: e.inputType, data: e.data });
  inputs++; if (!e.isTrusted) untrustedInputs++;
});

t.addEventListener("keyup", () => {
  const expected = "hello";
  if (t.value !== expected) return;

  // One keydown and one input event are required per expected character.
  const required = expected.length;

  if (untrustedKeys || untrustedInputs) {
    Challenge.fail(`untrusted keydowns=${untrustedKeys}, untrusted inputs=${untrustedInputs}`);
  } else if (trustedKeys < required) {
    Challenge.fail(`only ${trustedKeys} trusted keydowns (need >=${required})`);
  } else if (inputs < required) {
    // Without this check, trusted keydowns of non-character keys followed by
    // a scripted `t.value = "hello"` would pass while reporting 0 trusted
    // inputs: the setter guard below only fires before the first keydown.
    // untrustedInputs is 0 in this branch, so `inputs` counts trusted events only.
    Challenge.fail(`only ${inputs} trusted inputs (need >=${required})`);
  } else {
    Challenge.pass(`${trustedKeys} trusted keydowns, ${inputs} trusted inputs`);
  }
});

// Programmatic value-set detection.
// Shadow the prototype's `value` accessor with an own accessor on this element
// so script writes to `t.value` are logged. Reads and writes are delegated to
// the original HTMLInputElement.prototype accessor.
const desc = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, "value");
Object.defineProperty(t, "value", {
  get() { return desc.get.call(this); },
  set(v) {
    Challenge.log("value-set", { v });
    // Only a write that happens before any keydown has been seen is failed here.
    if (untrustedKeys === 0 && trustedKeys === 0) {
      Challenge.fail("value set programmatically without any keydown");
    }
    return desc.set.call(this, v);
  },
});
|
|
49
|
+
</script>
|
|
50
|
+
</body>
|