@mochi.js/core 0.1.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/launch.ts CHANGED
@@ -12,6 +12,8 @@
12
12
 
13
13
  import { deriveMatrix, type ProfileV1 } from "@mochi.js/consistency";
14
14
  import { resolveBinary } from "./binary";
15
+ import { type GeoConsistencyMode, reconcileGeoConsistency } from "./geo-consistency";
16
+ import { probeExitGeo } from "./geo-probe";
15
17
  import { spawnChromium } from "./proc";
16
18
  import { parseProxyUrl } from "./proxy-auth";
17
19
  import { Session } from "./session";
@@ -91,6 +93,16 @@ export interface LaunchOptions {
91
93
  args?: string[];
92
94
  out?: { traceDir?: string };
93
95
  timeout?: number;
96
+ /**
97
+ * Opt out of mochi's "auto-add `--no-sandbox` when running as root on
98
+ * Linux" fallback. Default `false` (the fallback is on). When `true`,
99
+ * mochi will NOT inject `--no-sandbox` even under root + Linux — useful
100
+ * if you've configured a SUID `chrome-sandbox` helper and want to keep
101
+ * the user-namespace sandbox active. The launch will crash with EPIPE
102
+ * if the SUID setup is wrong, but you keep stealth posture intact
103
+ * (`--no-sandbox` is a fingerprint leak per PLAN.md §8.6).
104
+ */
105
+ allowRootWithSandbox?: boolean;
94
106
  /**
95
107
  * When `true`, the {@link Session} skips both `buildPayload` (no payload
96
108
  * is compiled) and `Page.addScriptToEvaluateOnNewDocument` on every new
@@ -107,6 +119,27 @@ export interface LaunchOptions {
107
119
  * Chromium); task 0040.
108
120
  */
109
121
  bypassInject?: boolean;
122
+ /**
123
+ * When `true`, re-applies the harness/CI-only Chromium flags
124
+ * (`--disable-component-update`, `--disable-default-apps`,
125
+ * `--disable-background-networking`, `--disable-sync`, plus a noise-
126
+ * reduction `--disable-features=` block) on top of the production
127
+ * default flag set. Used by `@mochi.js/harness`, CI runs, and
128
+ * `mochi capture` flows where update traffic, default-apps auto-install,
129
+ * sync, and feed prefetches would inject non-determinism into baseline
130
+ * collection or stealth conformance.
131
+ *
132
+ * Defaults to `false` — production users get a cleaner flag set without
133
+ * the passive command-line bot-tells that patchright explicitly removes
134
+ * from its Playwright fork (`chromiumSwitchesPatch.ts:20-34`) and that
135
+ * `puppeteer-real-browser` strips for the same reason
136
+ * (`lib/cjs/index.js:57-58`).
137
+ *
138
+ * Pairs with — but is independent of — {@link bypassInject}. Capture
139
+ * flows set both `true`; harness conformance runs set `hermetic: true`
140
+ * with full inject pipeline active. PLAN.md §8.6 + task 0256.
141
+ */
142
+ hermetic?: boolean;
110
143
  /**
111
144
  * Convenience layer toggles for common bot-defense widgets. When
112
145
  * `challenges.turnstile.autoClick` is `true`, every page returned by
@@ -120,6 +153,35 @@ export interface LaunchOptions {
120
153
  * solving is v0.3+).
121
154
  */
122
155
  challenges?: ChallengeLaunchOptions;
156
+ /**
157
+ * Reconcile `(matrix.timezone, matrix.locale)` against the proxy's
158
+ * exit-IP geolocation. Closes the cross-layer leak where a US profile
159
+ * over an EU proxy would have `Date.getTimezoneOffset()` reporting PT
160
+ * while the IP geolocates to UTC+1 — the canonical bot signature.
161
+ *
162
+ * - `"privacy-fallback"` *(default)* — on mismatch (or probe failure),
163
+ * override the matrix to UTC + `en-US`. The session fingerprints as
164
+ * a privacy-conscious user (Tor / Brave / hardened-FF style), which
165
+ * is benign in most threat models.
166
+ * - `"auto-correct"` — on mismatch, override the matrix's timezone
167
+ * with the IP's timezone and the locale with a primary-locale
168
+ * guess for the IP's country. Most "stealth" but trusts mochi's
169
+ * IP-derived defaults over the user's declared profile.
170
+ * - `"strict"` — throw `GeoMismatchError` on mismatch. The user must
171
+ * change profile or change proxy. Probe failure (null) does NOT
172
+ * throw under strict — that's a network blip, not a mismatch.
173
+ * - `"off"` — skip the probe entirely. Use in offline tests / when
174
+ * the probe service is rate-limited.
175
+ *
176
+ * The probe is a single GET through wreq (using the matrix's
177
+ * `wreqPreset`, so the geo service sees the same JA4/headers as user
178
+ * traffic). 4-attempt cap, 2s per endpoint. Probe results are NOT
179
+ * cached across sessions — proxy IPs rotate.
180
+ *
181
+ * @see PLAN.md §9 (relational consistency, IP/TZ/Locale axis)
182
+ * @see tasks/0262-ip-tz-locale-exit-consistency.md
183
+ */
184
+ geoConsistency?: GeoConsistencyMode;
123
185
  }
124
186
 
125
187
  /**
@@ -129,26 +191,100 @@ export interface LaunchOptions {
129
191
  export async function launch(opts: LaunchOptions): Promise<Session> {
130
192
  const binary = await resolveBinary(opts.binary);
131
193
  const normalized = normalizeProxy(opts.proxy);
194
+
195
+ // Resolve the `MatrixV1` BEFORE spawning so matrix-derived values flow
196
+ // into both the `--lang` flag (task 0251) and `--window-size` flag
197
+ // (task 0252). The matrix is otherwise read post-spawn for inject;
198
+ // deriving early is cheap (~µs, pure function) and lets us close the
199
+ // I-5 leaks between Chromium's native network/OS-window state and the
200
+ // JS-layer spoof.
201
+ //
202
+ // Inline `ProfileV1` objects flow straight through; string profile ids
203
+ // are resolved against a placeholder profile until `@mochi.js/profiles`
204
+ // ships its first capture (phase 0.4). The matrix is bit-stable per
205
+ // `(profile, seed)` excluding the `derivedAt` timestamp.
206
+ const profile = resolveProfile(opts.profile);
207
+ const matrix = deriveMatrix(profile, opts.seed);
208
+
209
+ // Task 0262 — exit-IP / TZ / locale reconciliation.
210
+ //
211
+ // Probe the apparent exit IP through the configured proxy (using wreq
212
+ // with the matrix's `wreqPreset` so the geo service sees the same JA4
213
+ // / headers as user traffic). Then cross-reference against
214
+ // `(matrix.timezone, matrix.locale)` and apply `geoConsistency`. The
215
+ // adjusted matrix flows into BOTH `spawnChromium` (so `--lang` reflects
216
+ // any override) AND `Session` (so inject + the CDP
217
+ // `Emulation.setTimezoneOverride` send pick it up). PLAN.md §9.
218
+ //
219
+ // `"off"` short-circuits the probe — the probe call itself respects
220
+ // the mode so we don't pay the network round-trip in offline tests.
221
+ const geoMode: GeoConsistencyMode = opts.geoConsistency ?? "privacy-fallback";
222
+ let adjustedMatrix = matrix;
223
+ if (geoMode !== "off") {
224
+ const geo = await probeExitGeo({
225
+ ...(normalized?.netProxy !== undefined ? { proxy: normalized.netProxy } : {}),
226
+ matrix,
227
+ });
228
+ // Strict mode throws GeoMismatchError on real mismatch; let it
229
+ // propagate up so callers can recover (the orchestrator surfaced
230
+ // it as the canonical failure mode for "wrong proxy for profile").
231
+ const result = reconcileGeoConsistency(matrix, geo, geoMode);
232
+ adjustedMatrix = result.matrix;
233
+ if (result.action === "privacy-fallback" || result.action === "auto-correct") {
234
+ console.warn(
235
+ `[mochi] geoConsistency=${geoMode}: ${result.action} applied — ${result.reason ?? "(no reason)"}`,
236
+ );
237
+ }
238
+ }
239
+
132
240
  const proc = await spawnChromium({
133
241
  binary,
134
242
  extraArgs: opts.args,
135
243
  headless: opts.headless ?? false,
244
+ // Opt-out for the auto-no-sandbox-as-root fallback (default: fallback
245
+ // is on so first-run on a Linux server box doesn't crash).
246
+ ...(opts.allowRootWithSandbox === true ? { allowRootWithSandbox: true } : {}),
136
247
  // Chromium rejects inline auth on `--proxy-server`; pass the
137
248
  // auth-stripped server URL.
138
249
  ...(normalized !== undefined ? { proxy: normalized.server } : {}),
250
+ // Primary BCP-47 locale → `--lang=<value>`. Locks the network-layer
251
+ // `Accept-Language` header to the JS spoof (PLAN.md I-5). The full
252
+ // multi-locale list still flows through `matrix.languages` to the
253
+ // inject layer's `navigator.languages` spoof; Chromium derives the
254
+ // q-weighted `Accept-Language` value from the single `--lang` primary
255
+ // automatically. Task 0251.
256
+ locale: adjustedMatrix.locale,
257
+ // Pin OS-level outer window from the matrix's display geometry so
258
+ // `window.outerWidth/outerHeight` (which reads from the OS window,
259
+ // NOT the JS-spoofed `screen.*`) matches the spoof. Closes the
260
+ // `fingerprint-scan.com` 800×600 leak under `--headless=new`.
261
+ // UDC fixes the same issue at `__init__.py:410-411`. Task 0252.
262
+ //
263
+ // (`adjustedMatrix.display` === `matrix.display` since geo reconcile
264
+ // only touches timezone/locale/languages — but we use the adjusted
265
+ // ref for forward-compat.)
266
+ ...(Number.isInteger(adjustedMatrix.display.width) &&
267
+ Number.isInteger(adjustedMatrix.display.height) &&
268
+ adjustedMatrix.display.width > 0 &&
269
+ adjustedMatrix.display.height > 0
270
+ ? {
271
+ windowSize: {
272
+ width: adjustedMatrix.display.width,
273
+ height: adjustedMatrix.display.height,
274
+ },
275
+ }
276
+ : {}),
277
+ // Hermetic harness/CI escape hatch — re-applies the patchright-trim
278
+ // flags (`--disable-component-update`, `--disable-default-apps`,
279
+ // `--disable-background-networking`, `--disable-sync`, hermetic
280
+ // `--disable-features=` extras). Default `false` keeps production users
281
+ // off the passive command-line bot-tell list. Task 0256, PLAN.md §8.6.
282
+ ...(opts.hermetic === true ? { hermetic: true } : {}),
139
283
  });
140
284
 
141
- // Resolve the `MatrixV1` for this session via the consistency engine.
142
- // Inline `ProfileV1` objects flow straight through; string profile ids
143
- // are resolved against a placeholder profile until `@mochi.js/profiles`
144
- // ships its first capture (phase 0.4). The matrix is bit-stable per
145
- // `(profile, seed)` excluding the `derivedAt` timestamp.
146
- const profile = resolveProfile(opts.profile);
147
- const matrix = deriveMatrix(profile, opts.seed);
148
-
149
285
  const session = new Session({
150
286
  proc,
151
- matrix,
287
+ matrix: adjustedMatrix,
152
288
  seed: opts.seed,
153
289
  ...(opts.timeout !== undefined ? { defaultTimeoutMs: opts.timeout } : {}),
154
290
  ...(opts.bypassInject === true ? { bypassInject: true } : {}),
@@ -0,0 +1,110 @@
1
+ /**
2
+ * `ElementHandle` — lightweight wrapper around a CDP `RemoteObject` that lets
3
+ * callers operate on an element resolved via the closed-shadow piercing
4
+ * locator (`Page.querySelectorPiercing`).
5
+ *
6
+ * The handle is intentionally minimal — Phase 0.2 only needs enough surface
7
+ * for the Turnstile auto-clicker to ask "is this an iframe whose src matches
8
+ * cf-turnstile?" and then position a click. Wider parity with Playwright's
9
+ * `ElementHandle` (waitFor, fill, hover, screenshot…) is deferred — those
10
+ * compose on top of the same primitives once they're needed.
11
+ *
12
+ * Lifecycle: the underlying `objectId` is bound to a CDP `Runtime` execution
13
+ * context. Closing the page invalidates every handle the page produced; we
14
+ * don't try to release them via `Runtime.releaseObject` because there's no
15
+ * `Runtime.enable` in this session (PLAN.md §8.2). Stale handles surface as
16
+ * `Cannot find context with specified id` errors from the next CDP call,
17
+ * which is fine for a v0.2 surface.
18
+ *
19
+ * @see PLAN.md §8.2 / §8.3
20
+ * @see tasks/0253-closed-shadow-piercing-locator.md
21
+ */
22
+
23
+ import type { MessageRouter } from "../cdp/router";
24
+ import type { CdpSessionId, RemoteObject } from "../cdp/types";
25
+
26
/**
 * Construction parameters for an {@link ElementHandle}: the CDP transport
 * plus the identifiers that pin the handle to one element.
 */
export interface ElementHandleInit {
  /** Router used to send CDP commands on behalf of the handle. */
  router: MessageRouter;
  /** CDP session the commands are addressed to. */
  sessionId: CdpSessionId;
  /** CDP `objectId` of the element's `RemoteObject`. */
  objectId: string;
  /** CDP `backendNodeId` — stable across DOM mutations. */
  backendNodeId: number;
}
33
+
34
/**
 * A handle to a single DOM element exposed to host-side automation. Issued
 * by `Page.querySelectorPiercing` / `Page.querySelectorAllPiercing`.
 *
 * Every operation is a `Runtime.callFunctionOn` round-trip against the
 * element's `objectId`, with `returnByValue: true`, routed to the handle's
 * CDP session.
 */
export class ElementHandle {
  private readonly router: MessageRouter;
  private readonly sessionId: CdpSessionId;
  private readonly objectId: string;
  private readonly _backendNodeId: number;

  constructor(init: ElementHandleInit) {
    this.router = init.router;
    this.sessionId = init.sessionId;
    this.objectId = init.objectId;
    this._backendNodeId = init.backendNodeId;
  }

  /** The CDP `backendNodeId` for the element — stable across DOM mutations. */
  get backendNodeId(): number {
    return this._backendNodeId;
  }

  /**
   * Read a single attribute via `Runtime.callFunctionOn`.
   *
   * @param name - Attribute name to read.
   * @returns The attribute value, or `null` when the attribute is absent
   *   (mirrors `Element.getAttribute`).
   * @throws Error when the in-page call reports an exception.
   */
  async getAttribute(name: string): Promise<string | null> {
    const value = await this.callOnElement(
      "function(n) { var v = this.getAttribute(n); return v === null ? null : String(v); }",
      [{ value: name }],
    );
    return value === null || value === undefined ? null : String(value);
  }

  /**
   * Get the element's text content via `Runtime.callFunctionOn`.
   *
   * @returns The text content, or `null` when the page reports none.
   * @throws Error when the in-page call reports an exception.
   */
  async textContent(): Promise<string | null> {
    const value = await this.callOnElement("function() { return this.textContent; }");
    return value === null || value === undefined ? null : String(value);
  }

  /**
   * Evaluate a function bound to this element (the handle is `this`). The
   * function's source text (`fn.toString()`) is what gets shipped to the
   * page, so it cannot close over host-side variables and takes no
   * arguments; the result comes back by value (`returnByValue: true`), so
   * DOM nodes cannot be returned. Pass a `function` expression — an arrow
   * function has no `this` of its own and will not see the element.
   *
   * @param fn - Function to run in the page with the element as `this`.
   * @returns The by-value result of `fn`.
   * @throws Error when `fn` throws inside the page.
   */
  async evaluate<T>(fn: (this: Element) => T): Promise<T> {
    return (await this.callOnElement(fn.toString())) as T;
  }

  /**
   * Shared `Runtime.callFunctionOn` round-trip used by every public method.
   *
   * CDP reports an exception thrown by the in-page function as a *successful*
   * response carrying `exceptionDetails`; without this check the failure
   * would be silently flattened into a bogus return value.
   */
  private async callOnElement(
    functionDeclaration: string,
    args?: ReadonlyArray<{ value: unknown }>,
  ): Promise<unknown> {
    const response = await this.router.send<{
      result: RemoteObject;
      exceptionDetails?: { text?: string; exception?: { description?: string } };
    }>(
      "Runtime.callFunctionOn",
      {
        objectId: this.objectId,
        functionDeclaration,
        // Only attach `arguments` when the caller supplied some, so the
        // argument-less calls keep the exact wire shape they always had.
        ...(args !== undefined ? { arguments: args } : {}),
        returnByValue: true,
      },
      { sessionId: this.sessionId },
    );

    const details = response.exceptionDetails;
    if (details !== undefined) {
      const detail = details.exception?.description ?? details.text ?? "unknown error";
      throw new Error(`ElementHandle: in-page function threw: ${detail}`);
    }
    return response.result.value;
  }
}
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Closed-shadow piercing locator.
3
+ *
4
+ * Walks a tree returned by `DOM.getDocument({ depth: -1, pierce: true })` and
5
+ * yields `backendNodeId`s for every element that matches a parsed CSS
6
+ * selector — including elements inside **closed** shadow roots, which
7
+ * `DOM.querySelector(..., pierce: true)` does NOT traverse from the parent
8
+ * document. Patchright solves the same problem in `_customFindElementsByParsed`
9
+ * (`framesPatch.ts:868-1012`); this is mochi's port — we kept the recursive-walk
10
+ * shape but simplified the selector subset (CSS only — no XPath; see task
11
+ * 0253 brief for the rationale).
12
+ *
13
+ * The walker recurses through:
14
+ * - `node.children[]` (regular DOM descendants)
15
+ * - `node.shadowRoots[]` (BOTH `shadowRootType:"open"` and `"closed"` — the
16
+ * pierce flag yields both; the matcher just doesn't care which kind it is)
17
+ * - `node.contentDocument` (iframes — same-origin only; OOPIF subframes
18
+ * surface as separate targets and are out of scope here)
19
+ * - `node.templateContent` (template fragment, rare but cheap to walk)
20
+ *
21
+ * It deliberately does NOT recurse into:
22
+ * - `pseudoElements` — `::before` / `::after` aren't real DOM nodes for
23
+ * selector matching purposes; CDP yields them but they'd produce
24
+ * spurious matches on `*` selectors.
25
+ *
26
+ * The walker keeps a *flat* ancestor chain across shadow boundaries so the
27
+ * descendant-combinator matcher can reason about "div .btn" correctly even
28
+ * when the `.btn` is inside a closed shadow rooted at `<div>`. This mirrors
29
+ * how DOM's regular ancestor walk behaves under `composedPath` semantics —
30
+ * patchright does the same.
31
+ *
32
+ * Performance: O(N) in DOM size per call. Acceptable for v0.2 (per task
33
+ * brief — a per-page cache layer is a v0.3+ concern).
34
+ *
35
+ * @see PLAN.md §8.2 — `DOM.getDocument` / `DOM.resolveNode` are not forbidden
36
+ * @see tasks/0253-closed-shadow-piercing-locator.md
37
+ */
38
+
39
+ import type { PierceDomNode } from "../cdp/types";
40
+ import { matchSelector, type ParsedSelector } from "./selector";
41
+
42
/** One element that satisfied the selector during a piercing walk. */
export interface PierceMatch {
  /** CDP `backendNodeId` of the matched element — stable across DOM mutations. */
  backendNodeId: number;
  /** CDP node id (per-DOMSession-instance; less stable than the backend id). */
  nodeId: number;
  /** The matched tree node itself, kept for diagnostics and tests. */
  node: PierceDomNode;
}
50
+
51
/**
 * Collect every element under `root` that matches `selector`.
 *
 * Results come back in depth-first pre-order (a parent precedes its
 * descendants), which is the order callers expect from `querySelectorAll`.
 *
 * @param root - Root of the pierced DOM tree to search.
 * @param selector - Parsed selector each element is tested against.
 * @param limit - Optional cap on the number of matches; the walk stops as
 *   soon as the cap is reached. `Page.querySelectorPiercing` passes `1`;
 *   `querySelectorAllPiercing` leaves it undefined (no cap).
 * @returns The matches found, in traversal order.
 */
export function findPiercingMatches(
  root: PierceDomNode,
  selector: ParsedSelector,
  limit?: number,
): PierceMatch[] {
  const matches: PierceMatch[] = [];
  // The walk starts with an empty ancestor chain: the root has no element
  // ancestors.
  const ancestorChain: PierceDomNode[] = [];
  walk(root, selector, ancestorChain, matches, limit);
  return matches;
}
69
+
70
/**
 * Recursive depth-first, pre-order visitor behind `findPiercingMatches`.
 *
 * @param node - Node currently being visited.
 * @param selector - Parsed selector handed to `matchSelector`.
 * @param ancestors - Element ancestors of `node`, outermost first. Mutated
 *   in place while descending and restored before returning.
 * @param out - Accumulator that matches are appended to.
 * @param limit - Optional cap on `out.length`; `undefined` means no cap.
 * @returns `true` once the cap has been reached (callers must stop
 *   walking), `false` when this subtree was fully visited without hitting it.
 */
function walk(
  node: PierceDomNode,
  selector: ParsedSelector,
  ancestors: PierceDomNode[],
  out: PierceMatch[],
  limit: number | undefined,
): boolean {
  const capReached = (): boolean => limit !== undefined && out.length >= limit;
  if (capReached()) return true;

  // Only element nodes (nodeType 1) can match or act as ancestors; document,
  // shadow-root and text nodes are traversed but never recorded.
  const isElementNode = node.nodeType === 1;
  if (isElementNode && matchSelector(selector, node, ancestors)) {
    out.push({ backendNodeId: node.backendNodeId, nodeId: node.nodeId, node });
    if (capReached()) return true;
  }

  // Everything to descend into, in visiting order: regular children, then
  // shadow roots (open and closed alike), then an iframe's content document,
  // then a <template>'s content. `pseudoElements` are intentionally left out.
  const { children, shadowRoots, contentDocument, templateContent } = node;
  const subtrees: PierceDomNode[] = [
    ...(children !== undefined ? children : []),
    ...(shadowRoots !== undefined ? shadowRoots : []),
    ...(contentDocument !== undefined ? [contentDocument] : []),
    ...(templateContent !== undefined ? [templateContent] : []),
  ];

  // The ancestor chain stays flat across shadow / frame boundaries, so a
  // descendant combinator can see elements outside the shadow root.
  if (isElementNode) ancestors.push(node);
  // `some` stops at the first subtree that reports the cap as reached.
  const stopped = subtrees.some((subtree) => walk(subtree, selector, ancestors, out, limit));
  if (isElementNode) ancestors.pop();
  return stopped;
}