@pagepocket/capture-http-puppeteer-unit 0.13.0 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/capture-http-puppeteer-unit.d.ts +1 -1
- package/dist/capture-http-puppeteer-unit.js +1 -1
- package/dist/internal/puppeteer-adapter/worker-cdp-types.d.ts +52 -0
- package/dist/internal/puppeteer-adapter/worker-cdp-types.js +46 -0
- package/dist/internal/puppeteer-adapter/worker-network.d.ts +22 -0
- package/dist/internal/puppeteer-adapter/worker-network.js +170 -0
- package/dist/internal/puppeteer-adapter.d.ts +3 -3
- package/dist/internal/puppeteer-adapter.js +34 -6
- package/dist/internal/utils/ensure-browser.d.ts +13 -0
- package/dist/internal/utils/ensure-browser.js +42 -0
- package/dist/internal/utils/puppeteer-auto-scroll.d.ts +25 -0
- package/dist/internal/utils/puppeteer-auto-scroll.js +110 -0
- package/dist/internal/utils/sandbox-args.d.ts +18 -0
- package/dist/internal/utils/sandbox-args.js +24 -0
- package/dist/internal/utils/trigger-actions.js +26 -31
- package/package.json +4 -3
|
@@ -3,7 +3,7 @@ import { type PuppeteerAdapterOptions } from "./internal/puppeteer-adapter.js";
|
|
|
3
3
|
export type CaptureHttpPuppeteerUnitOptions = PuppeteerAdapterOptions;
|
|
4
4
|
export declare class CaptureHttpPuppeteerUnit extends Unit {
|
|
5
5
|
readonly id = "captureHttpPuppeteer";
|
|
6
|
-
readonly description = "Capturing page
|
|
6
|
+
readonly description = "Capturing page";
|
|
7
7
|
private adapterOptions;
|
|
8
8
|
constructor(options?: CaptureHttpPuppeteerUnitOptions);
|
|
9
9
|
run(ctx: UnitContext, rt: UnitRuntime): Promise<{
|
|
@@ -44,7 +44,7 @@ export class CaptureHttpPuppeteerUnit extends Unit {
|
|
|
44
44
|
constructor(options) {
|
|
45
45
|
super();
|
|
46
46
|
this.id = "captureHttpPuppeteer";
|
|
47
|
-
this.description = "Capturing page
|
|
47
|
+
this.description = "Capturing page";
|
|
48
48
|
this.adapterOptions = options ?? {};
|
|
49
49
|
}
|
|
50
50
|
async run(ctx, rt) {
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CDP payload types and helpers for worker network capture.
|
|
3
|
+
*
|
|
4
|
+
* These are a minimal subset of the Chrome DevTools Protocol shapes needed
|
|
5
|
+
* to monitor `Network.*` events on WebWorker targets.
|
|
6
|
+
*/
|
|
7
|
+
import type { ResourceType } from "@pagepocket/lib";
|
|
8
|
+
/**
 * Subset of the CDP `Network.requestWillBeSent` payload that the worker
 * network capture reads.
 */
export type CdpRequestWillBeSent = {
    /** Identifier the browser assigns to the request; shared by its later events. */
    requestId: string;
    /** Event time; multiplied by 1000 by `resolveTimestampMs` when `wallTime` is absent. */
    timestamp?: number;
    /** Event time preferred by `resolveTimestampMs`; also multiplied by 1000. */
    wallTime?: number;
    /** Resource type as reported by the browser; lower-cased by `mapResourceType`. */
    type?: string;
    /** What triggered the request, when the browser reports it. */
    initiator?: {
        type?: string;
        url?: string;
    };
    /** The outgoing request itself. */
    request: {
        url: string;
        method: string;
        /** Raw header bag; values are stringified by `normalizeHeaders`. */
        headers?: Record<string, unknown>;
    };
};
|
|
23
|
+
/**
 * Subset of the CDP `Network.responseReceived` payload that the worker
 * network capture reads.
 */
export type CdpResponseReceived = {
    /** Identifier of the request this response belongs to. */
    requestId: string;
    /** Event time; see `resolveTimestampMs` for how it is converted. */
    timestamp?: number;
    /** Event time preferred over `timestamp` by `resolveTimestampMs`. */
    wallTime?: number;
    /** Resource type as reported by the browser; lower-cased by `mapResourceType`. */
    type?: string;
    /** Response metadata; the body is fetched separately once loading finishes. */
    response: {
        url: string;
        status: number;
        statusText?: string;
        /** Raw header bag; values are stringified by `normalizeHeaders`. */
        headers?: Record<string, unknown>;
        mimeType?: string;
        fromDiskCache?: boolean;
        fromServiceWorker?: boolean;
    };
};
|
|
38
|
+
/**
 * Subset of the CDP `Network.loadingFinished` payload: signals that the
 * response body of `requestId` can now be requested.
 */
export type CdpLoadingFinished = {
    /** Identifier of the request that finished loading. */
    requestId: string;
    /** Event time; see `resolveTimestampMs` for how it is converted. */
    timestamp?: number;
};
|
|
42
|
+
/**
 * Subset of the CDP `Network.loadingFailed` payload: reports that the
 * request identified by `requestId` did not complete.
 */
export type CdpLoadingFailed = {
    /** Identifier of the request that failed. */
    requestId: string;
    /** Event time; see `resolveTimestampMs` for how it is converted. */
    timestamp?: number;
    /** Browser-provided failure description, forwarded verbatim. */
    errorText: string;
};
|
|
47
|
+
/**
 * Flattens a raw header bag into a string-to-string map. Entries whose value
 * is `undefined` are dropped and array values are joined with ", ".
 */
export declare const normalizeHeaders: (headers?: Record<string, unknown>) => Record<string, string>;
/**
 * Lower-cases a browser-reported resource type. Returns `undefined` when the
 * input is missing or empty.
 */
export declare const mapResourceType: (input?: string) => ResourceType | undefined;
/**
 * Converts an event payload's time to milliseconds: `wallTime` is preferred,
 * then `timestamp`, and `Date.now()` is used when neither is a number.
 */
export declare const resolveTimestampMs: (payload: {
    timestamp?: number;
    wallTime?: number;
}) => number;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Helpers
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
/**
 * Flattens a raw header bag into a plain string-to-string map.
 *
 * - A missing (falsy) bag yields an empty object.
 * - Entries whose value is `undefined` are skipped.
 * - Array values are joined with ", "; everything else goes through `String()`.
 */
export const normalizeHeaders = (headers) => {
    const flattened = {};
    if (!headers) {
        return flattened;
    }
    for (const [name, rawValue] of Object.entries(headers)) {
        if (rawValue !== undefined) {
            flattened[name] = Array.isArray(rawValue) ? rawValue.join(", ") : String(rawValue);
        }
    }
    return flattened;
};
|
|
18
|
+
/**
 * Normalizes a browser-reported resource type to lower case.
 *
 * Well-known values ("document", "stylesheet", "script", "image", "font",
 * "media", "xhr", "fetch", "other") and any unrecognized value are treated
 * the same way: the lower-cased string is returned unchanged otherwise.
 * A missing or empty input yields `undefined`.
 */
export const mapResourceType = (input) => (input ? input.toLowerCase() : undefined);
|
|
38
|
+
/**
 * Picks an event time from a CDP payload and converts it to milliseconds.
 *
 * `wallTime` wins over `timestamp`; whichever is chosen is multiplied by 1000
 * and rounded. When neither field is a number the current `Date.now()` is
 * returned instead.
 *
 * NOTE(review): the CDP reference describes `timestamp` as a monotonic clock
 * rather than time since the Unix epoch, so values derived from it may not be
 * comparable with `wallTime`-derived ones — confirm against consumers.
 */
export const resolveTimestampMs = (payload) => {
    const { wallTime, timestamp } = payload;
    const seconds = typeof wallTime === "number"
        ? wallTime
        : typeof timestamp === "number"
            ? timestamp
            : undefined;
    return seconds === undefined ? Date.now() : Math.round(seconds * 1000);
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CDP-based network capture for WebWorker targets.
|
|
3
|
+
*
|
|
4
|
+
* Puppeteer's `page.on("request")` does not fire for requests made inside
|
|
5
|
+
* dedicated WebWorkers. This module attaches a CDP `Network` session to each
|
|
6
|
+
* worker so those requests are captured alongside the main-page traffic.
|
|
7
|
+
*
|
|
8
|
+
* Non-intercepting: we only listen (`Network.enable`), never pause requests.
|
|
9
|
+
*/
|
|
10
|
+
import type { NetworkEventHandlers } from "@pagepocket/lib";
|
|
11
|
+
import type { Page } from "puppeteer";
|
|
12
|
+
export type WorkerNetworkCapture = {
|
|
13
|
+
/** Stop all worker network listeners and detach future-worker handling. */
|
|
14
|
+
stop: () => Promise<void>;
|
|
15
|
+
};
|
|
16
|
+
/**
|
|
17
|
+
* Start capturing network traffic from all current and future WebWorkers on
|
|
18
|
+
* the given page.
|
|
19
|
+
*
|
|
20
|
+
* Returns a handle whose `stop()` tears down every worker listener.
|
|
21
|
+
*/
|
|
22
|
+
export declare const startWorkerNetworkCapture: (page: Page, handlers: NetworkEventHandlers) => WorkerNetworkCapture;
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import { mapResourceType, normalizeHeaders, resolveTimestampMs } from "./worker-cdp-types.js";
|
|
2
|
+
const attachToWorker = async (worker, handlers, requestIdPrefix) => {
|
|
3
|
+
// CdpWebWorker (the Chromium-backed Puppeteer implementation) exposes a
|
|
4
|
+
// `client` getter that returns the underlying CDPSession. The base
|
|
5
|
+
// `WebWorker` class does not expose this, so we access it via cast.
|
|
6
|
+
const workerRef = worker;
|
|
7
|
+
const cdpSession = workerRef.client;
|
|
8
|
+
if (!cdpSession || typeof cdpSession.send !== "function") {
|
|
9
|
+
return undefined;
|
|
10
|
+
}
|
|
11
|
+
try {
|
|
12
|
+
await cdpSession.send("Network.enable");
|
|
13
|
+
}
|
|
14
|
+
catch {
|
|
15
|
+
return undefined;
|
|
16
|
+
}
|
|
17
|
+
let requestSequence = 0;
|
|
18
|
+
const cdpToLogical = new Map();
|
|
19
|
+
const getLogicalId = (cdpRequestId) => {
|
|
20
|
+
const existing = cdpToLogical.get(cdpRequestId);
|
|
21
|
+
if (existing) {
|
|
22
|
+
return existing;
|
|
23
|
+
}
|
|
24
|
+
const logicalId = `${requestIdPrefix}-${requestSequence++}`;
|
|
25
|
+
cdpToLogical.set(cdpRequestId, logicalId);
|
|
26
|
+
return logicalId;
|
|
27
|
+
};
|
|
28
|
+
const storedResponses = new Map();
|
|
29
|
+
const requestUrls = new Map();
|
|
30
|
+
const onRequestWillBeSent = (raw) => {
|
|
31
|
+
const payload = raw;
|
|
32
|
+
const logicalId = getLogicalId(payload.requestId);
|
|
33
|
+
const timestamp = resolveTimestampMs(payload);
|
|
34
|
+
requestUrls.set(logicalId, payload.request.url);
|
|
35
|
+
const event = {
|
|
36
|
+
type: "request",
|
|
37
|
+
requestId: logicalId,
|
|
38
|
+
url: payload.request.url,
|
|
39
|
+
method: payload.request.method,
|
|
40
|
+
headers: normalizeHeaders(payload.request.headers),
|
|
41
|
+
resourceType: mapResourceType(payload.type),
|
|
42
|
+
initiator: payload.initiator
|
|
43
|
+
? { type: payload.initiator.type, url: payload.initiator.url }
|
|
44
|
+
: undefined,
|
|
45
|
+
timestamp
|
|
46
|
+
};
|
|
47
|
+
handlers.onEvent(event);
|
|
48
|
+
};
|
|
49
|
+
const onResponseReceived = (raw) => {
|
|
50
|
+
const payload = raw;
|
|
51
|
+
const logicalId = getLogicalId(payload.requestId);
|
|
52
|
+
storedResponses.set(payload.requestId, {
|
|
53
|
+
logicalId,
|
|
54
|
+
response: payload.response,
|
|
55
|
+
resourceType: mapResourceType(payload.type)
|
|
56
|
+
});
|
|
57
|
+
};
|
|
58
|
+
const onLoadingFinished = async (raw) => {
|
|
59
|
+
const payload = raw;
|
|
60
|
+
const stored = storedResponses.get(payload.requestId);
|
|
61
|
+
if (!stored) {
|
|
62
|
+
return;
|
|
63
|
+
}
|
|
64
|
+
storedResponses.delete(payload.requestId);
|
|
65
|
+
let bodyBytes;
|
|
66
|
+
try {
|
|
67
|
+
const result = await cdpSession.send("Network.getResponseBody", {
|
|
68
|
+
requestId: payload.requestId
|
|
69
|
+
});
|
|
70
|
+
const typed = result;
|
|
71
|
+
if (typed.base64Encoded) {
|
|
72
|
+
const binary = atob(typed.body);
|
|
73
|
+
const bytes = new Uint8Array(binary.length);
|
|
74
|
+
for (let byteIndex = 0; byteIndex < binary.length; byteIndex += 1) {
|
|
75
|
+
bytes[byteIndex] = binary.charCodeAt(byteIndex);
|
|
76
|
+
}
|
|
77
|
+
bodyBytes = bytes;
|
|
78
|
+
}
|
|
79
|
+
else {
|
|
80
|
+
bodyBytes = new TextEncoder().encode(typed.body);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
catch {
|
|
84
|
+
// Body may be unavailable (204, redirects, already-destroyed worker).
|
|
85
|
+
}
|
|
86
|
+
if (bodyBytes && bodyBytes.byteLength === 0) {
|
|
87
|
+
bodyBytes = undefined;
|
|
88
|
+
}
|
|
89
|
+
const responseEvent = {
|
|
90
|
+
type: "response",
|
|
91
|
+
requestId: stored.logicalId,
|
|
92
|
+
url: stored.response.url,
|
|
93
|
+
status: stored.response.status,
|
|
94
|
+
statusText: stored.response.statusText,
|
|
95
|
+
headers: normalizeHeaders(stored.response.headers),
|
|
96
|
+
mimeType: stored.response.mimeType,
|
|
97
|
+
fromDiskCache: stored.response.fromDiskCache,
|
|
98
|
+
fromServiceWorker: stored.response.fromServiceWorker,
|
|
99
|
+
timestamp: resolveTimestampMs(payload),
|
|
100
|
+
body: bodyBytes ? { kind: "buffer", data: bodyBytes } : undefined
|
|
101
|
+
};
|
|
102
|
+
handlers.onEvent(responseEvent);
|
|
103
|
+
};
|
|
104
|
+
const onLoadingFailed = (raw) => {
|
|
105
|
+
const payload = raw;
|
|
106
|
+
const logicalId = getLogicalId(payload.requestId);
|
|
107
|
+
storedResponses.delete(payload.requestId);
|
|
108
|
+
const failedEvent = {
|
|
109
|
+
type: "failed",
|
|
110
|
+
requestId: logicalId,
|
|
111
|
+
url: requestUrls.get(logicalId) ?? "",
|
|
112
|
+
errorText: payload.errorText,
|
|
113
|
+
timestamp: resolveTimestampMs(payload)
|
|
114
|
+
};
|
|
115
|
+
handlers.onEvent(failedEvent);
|
|
116
|
+
};
|
|
117
|
+
cdpSession.on("Network.requestWillBeSent", onRequestWillBeSent);
|
|
118
|
+
cdpSession.on("Network.responseReceived", onResponseReceived);
|
|
119
|
+
cdpSession.on("Network.loadingFinished", onLoadingFinished);
|
|
120
|
+
cdpSession.on("Network.loadingFailed", onLoadingFailed);
|
|
121
|
+
const stop = async () => {
|
|
122
|
+
cdpSession.off("Network.requestWillBeSent", onRequestWillBeSent);
|
|
123
|
+
cdpSession.off("Network.responseReceived", onResponseReceived);
|
|
124
|
+
cdpSession.off("Network.loadingFinished", onLoadingFinished);
|
|
125
|
+
cdpSession.off("Network.loadingFailed", onLoadingFailed);
|
|
126
|
+
try {
|
|
127
|
+
await cdpSession.send("Network.disable");
|
|
128
|
+
}
|
|
129
|
+
catch {
|
|
130
|
+
// Worker may already be gone.
|
|
131
|
+
}
|
|
132
|
+
};
|
|
133
|
+
return { stop };
|
|
134
|
+
};
|
|
135
|
+
/**
|
|
136
|
+
* Start capturing network traffic from all current and future WebWorkers on
|
|
137
|
+
* the given page.
|
|
138
|
+
*
|
|
139
|
+
* Returns a handle whose `stop()` tears down every worker listener.
|
|
140
|
+
*/
|
|
141
|
+
export const startWorkerNetworkCapture = (page, handlers) => {
|
|
142
|
+
const sessions = [];
|
|
143
|
+
let workerSequence = 0;
|
|
144
|
+
const handleWorker = async (worker) => {
|
|
145
|
+
const prefix = `pptr-worker-${Date.now()}-${workerSequence++}`;
|
|
146
|
+
const session = await attachToWorker(worker, handlers, prefix);
|
|
147
|
+
if (session) {
|
|
148
|
+
sessions.push(session);
|
|
149
|
+
}
|
|
150
|
+
};
|
|
151
|
+
// Attach to workers that already exist.
|
|
152
|
+
const castPage = page;
|
|
153
|
+
if (typeof castPage.workers === "function") {
|
|
154
|
+
for (const existingWorker of castPage.workers()) {
|
|
155
|
+
void handleWorker(existingWorker);
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
// Listen for future workers.
|
|
159
|
+
const onWorkerCreated = (event) => {
|
|
160
|
+
void handleWorker(event);
|
|
161
|
+
};
|
|
162
|
+
page.on("workercreated", onWorkerCreated);
|
|
163
|
+
const stop = async () => {
|
|
164
|
+
page.off("workercreated", onWorkerCreated);
|
|
165
|
+
const pending = sessions.map((session) => session.stop());
|
|
166
|
+
await Promise.all(pending);
|
|
167
|
+
sessions.length = 0;
|
|
168
|
+
};
|
|
169
|
+
return { stop };
|
|
170
|
+
};
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import type { NetworkEventHandlers, NetworkInterceptorAdapter, TriggerAction, InterceptTarget, InterceptSession } from "@pagepocket/lib";
|
|
2
|
-
import { type Browser, type GoToOptions, type
|
|
2
|
+
import { type Browser, type GoToOptions, type PuppeteerLaunchOptions, type Page } from "puppeteer";
|
|
3
3
|
export type PuppeteerAdapterOptions = {
|
|
4
4
|
browser?: Browser;
|
|
5
5
|
page?: Page;
|
|
6
|
-
launch?: (options?:
|
|
7
|
-
launchOptions?:
|
|
6
|
+
launch?: (options?: PuppeteerLaunchOptions) => Promise<Browser>;
|
|
7
|
+
launchOptions?: PuppeteerLaunchOptions;
|
|
8
8
|
gotoOptions?: GoToOptions;
|
|
9
9
|
triggerActions?: TriggerAction[];
|
|
10
10
|
};
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import puppeteer from "puppeteer";
|
|
2
2
|
import { createEmitRequest, createOnRequestFailed, createOnResponse, createRequestIdStore } from "./puppeteer-adapter/events.js";
|
|
3
3
|
import { buildNavigate, ensureHtmlTargetLoaded } from "./puppeteer-adapter/targets.js";
|
|
4
|
+
import { startWorkerNetworkCapture } from "./puppeteer-adapter/worker-network.js";
|
|
4
5
|
import { readDomHtml } from "./utils/dom-html.js";
|
|
5
6
|
import { getEnvString } from "./utils/env.js";
|
|
6
|
-
import {
|
|
7
|
+
import { isMissingChromeError } from "./utils/errors.js";
|
|
8
|
+
import { ensureBrowser } from "./utils/ensure-browser.js";
|
|
7
9
|
import { runTriggerActions } from "./utils/trigger-actions.js";
|
|
10
|
+
import { getSandboxArgs } from "./utils/sandbox-args.js";
|
|
8
11
|
/**
|
|
9
12
|
* Internal adapter for the capture plugin.
|
|
10
13
|
*
|
|
@@ -38,17 +41,25 @@ export class PuppeteerAdapter {
|
|
|
38
41
|
const envExecutablePath = getEnvString("PUPPETEER_EXECUTABLE_PATH");
|
|
39
42
|
const launchOptionsFromUser = this.options.launchOptions;
|
|
40
43
|
const executablePath = launchOptionsFromUser?.executablePath ?? envExecutablePath;
|
|
44
|
+
const sandboxArgs = getSandboxArgs();
|
|
45
|
+
const mergedArgs = [...(launchOptionsFromUser?.args ?? []), ...sandboxArgs];
|
|
46
|
+
const baseLaunchOptions = {
|
|
47
|
+
...launchOptionsFromUser,
|
|
48
|
+
...(executablePath ? { executablePath } : {}),
|
|
49
|
+
...(mergedArgs.length > 0 ? { args: mergedArgs } : {})
|
|
50
|
+
};
|
|
41
51
|
try {
|
|
42
|
-
browser = await doLaunch(
|
|
43
|
-
...launchOptionsFromUser,
|
|
44
|
-
...(executablePath ? { executablePath } : {})
|
|
45
|
-
});
|
|
52
|
+
browser = await doLaunch(baseLaunchOptions);
|
|
46
53
|
}
|
|
47
54
|
catch (error) {
|
|
48
55
|
if (!isMissingChromeError(error)) {
|
|
49
56
|
throw error;
|
|
50
57
|
}
|
|
51
|
-
|
|
58
|
+
const downloadedPath = await ensureBrowser();
|
|
59
|
+
browser = await doLaunch({
|
|
60
|
+
...baseLaunchOptions,
|
|
61
|
+
executablePath: downloadedPath
|
|
62
|
+
});
|
|
52
63
|
}
|
|
53
64
|
ownsBrowser = true;
|
|
54
65
|
}
|
|
@@ -62,6 +73,17 @@ export class PuppeteerAdapter {
|
|
|
62
73
|
if (!page || typeof page.on !== "function") {
|
|
63
74
|
throw new Error("PuppeteerAdapter requires a Puppeteer Page instance.");
|
|
64
75
|
}
|
|
76
|
+
// Activate the CDP Fetch domain with a catch-all pattern.
|
|
77
|
+
// Without Fetch.enable, Chrome silently omits Network events for certain
|
|
78
|
+
// XHR requests (e.g. Feishu's encrypted-image blob downloads via
|
|
79
|
+
// XMLHttpRequest to CDN). Enabling Fetch forces every request through
|
|
80
|
+
// the CDP pipeline, making them visible to Network.* listeners.
|
|
81
|
+
// We immediately continue each paused request so page behavior is unchanged.
|
|
82
|
+
const cdpSession = await page.createCDPSession();
|
|
83
|
+
await cdpSession.send("Fetch.enable", { patterns: [{ urlPattern: "*" }] });
|
|
84
|
+
cdpSession.on("Fetch.requestPaused", (event) => {
|
|
85
|
+
cdpSession.send("Fetch.continueRequest", { requestId: event.requestId }).catch(() => { });
|
|
86
|
+
});
|
|
65
87
|
const { getRequestId, has } = createRequestIdStore();
|
|
66
88
|
const emitRequest = createEmitRequest(handlers, getRequestId);
|
|
67
89
|
const onRequest = (request) => emitRequest(request, Date.now());
|
|
@@ -80,6 +102,9 @@ export class PuppeteerAdapter {
|
|
|
80
102
|
page.on("request", onRequest);
|
|
81
103
|
page.on("response", onResponse);
|
|
82
104
|
page.on("requestfailed", onRequestFailed);
|
|
105
|
+
// Capture network traffic from WebWorkers via CDP Network.enable.
|
|
106
|
+
// Puppeteer's page-level events do not fire for worker requests.
|
|
107
|
+
const workerCapture = startWorkerNetworkCapture(page, handlers);
|
|
83
108
|
const { navigate, awaitLastNavigation } = buildNavigate(page, this.options.gotoOptions);
|
|
84
109
|
let htmlLoaded = false;
|
|
85
110
|
const whenHtmlLoaded = async () => {
|
|
@@ -120,6 +145,9 @@ export class PuppeteerAdapter {
|
|
|
120
145
|
getDomHtml,
|
|
121
146
|
startCapture,
|
|
122
147
|
stop: async () => {
|
|
148
|
+
await workerCapture.stop();
|
|
149
|
+
await cdpSession.send("Fetch.disable").catch(() => { });
|
|
150
|
+
await cdpSession.detach().catch(() => { });
|
|
123
151
|
page.off("request", onRequest);
|
|
124
152
|
page.off("response", onResponse);
|
|
125
153
|
page.off("requestfailed", onRequestFailed);
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ensures a Chrome executable is available for Puppeteer.
|
|
3
|
+
*
|
|
4
|
+
* When Puppeteer cannot find a compatible Chrome installation on the system,
|
|
5
|
+
* this helper automatically downloads Chrome for Testing using `@puppeteer/browsers`.
|
|
6
|
+
* The downloaded browser is cached in `~/.cache/puppeteer` and reused on
|
|
7
|
+
* subsequent runs.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* const executablePath = await ensureBrowser();
|
|
11
|
+
* const browser = await puppeteer.launch({ executablePath });
|
|
12
|
+
*/
|
|
13
|
+
export declare const ensureBrowser: () => Promise<string>;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import os from "os";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import { Browser, computeExecutablePath, detectBrowserPlatform, getInstalledBrowsers, install, resolveBuildId } from "@puppeteer/browsers";
|
|
4
|
+
const DEFAULT_CACHE_DIR = path.join(os.homedir(), ".cache", "puppeteer");
|
|
5
|
+
/**
 * Makes sure a Chrome build is present in the Puppeteer cache directory and
 * returns the path of its executable.
 *
 * The build id of the "stable" channel is resolved first. If that exact
 * build is already listed among the browsers installed in the cache
 * directory, its executable path is computed and returned; otherwise the
 * build is installed through `@puppeteer/browsers` and the path reported by
 * the installer is returned. Progress is written with `console.log`.
 *
 * Usage:
 *   const executablePath = await ensureBrowser();
 *   const browser = await puppeteer.launch({ executablePath });
 *
 * @returns Path to the Chrome executable.
 * @throws Error when the browser platform cannot be detected.
 */
export const ensureBrowser = async () => {
    const platform = detectBrowserPlatform();
    if (!platform) {
        throw new Error(`Cannot detect browser platform for: ${os.platform()} (${os.arch()})`);
    }
    const cacheDir = DEFAULT_CACHE_DIR;
    const buildId = await resolveBuildId(Browser.CHROME, platform, "stable");
    const cachedBrowsers = await getInstalledBrowsers({ cacheDir });
    const isCached = cachedBrowsers.some((entry) => entry.browser === Browser.CHROME && entry.buildId === buildId);
    if (isCached) {
        return computeExecutablePath({ browser: Browser.CHROME, buildId, cacheDir });
    }
    console.log(`Chrome not found. Downloading Chrome (buildId: ${buildId})...`);
    const freshInstall = await install({ browser: Browser.CHROME, cacheDir, buildId, platform });
    const downloadedPath = freshInstall.executablePath;
    console.log(`Chrome downloaded to ${downloadedPath}`);
    return downloadedPath;
};
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { Page } from "puppeteer";
|
|
2
|
+
/** Tuning knobs for `autoScrollToEnd`; every field is optional. */
export type AutoScrollOptions = {
    /** Distance scrolled on each tick, in pixels. Defaults to 100. */
    distancePx?: number;
    /** Delay between ticks, in milliseconds. Defaults to 100. */
    intervalMs?: number;
    /** Consecutive ticks spent at the bottom without growth before stopping. Defaults to 8. */
    maxIdleTicks?: number;
    /** Upper bound on the number of ticks. Defaults to 300. */
    maxSteps?: number;
};
|
|
8
|
+
/**
|
|
9
|
+
* Incrementally scrolls a page to the bottom until height stabilizes.
|
|
10
|
+
*
|
|
11
|
+
* Handles both traditional pages (window is the scroller) and SPA layouts
|
|
12
|
+
* where the actual scrollable area is a nested container with
|
|
13
|
+
* `overflow: auto | scroll` (e.g. Feishu, Notion, Google Docs).
|
|
14
|
+
*
|
|
15
|
+
* Detection logic:
|
|
16
|
+
* 1. If `document.scrollingElement.scrollHeight > window.innerHeight`,
|
|
17
|
+
* use `window.scrollBy` (classic path).
|
|
18
|
+
* 2. Otherwise, find the DOM element with the largest scrollable overflow
|
|
19
|
+
* (`scrollHeight - clientHeight`) and scroll that element instead.
|
|
20
|
+
*
|
|
21
|
+
* Usage:
|
|
22
|
+
* await autoScrollToEnd(page);
|
|
23
|
+
* await autoScrollToEnd(page, { intervalMs: 150, distancePx: 200 });
|
|
24
|
+
*/
|
|
25
|
+
export declare const autoScrollToEnd: (page: Page, options?: AutoScrollOptions) => Promise<void>;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Incrementally scrolls a page to the bottom until height stabilizes.
|
|
3
|
+
*
|
|
4
|
+
* Handles both traditional pages (window is the scroller) and SPA layouts
|
|
5
|
+
* where the actual scrollable area is a nested container with
|
|
6
|
+
* `overflow: auto | scroll` (e.g. Feishu, Notion, Google Docs).
|
|
7
|
+
*
|
|
8
|
+
* Detection logic:
|
|
9
|
+
* 1. If `document.scrollingElement.scrollHeight > window.innerHeight`,
|
|
10
|
+
* use `window.scrollBy` (classic path).
|
|
11
|
+
* 2. Otherwise, find the DOM element with the largest scrollable overflow
|
|
12
|
+
* (`scrollHeight - clientHeight`) and scroll that element instead.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* await autoScrollToEnd(page);
|
|
16
|
+
* await autoScrollToEnd(page, { intervalMs: 150, distancePx: 200 });
|
|
17
|
+
*/
|
|
18
|
+
export const autoScrollToEnd = async (page, options = {}) => {
|
|
19
|
+
const distancePx = options.distancePx ?? 100;
|
|
20
|
+
const intervalMs = options.intervalMs ?? 100;
|
|
21
|
+
const maxIdleTicks = options.maxIdleTicks ?? 8;
|
|
22
|
+
const maxSteps = options.maxSteps ?? 300;
|
|
23
|
+
await page.evaluate(async (scrollOptions) => {
|
|
24
|
+
/**
|
|
25
|
+
* Find the primary scroll target.
|
|
26
|
+
*
|
|
27
|
+
* Returns `null` when the window itself is scrollable (caller should
|
|
28
|
+
* fall back to `window.scrollBy`). Returns an `Element` when a nested
|
|
29
|
+
* container owns the scroll.
|
|
30
|
+
*/
|
|
31
|
+
const findScrollContainer = () => {
|
|
32
|
+
const root = document.scrollingElement ?? document.documentElement ?? document.body;
|
|
33
|
+
if (root && root.scrollHeight > window.innerHeight + 1) {
|
|
34
|
+
return null;
|
|
35
|
+
}
|
|
36
|
+
let best = null;
|
|
37
|
+
let bestDelta = 0;
|
|
38
|
+
const elements = Array.from(document.querySelectorAll("*"));
|
|
39
|
+
for (const el of elements) {
|
|
40
|
+
const style = getComputedStyle(el);
|
|
41
|
+
const oy = style.overflowY;
|
|
42
|
+
const isScrollable = oy === "auto" || oy === "scroll";
|
|
43
|
+
if (!isScrollable) {
|
|
44
|
+
continue;
|
|
45
|
+
}
|
|
46
|
+
const delta = el.scrollHeight - el.clientHeight;
|
|
47
|
+
if (delta > bestDelta) {
|
|
48
|
+
bestDelta = delta;
|
|
49
|
+
best = el;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
return best;
|
|
53
|
+
};
|
|
54
|
+
const container = findScrollContainer();
|
|
55
|
+
const getScrollHeight = () => {
|
|
56
|
+
if (container) {
|
|
57
|
+
return container.scrollHeight;
|
|
58
|
+
}
|
|
59
|
+
const root = document.scrollingElement ?? document.documentElement ?? document.body;
|
|
60
|
+
return Math.max(document.body?.scrollHeight ?? 0, document.documentElement?.scrollHeight ?? 0, root?.scrollHeight ?? 0);
|
|
61
|
+
};
|
|
62
|
+
const getScrollPosition = () => {
|
|
63
|
+
if (container) {
|
|
64
|
+
return container.scrollTop;
|
|
65
|
+
}
|
|
66
|
+
return window.scrollY;
|
|
67
|
+
};
|
|
68
|
+
const getViewportSize = () => {
|
|
69
|
+
if (container) {
|
|
70
|
+
return container.clientHeight;
|
|
71
|
+
}
|
|
72
|
+
return window.innerHeight;
|
|
73
|
+
};
|
|
74
|
+
const scrollBy = (distance) => {
|
|
75
|
+
if (container) {
|
|
76
|
+
container.scrollTop += distance;
|
|
77
|
+
}
|
|
78
|
+
else {
|
|
79
|
+
window.scrollBy(0, distance);
|
|
80
|
+
}
|
|
81
|
+
};
|
|
82
|
+
await new Promise((resolve) => {
|
|
83
|
+
let stepCount = 0;
|
|
84
|
+
let idleTickCount = 0;
|
|
85
|
+
let lastKnownHeight = getScrollHeight();
|
|
86
|
+
const epsilon = 1;
|
|
87
|
+
const timer = window.setInterval(() => {
|
|
88
|
+
const currentHeight = getScrollHeight();
|
|
89
|
+
const viewportBottom = getScrollPosition() + getViewportSize();
|
|
90
|
+
const reachedBottom = viewportBottom + epsilon >= currentHeight;
|
|
91
|
+
if (currentHeight > lastKnownHeight) {
|
|
92
|
+
lastKnownHeight = currentHeight;
|
|
93
|
+
idleTickCount = 0;
|
|
94
|
+
}
|
|
95
|
+
else if (reachedBottom) {
|
|
96
|
+
idleTickCount += 1;
|
|
97
|
+
}
|
|
98
|
+
else {
|
|
99
|
+
idleTickCount = 0;
|
|
100
|
+
}
|
|
101
|
+
scrollBy(scrollOptions.distancePx);
|
|
102
|
+
stepCount += 1;
|
|
103
|
+
if (stepCount >= scrollOptions.maxSteps || idleTickCount >= scrollOptions.maxIdleTicks) {
|
|
104
|
+
window.clearInterval(timer);
|
|
105
|
+
resolve();
|
|
106
|
+
}
|
|
107
|
+
}, scrollOptions.intervalMs);
|
|
108
|
+
});
|
|
109
|
+
}, { distancePx, intervalMs, maxIdleTicks, maxSteps });
|
|
110
|
+
};
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Returns Chromium CLI flags that disable the sandbox on Linux.
|
|
3
|
+
*
|
|
4
|
+
* Ubuntu 23.10+ (and other distros) restrict unprivileged user namespaces
|
|
5
|
+
* via AppArmor, which makes the Chrome sandbox fail with:
|
|
6
|
+
*
|
|
7
|
+
* "No usable sandbox! … see apparmor-userns-restrictions.md"
|
|
8
|
+
*
|
|
9
|
+
* Adding `--no-sandbox --disable-setuid-sandbox` is the standard workaround
|
|
10
|
+
* recommended by Puppeteer for Linux environments (CI and desktop alike).
|
|
11
|
+
*
|
|
12
|
+
* On non-Linux platforms the sandbox works out of the box, so no extra
|
|
13
|
+
* flags are injected.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* const args = [...(userArgs ?? []), ...getSandboxArgs()];
|
|
17
|
+
*/
|
|
18
|
+
export declare const getSandboxArgs: () => string[];
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { platform } from "node:os";
|
|
2
|
+
/**
 * Chromium command-line flags that switch the sandbox off on Linux.
 *
 * Background: Ubuntu 23.10+ (and other distros) restrict unprivileged user
 * namespaces via AppArmor, which makes the Chrome sandbox fail with:
 *
 *   "No usable sandbox! … see apparmor-userns-restrictions.md"
 *
 * `--no-sandbox --disable-setuid-sandbox` is the workaround used here for
 * Linux environments (CI and desktop alike). On every other platform an
 * empty list is returned, so no extra flags are injected.
 *
 * Usage:
 *   const args = [...(userArgs ?? []), ...getSandboxArgs()];
 *
 * @returns The two sandbox-disabling flags on Linux, otherwise `[]`.
 */
export const getSandboxArgs = () => (platform() === "linux" ? ["--no-sandbox", "--disable-setuid-sandbox"] : []);
|
|
@@ -1,37 +1,32 @@
|
|
|
1
|
+
import { autoScrollToEnd } from "./puppeteer-auto-scroll.js";
|
|
2
|
+
const hoverAllElements = async (page) => {
|
|
3
|
+
await page.evaluate(() => {
|
|
4
|
+
const elements = Array.from(document.querySelectorAll("*"));
|
|
5
|
+
const triggerMouseOver = (element) => {
|
|
6
|
+
const rect = element.getBoundingClientRect();
|
|
7
|
+
const pointerX = rect.left + rect.width / 2;
|
|
8
|
+
const pointerY = rect.top + rect.height / 2;
|
|
9
|
+
const event = new MouseEvent("mouseover", {
|
|
10
|
+
bubbles: true,
|
|
11
|
+
cancelable: true,
|
|
12
|
+
clientX: pointerX,
|
|
13
|
+
clientY: pointerY
|
|
14
|
+
});
|
|
15
|
+
element.dispatchEvent(event);
|
|
16
|
+
};
|
|
17
|
+
elements.forEach(triggerMouseOver);
|
|
18
|
+
});
|
|
19
|
+
};
|
|
1
20
|
export const runTriggerActions = async (page, actions = []) => {
|
|
2
21
|
if (actions.length === 0) {
|
|
3
22
|
return;
|
|
4
23
|
}
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
const pointerY = rect.top + rect.height / 2;
|
|
12
|
-
const event = new MouseEvent("mouseover", {
|
|
13
|
-
bubbles: true,
|
|
14
|
-
cancelable: true,
|
|
15
|
-
clientX: pointerX,
|
|
16
|
-
clientY: pointerY
|
|
17
|
-
});
|
|
18
|
-
el.dispatchEvent(event);
|
|
19
|
-
};
|
|
20
|
-
elements.forEach(trigger);
|
|
21
|
-
};
|
|
22
|
-
const scrollToEnd = () => {
|
|
23
|
-
const scrollHeight = document.documentElement?.scrollHeight ?? document.body?.scrollHeight;
|
|
24
|
-
if (typeof scrollHeight === "number") {
|
|
25
|
-
window.scrollTo({ top: scrollHeight, behavior: "instant" });
|
|
26
|
-
}
|
|
27
|
-
};
|
|
28
|
-
for (const action of actionsArg) {
|
|
29
|
-
if (action === "HOVER") {
|
|
30
|
-
hoverAll();
|
|
31
|
-
}
|
|
32
|
-
if (action === "SCROLL_TO_END") {
|
|
33
|
-
scrollToEnd();
|
|
34
|
-
}
|
|
24
|
+
for (const action of actions) {
|
|
25
|
+
if (action === "HOVER") {
|
|
26
|
+
await hoverAllElements(page);
|
|
27
|
+
}
|
|
28
|
+
if (action === "SCROLL_TO_END") {
|
|
29
|
+
await autoScrollToEnd(page);
|
|
35
30
|
}
|
|
36
|
-
}
|
|
31
|
+
}
|
|
37
32
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pagepocket/capture-http-puppeteer-unit",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.14.5",
|
|
4
4
|
"description": "PagePocket plugin: capture HTTP events (puppeteer adapter)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -11,8 +11,9 @@
|
|
|
11
11
|
"license": "ISC",
|
|
12
12
|
"dependencies": {
|
|
13
13
|
"puppeteer": "^22.12.1",
|
|
14
|
-
"@
|
|
15
|
-
"@pagepocket/contracts": "0.
|
|
14
|
+
"@puppeteer/browsers": "2.3.0",
|
|
15
|
+
"@pagepocket/contracts": "0.14.5",
|
|
16
|
+
"@pagepocket/lib": "0.14.5"
|
|
16
17
|
},
|
|
17
18
|
"devDependencies": {
|
|
18
19
|
"typescript": "^5.4.5"
|