@pagepocket/capture-http-puppeteer-unit 0.13.0 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import { type PuppeteerAdapterOptions } from "./internal/puppeteer-adapter.js";
3
3
  export type CaptureHttpPuppeteerUnitOptions = PuppeteerAdapterOptions;
4
4
  export declare class CaptureHttpPuppeteerUnit extends Unit {
5
5
  readonly id = "captureHttpPuppeteer";
6
- readonly description = "Capturing page via Puppeteer";
6
+ readonly description = "Capturing page";
7
7
  private adapterOptions;
8
8
  constructor(options?: CaptureHttpPuppeteerUnitOptions);
9
9
  run(ctx: UnitContext, rt: UnitRuntime): Promise<{
@@ -44,7 +44,7 @@ export class CaptureHttpPuppeteerUnit extends Unit {
44
44
  constructor(options) {
45
45
  super();
46
46
  this.id = "captureHttpPuppeteer";
47
- this.description = "Capturing page via Puppeteer";
47
+ this.description = "Capturing page";
48
48
  this.adapterOptions = options ?? {};
49
49
  }
50
50
  async run(ctx, rt) {
@@ -0,0 +1,52 @@
1
+ /**
2
+ * CDP payload types and helpers for worker network capture.
3
+ *
4
+ * These are a minimal subset of the Chrome DevTools Protocol shapes needed
5
+ * to monitor `Network.*` events on WebWorker targets.
6
+ */
7
+ import type { ResourceType } from "@pagepocket/lib";
8
+ export type CdpRequestWillBeSent = {
9
+ requestId: string;
10
+ timestamp?: number;
11
+ wallTime?: number;
12
+ type?: string;
13
+ initiator?: {
14
+ type?: string;
15
+ url?: string;
16
+ };
17
+ request: {
18
+ url: string;
19
+ method: string;
20
+ headers?: Record<string, unknown>;
21
+ };
22
+ };
23
+ export type CdpResponseReceived = {
24
+ requestId: string;
25
+ timestamp?: number;
26
+ wallTime?: number;
27
+ type?: string;
28
+ response: {
29
+ url: string;
30
+ status: number;
31
+ statusText?: string;
32
+ headers?: Record<string, unknown>;
33
+ mimeType?: string;
34
+ fromDiskCache?: boolean;
35
+ fromServiceWorker?: boolean;
36
+ };
37
+ };
38
+ export type CdpLoadingFinished = {
39
+ requestId: string;
40
+ timestamp?: number;
41
+ };
42
+ export type CdpLoadingFailed = {
43
+ requestId: string;
44
+ timestamp?: number;
45
+ errorText: string;
46
+ };
47
+ export declare const normalizeHeaders: (headers?: Record<string, unknown>) => Record<string, string>;
48
+ export declare const mapResourceType: (input?: string) => ResourceType | undefined;
49
+ export declare const resolveTimestampMs: (payload: {
50
+ timestamp?: number;
51
+ wallTime?: number;
52
+ }) => number;
@@ -0,0 +1,46 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Helpers
3
+ // ---------------------------------------------------------------------------
4
/**
 * Flatten a CDP header map into plain `name -> string` pairs.
 *
 * - A missing map yields an empty object.
 * - `undefined` and `null` values are dropped (stringifying `null` would
 *   otherwise produce a bogus literal "null" header value).
 * - Array values are joined with ", "; everything else goes through `String()`.
 *
 * @param headers Raw header map from a CDP payload; may be omitted.
 * @returns A new object containing only string values.
 */
export const normalizeHeaders = (headers) => {
    const output = {};
    if (!headers) {
        return output;
    }
    for (const [name, value] of Object.entries(headers)) {
        if (value === undefined || value === null) {
            continue;
        }
        output[name] = Array.isArray(value) ? value.join(", ") : String(value);
    }
    return output;
};
18
/**
 * Normalise a CDP resource type string (e.g. "Script", "XHR") to lower case.
 *
 * @param input CDP `type` field; may be missing or empty.
 * @returns The lower-cased type, or `undefined` when `input` is falsy.
 */
export const mapResourceType = (input) => {
    // The previous implementation switched over the known type names but every
    // branch, including `default`, returned the same lower-cased value, so the
    // switch was dead code.
    // NOTE(review): values outside the known set (e.g. "websocket",
    // "eventsource") are still passed through unchanged; confirm whether they
    // should instead map to "other" or `undefined` to honour `ResourceType`.
    return input ? input.toLowerCase() : undefined;
};
38
/**
 * Resolve a Unix-epoch timestamp in milliseconds for a CDP network payload.
 *
 * `wallTime` is seconds since the Unix epoch and is converted directly.
 * `timestamp` is CDP `MonotonicTime` (seconds since an arbitrary point in the
 * past), so scaling it by 1000 does NOT give an epoch value and would make
 * events that lack `wallTime` (e.g. `loadingFinished`, `loadingFailed`) appear
 * decades before their own request. Those payloads use the current wall clock.
 *
 * @param payload Object with optional numeric `wallTime` / `timestamp` fields.
 * @returns Epoch milliseconds, rounded to an integer.
 */
export const resolveTimestampMs = (payload) => {
    const { wallTime } = payload;
    if (typeof wallTime === "number" && Number.isFinite(wallTime)) {
        return Math.round(wallTime * 1000);
    }
    return Date.now();
};
@@ -0,0 +1,22 @@
1
+ /**
2
+ * CDP-based network capture for WebWorker targets.
3
+ *
4
+ * Puppeteer's `page.on("request")` does not fire for requests made inside
5
+ * dedicated WebWorkers. This module attaches a CDP `Network` session to each
6
+ * worker so those requests are captured alongside the main-page traffic.
7
+ *
8
+ * Non-intercepting: we only listen (`Network.enable`), never pause requests.
9
+ */
10
+ import type { NetworkEventHandlers } from "@pagepocket/lib";
11
+ import type { Page } from "puppeteer";
12
+ export type WorkerNetworkCapture = {
13
+ /** Stop all worker network listeners and detach future-worker handling. */
14
+ stop: () => Promise<void>;
15
+ };
16
+ /**
17
+ * Start capturing network traffic from all current and future WebWorkers on
18
+ * the given page.
19
+ *
20
+ * Returns a handle whose `stop()` tears down every worker listener.
21
+ */
22
+ export declare const startWorkerNetworkCapture: (page: Page, handlers: NetworkEventHandlers) => WorkerNetworkCapture;
@@ -0,0 +1,170 @@
1
+ import { mapResourceType, normalizeHeaders, resolveTimestampMs } from "./worker-cdp-types.js";
2
/**
 * Attach CDP `Network` listeners to a single worker and forward its traffic
 * to `handlers.onEvent` as "request", "response" and "failed" events.
 *
 * Non-intercepting: only `Network.enable` is sent, requests are never paused.
 *
 * @param worker Puppeteer worker; only its `client` property is read.
 * @param handlers Receiver whose `onEvent(event)` is called for every event.
 * @param requestIdPrefix Prefix for the logical request ids of this worker.
 * @returns `{ stop }` on success, or `undefined` when the worker exposes no
 *   usable CDP session or `Network.enable` fails.
 */
const attachToWorker = async (worker, handlers, requestIdPrefix) => {
    // CdpWebWorker (the Chromium-backed Puppeteer implementation) exposes a
    // `client` getter that returns the underlying CDPSession. The base
    // `WebWorker` class does not declare it, hence the untyped access.
    const cdpSession = worker.client;
    if (!cdpSession || typeof cdpSession.send !== "function") {
        return undefined;
    }
    try {
        await cdpSession.send("Network.enable");
    }
    catch {
        // Best effort: a worker that cannot enable Network is simply not captured.
        return undefined;
    }
    let requestSequence = 0;
    // CDP request id -> logical id emitted to handlers.
    const cdpToLogical = new Map();
    // Logical id -> request URL, needed because `loadingFailed` carries no URL.
    const requestUrls = new Map();
    // CDP request id -> response metadata awaiting `loadingFinished`.
    const storedResponses = new Map();
    const getLogicalId = (cdpRequestId) => {
        const existing = cdpToLogical.get(cdpRequestId);
        if (existing) {
            return existing;
        }
        const logicalId = `${requestIdPrefix}-${requestSequence++}`;
        cdpToLogical.set(cdpRequestId, logicalId);
        return logicalId;
    };
    // Drop all bookkeeping for a request once it reached a terminal state so
    // the maps do not grow for the whole lifetime of the worker.
    const forgetRequest = (cdpRequestId) => {
        const logicalId = cdpToLogical.get(cdpRequestId);
        if (logicalId !== undefined) {
            requestUrls.delete(logicalId);
        }
        cdpToLogical.delete(cdpRequestId);
        storedResponses.delete(cdpRequestId);
    };
    const decodeBody = (result) => result.base64Encoded
        // Copy into a standalone Uint8Array so consumers never see Buffer's pooled backing store.
        ? new Uint8Array(Buffer.from(result.body, "base64"))
        : new TextEncoder().encode(result.body);
    const onRequestWillBeSent = (raw) => {
        const payload = raw;
        const logicalId = getLogicalId(payload.requestId);
        const timestamp = resolveTimestampMs(payload);
        requestUrls.set(logicalId, payload.request.url);
        const event = {
            type: "request",
            requestId: logicalId,
            url: payload.request.url,
            method: payload.request.method,
            headers: normalizeHeaders(payload.request.headers),
            resourceType: mapResourceType(payload.type),
            initiator: payload.initiator
                ? { type: payload.initiator.type, url: payload.initiator.url }
                : undefined,
            timestamp
        };
        handlers.onEvent(event);
    };
    const onResponseReceived = (raw) => {
        const payload = raw;
        const logicalId = getLogicalId(payload.requestId);
        // Emission is deferred to `loadingFinished`, when the body can be fetched.
        storedResponses.set(payload.requestId, {
            logicalId,
            response: payload.response,
            resourceType: mapResourceType(payload.type)
        });
    };
    const onLoadingFinished = async (raw) => {
        const payload = raw;
        const stored = storedResponses.get(payload.requestId);
        // Terminal state: release bookkeeping before the asynchronous body fetch.
        forgetRequest(payload.requestId);
        if (!stored) {
            return;
        }
        let bodyBytes;
        try {
            const result = await cdpSession.send("Network.getResponseBody", {
                requestId: payload.requestId
            });
            bodyBytes = decodeBody(result);
        }
        catch {
            // Body may be unavailable (204, redirects, already-destroyed worker).
        }
        if (bodyBytes && bodyBytes.byteLength === 0) {
            bodyBytes = undefined;
        }
        const responseEvent = {
            type: "response",
            requestId: stored.logicalId,
            url: stored.response.url,
            status: stored.response.status,
            statusText: stored.response.statusText,
            headers: normalizeHeaders(stored.response.headers),
            mimeType: stored.response.mimeType,
            fromDiskCache: stored.response.fromDiskCache,
            fromServiceWorker: stored.response.fromServiceWorker,
            timestamp: resolveTimestampMs(payload),
            body: bodyBytes ? { kind: "buffer", data: bodyBytes } : undefined
        };
        handlers.onEvent(responseEvent);
    };
    const onLoadingFailed = (raw) => {
        const payload = raw;
        const logicalId = getLogicalId(payload.requestId);
        const failedEvent = {
            type: "failed",
            requestId: logicalId,
            url: requestUrls.get(logicalId) ?? "",
            errorText: payload.errorText,
            timestamp: resolveTimestampMs(payload)
        };
        // Terminal state: release bookkeeping (the URL was read just above).
        forgetRequest(payload.requestId);
        handlers.onEvent(failedEvent);
    };
    cdpSession.on("Network.requestWillBeSent", onRequestWillBeSent);
    cdpSession.on("Network.responseReceived", onResponseReceived);
    cdpSession.on("Network.loadingFinished", onLoadingFinished);
    cdpSession.on("Network.loadingFailed", onLoadingFailed);
    const stop = async () => {
        cdpSession.off("Network.requestWillBeSent", onRequestWillBeSent);
        cdpSession.off("Network.responseReceived", onResponseReceived);
        cdpSession.off("Network.loadingFinished", onLoadingFinished);
        cdpSession.off("Network.loadingFailed", onLoadingFailed);
        try {
            await cdpSession.send("Network.disable");
        }
        catch {
            // Worker may already be gone.
        }
    };
    return { stop };
};
135
/**
 * Start capturing network traffic from all current and future WebWorkers on
 * the given page.
 *
 * Attaching to a worker is asynchronous. Attachments still in flight when
 * `stop()` is called are awaited and torn down as well, so no worker listener
 * outlives the capture.
 *
 * @param page Puppeteer page; `workers()` (when present), `on` and `off` are used.
 * @param handlers Receiver passed through to every worker attachment.
 * @returns A handle whose `stop()` tears down every worker listener.
 */
export const startWorkerNetworkCapture = (page, handlers) => {
    const sessions = [];
    const pendingAttachments = new Set();
    let workerSequence = 0;
    let stopped = false;
    const handleWorker = async (worker) => {
        const prefix = `pptr-worker-${Date.now()}-${workerSequence++}`;
        const session = await attachToWorker(worker, handlers, prefix);
        if (!session) {
            return;
        }
        if (stopped) {
            // stop() ran while we were attaching: tear down right away instead
            // of registering a session nobody would ever stop.
            await session.stop();
            return;
        }
        sessions.push(session);
    };
    const trackWorker = (worker) => {
        const attachment = handleWorker(worker)
            .catch(() => {
                // Worker capture is best-effort; a failed attach must not
                // surface as an unhandled rejection.
            })
            .finally(() => {
                pendingAttachments.delete(attachment);
            });
        pendingAttachments.add(attachment);
    };
    // Attach to workers that already exist.
    if (typeof page.workers === "function") {
        for (const existingWorker of page.workers()) {
            trackWorker(existingWorker);
        }
    }
    // Listen for future workers.
    const onWorkerCreated = (worker) => {
        trackWorker(worker);
    };
    page.on("workercreated", onWorkerCreated);
    const stop = async () => {
        stopped = true;
        page.off("workercreated", onWorkerCreated);
        // Let in-flight attachments settle; they stop themselves once they see `stopped`.
        await Promise.all(pendingAttachments);
        const active = sessions.splice(0);
        await Promise.all(active.map((session) => session.stop()));
    };
    return { stop };
};
@@ -1,10 +1,10 @@
1
1
  import type { NetworkEventHandlers, NetworkInterceptorAdapter, TriggerAction, InterceptTarget, InterceptSession } from "@pagepocket/lib";
2
- import { type Browser, type GoToOptions, type LaunchOptions, type Page } from "puppeteer";
2
+ import { type Browser, type GoToOptions, type PuppeteerLaunchOptions, type Page } from "puppeteer";
3
3
  export type PuppeteerAdapterOptions = {
4
4
  browser?: Browser;
5
5
  page?: Page;
6
- launch?: (options?: LaunchOptions) => Promise<Browser>;
7
- launchOptions?: LaunchOptions;
6
+ launch?: (options?: PuppeteerLaunchOptions) => Promise<Browser>;
7
+ launchOptions?: PuppeteerLaunchOptions;
8
8
  gotoOptions?: GoToOptions;
9
9
  triggerActions?: TriggerAction[];
10
10
  };
@@ -1,10 +1,13 @@
1
1
  import puppeteer from "puppeteer";
2
2
  import { createEmitRequest, createOnRequestFailed, createOnResponse, createRequestIdStore } from "./puppeteer-adapter/events.js";
3
3
  import { buildNavigate, ensureHtmlTargetLoaded } from "./puppeteer-adapter/targets.js";
4
+ import { startWorkerNetworkCapture } from "./puppeteer-adapter/worker-network.js";
4
5
  import { readDomHtml } from "./utils/dom-html.js";
5
6
  import { getEnvString } from "./utils/env.js";
6
- import { buildMissingChromeHelp, getErrorMessage, isMissingChromeError } from "./utils/errors.js";
7
+ import { isMissingChromeError } from "./utils/errors.js";
8
+ import { ensureBrowser } from "./utils/ensure-browser.js";
7
9
  import { runTriggerActions } from "./utils/trigger-actions.js";
10
+ import { getSandboxArgs } from "./utils/sandbox-args.js";
8
11
  /**
9
12
  * Internal adapter for the capture plugin.
10
13
  *
@@ -38,17 +41,25 @@ export class PuppeteerAdapter {
38
41
  const envExecutablePath = getEnvString("PUPPETEER_EXECUTABLE_PATH");
39
42
  const launchOptionsFromUser = this.options.launchOptions;
40
43
  const executablePath = launchOptionsFromUser?.executablePath ?? envExecutablePath;
44
+ const sandboxArgs = getSandboxArgs();
45
+ const mergedArgs = [...(launchOptionsFromUser?.args ?? []), ...sandboxArgs];
46
+ const baseLaunchOptions = {
47
+ ...launchOptionsFromUser,
48
+ ...(executablePath ? { executablePath } : {}),
49
+ ...(mergedArgs.length > 0 ? { args: mergedArgs } : {})
50
+ };
41
51
  try {
42
- browser = await doLaunch({
43
- ...launchOptionsFromUser,
44
- ...(executablePath ? { executablePath } : {})
45
- });
52
+ browser = await doLaunch(baseLaunchOptions);
46
53
  }
47
54
  catch (error) {
48
55
  if (!isMissingChromeError(error)) {
49
56
  throw error;
50
57
  }
51
- throw new Error(`${getErrorMessage(error)}\n\n${buildMissingChromeHelp()}`);
58
+ const downloadedPath = await ensureBrowser();
59
+ browser = await doLaunch({
60
+ ...baseLaunchOptions,
61
+ executablePath: downloadedPath
62
+ });
52
63
  }
53
64
  ownsBrowser = true;
54
65
  }
@@ -62,6 +73,17 @@ export class PuppeteerAdapter {
62
73
  if (!page || typeof page.on !== "function") {
63
74
  throw new Error("PuppeteerAdapter requires a Puppeteer Page instance.");
64
75
  }
76
+ // Activate the CDP Fetch domain with a catch-all pattern.
77
+ // Without Fetch.enable, Chrome silently omits Network events for certain
78
+ // XHR requests (e.g. Feishu's encrypted-image blob downloads via
79
+ // XMLHttpRequest to CDN). Enabling Fetch forces every request through
80
+ // the CDP pipeline, making them visible to Network.* listeners.
81
+ // We immediately continue each paused request so page behavior is unchanged.
82
+ const cdpSession = await page.createCDPSession();
83
+ await cdpSession.send("Fetch.enable", { patterns: [{ urlPattern: "*" }] });
84
+ cdpSession.on("Fetch.requestPaused", (event) => {
85
+ cdpSession.send("Fetch.continueRequest", { requestId: event.requestId }).catch(() => { });
86
+ });
65
87
  const { getRequestId, has } = createRequestIdStore();
66
88
  const emitRequest = createEmitRequest(handlers, getRequestId);
67
89
  const onRequest = (request) => emitRequest(request, Date.now());
@@ -80,6 +102,9 @@ export class PuppeteerAdapter {
80
102
  page.on("request", onRequest);
81
103
  page.on("response", onResponse);
82
104
  page.on("requestfailed", onRequestFailed);
105
+ // Capture network traffic from WebWorkers via CDP Network.enable.
106
+ // Puppeteer's page-level events do not fire for worker requests.
107
+ const workerCapture = startWorkerNetworkCapture(page, handlers);
83
108
  const { navigate, awaitLastNavigation } = buildNavigate(page, this.options.gotoOptions);
84
109
  let htmlLoaded = false;
85
110
  const whenHtmlLoaded = async () => {
@@ -120,6 +145,9 @@ export class PuppeteerAdapter {
120
145
  getDomHtml,
121
146
  startCapture,
122
147
  stop: async () => {
148
+ await workerCapture.stop();
149
+ await cdpSession.send("Fetch.disable").catch(() => { });
150
+ await cdpSession.detach().catch(() => { });
123
151
  page.off("request", onRequest);
124
152
  page.off("response", onResponse);
125
153
  page.off("requestfailed", onRequestFailed);
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Ensures a Chrome executable is available for Puppeteer.
3
+ *
4
+ * When Puppeteer cannot find a compatible Chrome installation on the system,
5
+ * this helper automatically downloads Chrome for Testing using `@puppeteer/browsers`.
6
+ * The downloaded browser is cached in `~/.cache/puppeteer` and reused on
7
+ * subsequent runs.
8
+ *
9
+ * Usage:
10
+ * const executablePath = await ensureBrowser();
11
+ * const browser = await puppeteer.launch({ executablePath });
12
+ */
13
+ export declare const ensureBrowser: () => Promise<string>;
@@ -0,0 +1,42 @@
1
+ import os from "os";
2
+ import path from "path";
3
+ import { Browser, computeExecutablePath, detectBrowserPlatform, getInstalledBrowsers, install, resolveBuildId } from "@puppeteer/browsers";
4
+ const DEFAULT_CACHE_DIR = path.join(os.homedir(), ".cache", "puppeteer");
5
/**
 * Ensures a Chrome executable is available for Puppeteer.
 *
 * When Puppeteer cannot find a compatible Chrome installation on the system,
 * this helper downloads the current "stable" Chrome for Testing build using
 * `@puppeteer/browsers`.
 *
 * The browser is cached in `PUPPETEER_CACHE_DIR` when that environment
 * variable is set, otherwise in `~/.cache/puppeteer`, and reused on
 * subsequent runs. If the stable build id cannot be resolved (for example
 * when offline) but a Chrome build is already cached, that build is used.
 *
 * Usage:
 *   const executablePath = await ensureBrowser();
 *   const browser = await puppeteer.launch({ executablePath });
 *
 * @returns Absolute path of a Chrome executable.
 * @throws When the platform cannot be detected, or when the build id cannot
 *   be resolved and no cached Chrome exists, or when the download fails.
 */
export const ensureBrowser = async () => {
    const platform = detectBrowserPlatform();
    if (!platform) {
        throw new Error(`Cannot detect browser platform for: ${os.platform()} (${os.arch()})`);
    }
    // `||` on purpose: an empty PUPPETEER_CACHE_DIR falls back to the default.
    const cacheDir = process.env.PUPPETEER_CACHE_DIR || DEFAULT_CACHE_DIR;
    const installedBrowsers = await getInstalledBrowsers({ cacheDir });
    const cachedChromes = installedBrowsers.filter((entry) => entry.browser === Browser.CHROME);
    let buildId;
    try {
        buildId = await resolveBuildId(Browser.CHROME, platform, "stable");
    }
    catch (error) {
        // Resolving "stable" needs network access; prefer any cached Chrome
        // over failing outright.
        const fallback = cachedChromes[cachedChromes.length - 1];
        if (fallback) {
            return fallback.executablePath;
        }
        throw error;
    }
    if (cachedChromes.some((entry) => entry.buildId === buildId)) {
        return computeExecutablePath({
            browser: Browser.CHROME,
            buildId,
            cacheDir
        });
    }
    console.log(`Chrome not found. Downloading Chrome (buildId: ${buildId})...`);
    const installedBrowser = await install({
        browser: Browser.CHROME,
        cacheDir,
        buildId,
        platform
    });
    console.log(`Chrome downloaded to ${installedBrowser.executablePath}`);
    return installedBrowser.executablePath;
};
@@ -0,0 +1,25 @@
1
+ import type { Page } from "puppeteer";
2
+ export type AutoScrollOptions = {
3
+ distancePx?: number;
4
+ intervalMs?: number;
5
+ maxIdleTicks?: number;
6
+ maxSteps?: number;
7
+ };
8
+ /**
9
+ * Incrementally scrolls a page to the bottom until height stabilizes.
10
+ *
11
+ * Handles both traditional pages (window is the scroller) and SPA layouts
12
+ * where the actual scrollable area is a nested container with
13
+ * `overflow: auto | scroll` (e.g. Feishu, Notion, Google Docs).
14
+ *
15
+ * Detection logic:
16
+ * 1. If `document.scrollingElement.scrollHeight > window.innerHeight`,
17
+ * use `window.scrollBy` (classic path).
18
+ * 2. Otherwise, find the DOM element with the largest scrollable overflow
19
+ * (`scrollHeight - clientHeight`) and scroll that element instead.
20
+ *
21
+ * Usage:
22
+ * await autoScrollToEnd(page);
23
+ * await autoScrollToEnd(page, { intervalMs: 150, distancePx: 200 });
24
+ */
25
+ export declare const autoScrollToEnd: (page: Page, options?: AutoScrollOptions) => Promise<void>;
@@ -0,0 +1,110 @@
1
/**
 * Scrolls a page step by step until it stays at the bottom without growing.
 *
 * Works for classic pages, where the window scrolls, and for SPA layouts
 * where a nested element with `overflow-y: auto | scroll` owns the scroll.
 *
 * Target selection (done once, inside the page):
 *   1. If the root scrolling element is taller than the viewport, scroll the
 *      window with `window.scrollBy`.
 *   2. Otherwise scroll the element with the largest
 *      `scrollHeight - clientHeight` among those whose computed `overflow-y`
 *      is `auto` or `scroll`; if none qualifies, fall back to the window.
 *
 * Stops after `maxSteps` ticks, or after `maxIdleTicks` consecutive ticks at
 * the bottom during which the scroll height did not grow.
 *
 * Defaults: distancePx 100, intervalMs 100, maxIdleTicks 8, maxSteps 300.
 *
 * Usage:
 *   await autoScrollToEnd(page);
 *   await autoScrollToEnd(page, { intervalMs: 150, distancePx: 200 });
 */
export const autoScrollToEnd = async (page, options = {}) => {
    const settings = {
        distancePx: options.distancePx ?? 100,
        intervalMs: options.intervalMs ?? 100,
        maxIdleTicks: options.maxIdleTicks ?? 8,
        maxSteps: options.maxSteps ?? 300
    };
    // Everything inside this callback runs in the page and must be self-contained.
    await page.evaluate(async (scrollOptions) => {
        const rootScroller = () => document.scrollingElement ?? document.documentElement ?? document.body;
        // Nested element with the largest vertical overflow, or null if none.
        const pickNestedScroller = () => {
            let winner = null;
            let largestOverflow = 0;
            for (const candidate of Array.from(document.querySelectorAll("*"))) {
                const { overflowY } = getComputedStyle(candidate);
                if (overflowY !== "auto" && overflowY !== "scroll") {
                    continue;
                }
                const overflow = candidate.scrollHeight - candidate.clientHeight;
                if (overflow > largestOverflow) {
                    largestOverflow = overflow;
                    winner = candidate;
                }
            }
            return winner;
        };
        const initialRoot = rootScroller();
        const windowScrolls = Boolean(initialRoot) && initialRoot.scrollHeight > window.innerHeight + 1;
        // `null` means "scroll the window".
        const container = windowScrolls ? null : pickNestedScroller();
        const target = container
            ? {
                height: () => container.scrollHeight,
                bottom: () => container.scrollTop + container.clientHeight,
                advance: (distance) => {
                    container.scrollTop += distance;
                }
            }
            : {
                height: () => Math.max(document.body?.scrollHeight ?? 0, document.documentElement?.scrollHeight ?? 0, rootScroller()?.scrollHeight ?? 0),
                bottom: () => window.scrollY + window.innerHeight,
                advance: (distance) => {
                    window.scrollBy(0, distance);
                }
            };
        await new Promise((resolve) => {
            const epsilon = 1;
            let steps = 0;
            let idleTicks = 0;
            let tallestSeen = target.height();
            const timer = window.setInterval(() => {
                const height = target.height();
                const atBottom = target.bottom() + epsilon >= height;
                if (height > tallestSeen) {
                    // Content grew (lazy loading): keep going.
                    tallestSeen = height;
                    idleTicks = 0;
                }
                else {
                    idleTicks = atBottom ? idleTicks + 1 : 0;
                }
                target.advance(scrollOptions.distancePx);
                steps += 1;
                const finished = steps >= scrollOptions.maxSteps || idleTicks >= scrollOptions.maxIdleTicks;
                if (finished) {
                    window.clearInterval(timer);
                    resolve();
                }
            }, scrollOptions.intervalMs);
        });
    }, settings);
};
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Returns Chromium CLI flags that disable the sandbox on Linux.
3
+ *
4
+ * Ubuntu 23.10+ (and other distros) restrict unprivileged user namespaces
5
+ * via AppArmor, which makes the Chrome sandbox fail with:
6
+ *
7
+ * "No usable sandbox! … see apparmor-userns-restrictions.md"
8
+ *
9
+ * Adding `--no-sandbox --disable-setuid-sandbox` is the standard workaround
10
+ * recommended by Puppeteer for Linux environments (CI and desktop alike).
11
+ *
12
+ * On non-Linux platforms the sandbox works out of the box, so no extra
13
+ * flags are injected.
14
+ *
15
+ * Usage:
16
+ * const args = [...(userArgs ?? []), ...getSandboxArgs()];
17
+ */
18
+ export declare const getSandboxArgs: () => string[];
@@ -0,0 +1,24 @@
1
+ import { platform } from "node:os";
2
/**
 * Chromium CLI flags that turn the sandbox off when running on Linux.
 *
 * Ubuntu 23.10+ (and other distros) restrict unprivileged user namespaces
 * via AppArmor, which makes the Chrome sandbox fail with:
 *
 *   "No usable sandbox! … see apparmor-userns-restrictions.md"
 *
 * `--no-sandbox --disable-setuid-sandbox` is the usual workaround for Linux
 * environments (CI and desktop alike). On every other platform an empty list
 * is returned, so no extra flags are injected.
 *
 * A fresh array is returned on each call.
 *
 * Usage:
 *   const args = [...(userArgs ?? []), ...getSandboxArgs()];
 */
export const getSandboxArgs = () => platform() === "linux"
    ? ["--no-sandbox", "--disable-setuid-sandbox"]
    : [];
@@ -1,37 +1,32 @@
1
+ import { autoScrollToEnd } from "./puppeteer-auto-scroll.js";
2
/**
 * Dispatches a synthetic, bubbling `mouseover` event on every element of the
 * page, positioned at the centre of that element's bounding box.
 *
 * @param page Puppeteer page; only `page.evaluate` is used.
 */
const hoverAllElements = async (page) => {
    await page.evaluate(() => {
        // Snapshot first so DOM changes made by hover handlers do not alter the walk.
        const snapshot = Array.from(document.querySelectorAll("*"));
        for (const element of snapshot) {
            const { left, top, width, height } = element.getBoundingClientRect();
            element.dispatchEvent(new MouseEvent("mouseover", {
                bubbles: true,
                cancelable: true,
                clientX: left + width / 2,
                clientY: top + height / 2
            }));
        }
    });
};
1
20
  export const runTriggerActions = async (page, actions = []) => {
2
21
  if (actions.length === 0) {
3
22
  return;
4
23
  }
5
- await page.evaluate((actionsArg) => {
6
- const hoverAll = () => {
7
- const elements = Array.from(document.querySelectorAll("*"));
8
- const trigger = (el) => {
9
- const rect = el.getBoundingClientRect();
10
- const pointerX = rect.left + rect.width / 2;
11
- const pointerY = rect.top + rect.height / 2;
12
- const event = new MouseEvent("mouseover", {
13
- bubbles: true,
14
- cancelable: true,
15
- clientX: pointerX,
16
- clientY: pointerY
17
- });
18
- el.dispatchEvent(event);
19
- };
20
- elements.forEach(trigger);
21
- };
22
- const scrollToEnd = () => {
23
- const scrollHeight = document.documentElement?.scrollHeight ?? document.body?.scrollHeight;
24
- if (typeof scrollHeight === "number") {
25
- window.scrollTo({ top: scrollHeight, behavior: "instant" });
26
- }
27
- };
28
- for (const action of actionsArg) {
29
- if (action === "HOVER") {
30
- hoverAll();
31
- }
32
- if (action === "SCROLL_TO_END") {
33
- scrollToEnd();
34
- }
24
+ for (const action of actions) {
25
+ if (action === "HOVER") {
26
+ await hoverAllElements(page);
27
+ }
28
+ if (action === "SCROLL_TO_END") {
29
+ await autoScrollToEnd(page);
35
30
  }
36
- }, actions);
31
+ }
37
32
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pagepocket/capture-http-puppeteer-unit",
3
- "version": "0.13.0",
3
+ "version": "0.14.5",
4
4
  "description": "PagePocket plugin: capture HTTP events (puppeteer adapter)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -11,8 +11,9 @@
11
11
  "license": "ISC",
12
12
  "dependencies": {
13
13
  "puppeteer": "^22.12.1",
14
- "@pagepocket/lib": "0.13.0",
15
- "@pagepocket/contracts": "0.13.0"
14
+ "@puppeteer/browsers": "2.3.0",
15
+ "@pagepocket/contracts": "0.14.5",
16
+ "@pagepocket/lib": "0.14.5"
16
17
  },
17
18
  "devDependencies": {
18
19
  "typescript": "^5.4.5"