@pagepocket/capture-http-puppeteer-unit 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ import { type PagePocketContext, type PagePocketPlugin } from "@pagepocket/lib";
2
+ import { type PuppeteerAdapterOptions } from "./internal/puppeteer-adapter.js";
3
/** Options accepted by the `CaptureHttpPuppeteerPlugin` constructor; a plain alias of `PuppeteerAdapterOptions`. */
+ export type CaptureHttpPuppeteerPluginOptions = PuppeteerAdapterOptions;
4
/**
 * Declaration of the PagePocket plugin that captures HTTP events through the
 * internal Puppeteer adapter.
 */
+ export declare class CaptureHttpPuppeteerPlugin implements PagePocketPlugin {
5
/** Fixed plugin name. */
+ readonly name = "plugin:capture-http-puppeteer";
6
/** Adapter options supplied to the constructor. */
+ private adapterOptions;
7
/** @param options Optional adapter options. */
+ constructor(options?: CaptureHttpPuppeteerPluginOptions);
8
/** Hooks the plugin into the given PagePocket context. */
+ apply(ctx: PagePocketContext): void;
9
+ }
@@ -0,0 +1,146 @@
1
+ import { createMemoryContentStore } from "@pagepocket/lib";
2
+ import { InflightTracker, networkIdle, normalizeCompletion, timeout } from "@pagepocket/lib";
3
+ import { PuppeteerAdapter } from "./internal/puppeteer-adapter.js";
4
/**
 * Turns a header record (`name -> value`) into a list of `{ name, value }`
 * entries, one per own enumerable key, in key order.
 * A falsy `headers` argument yields an empty list.
 */
const headersRecordToList = (headers) => {
    const list = [];
    if (headers) {
        for (const name of Object.keys(headers)) {
            list.push({ name, value: headers[name] });
        }
    }
    return list;
};
9
/**
 * PagePocket plugin that records HTTP traffic observed by a Puppeteer page.
 *
 * `apply` installs a capture bundle (`events`, `contentStore`, `capabilities`)
 * on the context and registers two hooks:
 * - `onInit` starts a `PuppeteerAdapter` session, navigates for URL entries and
 *   publishes the HTML through `ctx.setHtml` when the context has none yet;
 * - `onBeforeNetwork` starts the capture phase, waits for the completion
 *   strategies and stops the session.
 */
export class CaptureHttpPuppeteerPlugin {
    /**
     * @param options Adapter options handed to `PuppeteerAdapter`; defaults to `{}`.
     */
    constructor(options) {
        this.name = "plugin:capture-http-puppeteer";
        this.adapterOptions = options ?? {};
    }
    /**
     * Wires the plugin into `ctx`.
     *
     * @param ctx PagePocket context; receives `ctx.capture` and the two hooks.
     */
    apply(ctx) {
        const contentStore = createMemoryContentStore("capture-http-puppeteer");
        const events = [];
        // Capability flags advertised alongside the captured events.
        const capabilities = {
            requestHeaders: "approx",
            responseHeaders: "approx",
            requestBodies: false,
            responseBodies: "decoded",
            httpVersion: false,
            remoteIp: false,
            headerOrderPreserved: false
        };
        ctx.capture = {
            events,
            contentStore,
            capabilities
        };
        const inflightTracker = new InflightTracker();
        // Translates one adapter event into an `http.*` capture event.
        const handleNetworkEvent = async (event) => {
            inflightTracker.handleEvent(event);
            ctx.emitNetworkEvent?.(event);
            if (event.type === "request") {
                events.push({
                    type: "http.request",
                    requestId: event.requestId,
                    url: event.url,
                    method: event.method,
                    headers: headersRecordToList(event.headers),
                    timestamp: event.timestamp,
                    frameId: event.frameId,
                    resourceType: event.resourceType,
                    initiator: event.initiator
                });
                return;
            }
            if (event.type === "failed") {
                events.push({
                    type: "http.failed",
                    requestId: event.requestId,
                    url: event.url,
                    errorText: event.errorText,
                    timestamp: event.timestamp
                });
                return;
            }
            // Anything else is handled as a response; the body (if any) goes to the store first.
            const bodyRef = event.body
                ? await contentStore.put(event.body, {
                    url: event.url,
                    mimeType: event.mimeType,
                    sizeHint: undefined
                })
                : undefined;
            events.push({
                type: "http.response",
                requestId: event.requestId,
                url: event.url,
                status: event.status,
                statusText: event.statusText,
                headers: headersRecordToList(event.headers),
                timestamp: event.timestamp,
                mimeType: event.mimeType,
                fromDiskCache: event.fromDiskCache,
                fromServiceWorker: event.fromServiceWorker,
                bodyRef,
                bodySize: undefined
            });
        };
        // Best-effort stop used on failure paths so the original error is the one that propagates.
        const stopQuietly = async (session) => {
            try {
                await session.stop();
            }
            catch (stopError) {
                // eslint-disable-next-line no-console
                console.warn("[pagepocket][capture-http-puppeteer] failed to stop session", stopError);
            }
        };
        const stateKey = "captureHttpPuppeteer.session";
        ctx.onInit(async () => {
            const target = ctx.entry.kind === "url"
                ? { kind: "url", url: ctx.entry.url }
                : ctx.entry.kind === "puppeteer-page"
                    ? { kind: "puppeteer-page", page: ctx.entry.page }
                    : ctx.entry.kind === "html-string"
                        ? {
                            kind: "html",
                            htmlString: await ctx.whenHtml().then((h) => h.htmlString),
                            baseUrl: ctx.entry.baseUrl,
                            ...(ctx.entry.url ? { url: ctx.entry.url } : {})
                        }
                        : (() => {
                            throw new Error(`CaptureHttpPuppeteerPlugin does not support entry kind: ${String(ctx.entry.kind)}`);
                        })();
            const adapter = new PuppeteerAdapter(this.adapterOptions);
            const session = await adapter.start(target, {
                onEvent(event) {
                    // The handler is async and not awaited here: attach a catch so a
                    // failing content-store write cannot become an unhandled rejection.
                    handleNetworkEvent(event).catch((error) => {
                        // eslint-disable-next-line no-console
                        console.warn("[pagepocket][capture-http-puppeteer] failed to record network event", error);
                    });
                },
                onError(error) {
                    // eslint-disable-next-line no-console
                    console.warn("[pagepocket][capture-http-puppeteer] adapter error", error);
                }
            }, {
                timeoutMs: ctx.options.timeoutMs,
                maxDurationMs: ctx.options.maxDurationMs
            });
            ctx.state[stateKey] = session;
            try {
                if (target.kind === "url" && session.navigate) {
                    await session.navigate(target.url);
                }
                if (!ctx.html) {
                    const html = await session.waitForHtml();
                    ctx.setHtml(html);
                }
            }
            catch (error) {
                // Initialization failed after the session was started: release it
                // instead of leaving its page/browser behind.
                ctx.state[stateKey] = undefined;
                await stopQuietly(session);
                throw error;
            }
        });
        ctx.onBeforeNetwork(async () => {
            const session = ctx.state[stateKey];
            if (!session) {
                throw new Error("CaptureHttpPuppeteerPlugin internal error: missing session");
            }
            try {
                await session.startCapture();
                const completionStrategies = normalizeCompletion(ctx.options.completion);
                // `timeoutMs` doubles as the network-idle window (5000 ms when unset).
                const idleMs = ctx.options.timeoutMs ?? 5000;
                const maxDurationMs = ctx.options.maxDurationMs;
                const completion = completionStrategies.length > 0
                    ? completionStrategies
                    : [networkIdle(idleMs), ...(maxDurationMs !== undefined ? [timeout(maxDurationMs)] : [])];
                // The first strategy to settle ends the capture.
                await Promise.race(completion.map((strategy) => strategy.wait({
                    now: () => Date.now(),
                    getStats: () => inflightTracker.getStats()
                })));
            }
            catch (error) {
                await stopQuietly(session);
                throw error;
            }
            await session.stop();
        });
    }
}
@@ -0,0 +1,19 @@
1
+ import { Unit, type CaptureArtifacts } from "@pagepocket/lib";
2
+ import { type PuppeteerAdapterOptions } from "./internal/puppeteer-adapter.js";
3
/** Options accepted by the `CaptureHttpPuppeteerUnit` constructor; a plain alias of `PuppeteerAdapterOptions`. */
+ export type CaptureHttpPuppeteerUnitOptions = PuppeteerAdapterOptions;
4
/**
 * Declaration of the PagePocket unit that captures HTTP events through the
 * internal Puppeteer adapter.
 */
+ export declare class CaptureHttpPuppeteerUnit extends Unit {
5
/** Fixed unit id. */
+ readonly id = "captureHttpPuppeteer";
6
/** Fixed unit kind. */
+ readonly kind = "capture.http.puppeteer";
7
/** Adapter options supplied to the constructor. */
+ private adapterOptions;
8
/** @param options Optional adapter options. */
+ constructor(options?: CaptureHttpPuppeteerUnitOptions);
9
/**
 * Runs the capture and resolves with the capture artifacts, the `html` value
 * and a `domHtml` snapshot.
 */
+ run(ctx: import("@pagepocket/lib").UnitContext, rt: import("@pagepocket/lib").UnitRuntime): Promise<{
10
+ capture: CaptureArtifacts;
11
+ html: {};
12
+ domHtml: {
13
+ htmlString: string;
14
+ baseUrl: string;
15
+ url?: string;
16
+ contentType?: string;
17
+ };
18
+ }>;
19
+ }
@@ -0,0 +1,137 @@
1
+ import { NETWORK } from "@pagepocket/contracts";
2
+ import { Unit, createMemoryContentStore, InflightTracker, mapKind, networkIdle, normalizeCompletion, throwUnsupportedEntryKind, timeout } from "@pagepocket/lib";
3
+ import { PuppeteerAdapter } from "./internal/puppeteer-adapter.js";
4
/**
 * Flattens a header record into `{ name, value }` pairs (own enumerable keys,
 * in key order). Returns `[]` for a missing record.
 */
const headersRecordToList = (headers) => headers
    ? Object.entries(headers).map(([name, value]) => ({ name, value }))
    : [];
9
/**
 * Per-entry-kind builders that turn an entry into the target object passed to
 * the adapter. Keys are entry kinds; `html-string` entries become `html` targets
 * and only carry `url` when the entry has a truthy one.
 */
const targetBuilders = {
    url(entry) {
        return { kind: "url", url: entry.url };
    },
    "puppeteer-page"(entry) {
        return { kind: "puppeteer-page", page: entry.page };
    },
    "html-string"(entry) {
        const htmlTarget = {
            kind: "html",
            htmlString: entry.htmlString,
            baseUrl: entry.baseUrl
        };
        return entry.url ? { ...htmlTarget, url: entry.url } : htmlTarget;
    }
};
25
/**
 * PagePocket unit that records HTTP traffic observed by a Puppeteer page and
 * returns it together with the page HTML.
 */
export class CaptureHttpPuppeteerUnit extends Unit {
    /**
     * @param options Adapter options handed to `PuppeteerAdapter`; defaults to `{}`.
     */
    constructor(options) {
        super();
        this.id = "captureHttpPuppeteer";
        this.kind = "capture.http.puppeteer";
        this.adapterOptions = options ?? {};
    }
    /**
     * Starts an adapter session for `rt.entry`, records network events until a
     * completion strategy settles, then stops the session.
     *
     * @param ctx Unit context; `ctx.value.html` is reused when already present.
     * @param rt Unit runtime providing `entry`, `options` and `publish`.
     * @returns `{ capture, html, domHtml }`.
     * @throws Rethrows any failure from navigation, capture or completion; the
     *   session is stopped (best effort) before the error propagates.
     */
    async run(ctx, rt) {
        const contentStore = createMemoryContentStore("capture-http-puppeteer");
        const events = [];
        // Capability flags advertised alongside the captured events.
        const capabilities = {
            requestHeaders: "approx",
            responseHeaders: "approx",
            requestBodies: false,
            responseBodies: "decoded",
            httpVersion: false,
            remoteIp: false,
            headerOrderPreserved: false
        };
        const inflightTracker = new InflightTracker();
        // Translates one adapter event into an `http.*` capture event.
        const handleNetworkEvent = async (event) => {
            inflightTracker.handleEvent(event);
            rt.publish(NETWORK, event);
            if (event.type === "request") {
                events.push({
                    type: "http.request",
                    requestId: event.requestId,
                    url: event.url,
                    method: event.method,
                    headers: headersRecordToList(event.headers),
                    timestamp: event.timestamp,
                    frameId: event.frameId,
                    resourceType: event.resourceType,
                    initiator: event.initiator
                });
                return;
            }
            if (event.type === "failed") {
                events.push({
                    type: "http.failed",
                    requestId: event.requestId,
                    url: event.url,
                    errorText: event.errorText,
                    timestamp: event.timestamp
                });
                return;
            }
            // Anything else is handled as a response; the body (if any) goes to the store first.
            const bodyRef = event.body
                ? await contentStore.put(event.body, {
                    url: event.url,
                    mimeType: event.mimeType,
                    sizeHint: undefined
                })
                : undefined;
            events.push({
                type: "http.response",
                requestId: event.requestId,
                url: event.url,
                status: event.status,
                statusText: event.statusText,
                headers: headersRecordToList(event.headers),
                timestamp: event.timestamp,
                mimeType: event.mimeType,
                fromDiskCache: event.fromDiskCache,
                fromServiceWorker: event.fromServiceWorker,
                bodyRef,
                bodySize: undefined
            });
        };
        const capture = { events, contentStore, capabilities };
        const adapter = new PuppeteerAdapter(this.adapterOptions);
        const target = mapKind(rt.entry, targetBuilders, {
            onUnsupportedKind: throwUnsupportedEntryKind("CaptureHttpPuppeteerUnit")
        });
        const session = await adapter.start(target, {
            onEvent: (event) => {
                // The handler is async and not awaited here: attach a catch so a
                // failing content-store write cannot become an unhandled rejection.
                handleNetworkEvent(event).catch((error) => {
                    console.warn("[pagepocket][capture-http-puppeteer] failed to record network event", error);
                });
            },
            onError: (error) => {
                console.warn("[pagepocket][capture-http-puppeteer] adapter error", error);
            }
        }, {
            timeoutMs: rt.options.timeoutMs,
            maxDurationMs: rt.options.maxDurationMs
        });
        let result;
        try {
            if (target.kind === "url" && session.navigate) {
                await session.navigate(target.url);
            }
            const html = ctx.value.html ?? (await session.waitForHtml());
            await session.startCapture();
            const completionStrategies = normalizeCompletion(rt.options.completion);
            // `timeoutMs` doubles as the network-idle window (5000 ms when unset).
            const idleMs = rt.options.timeoutMs ?? 5000;
            const maxDurationMs = rt.options.maxDurationMs;
            const completion = completionStrategies.length > 0
                ? completionStrategies
                : [networkIdle(idleMs), ...(maxDurationMs !== undefined ? [timeout(maxDurationMs)] : [])];
            // The first strategy to settle ends the capture.
            await Promise.race(completion.map((strategy) => strategy.wait({
                now: () => Date.now(),
                getStats: () => inflightTracker.getStats()
            })));
            const domHtml = session.getDomHtml ? await session.getDomHtml() : await session.waitForHtml();
            result = { capture, html, domHtml };
        }
        catch (error) {
            // Release the session on failure; a secondary stop error is only
            // logged so the original error is the one the caller sees.
            try {
                await session.stop();
            }
            catch (stopError) {
                console.warn("[pagepocket][capture-http-puppeteer] failed to stop session", stopError);
            }
            throw error;
        }
        await session.stop();
        return result;
    }
}
@@ -0,0 +1,2 @@
1
+ export { CaptureHttpPuppeteerUnit } from "./capture-http-puppeteer-unit.js";
2
+ export type { CaptureHttpPuppeteerUnitOptions } from "./capture-http-puppeteer-unit.js";
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ export { CaptureHttpPuppeteerUnit } from "./capture-http-puppeteer-unit.js";
@@ -0,0 +1,29 @@
1
+ import type { NetworkEventHandlers, NetworkInterceptorAdapter, TriggerAction, InterceptTarget, InterceptSession } from "@pagepocket/lib";
2
+ import { type Browser, type GoToOptions, type LaunchOptions, type Page } from "puppeteer";
3
/** Options controlling how `PuppeteerAdapter` obtains its browser/page and drives the page. */
+ export type PuppeteerAdapterOptions = {
4
/** Existing browser to open the page in; when given, the adapter does not launch one. */
+ browser?: Browser;
5
/** Existing page to use for `url`/`html` targets instead of opening a new one. */
+ page?: Page;
6
/** Custom launcher used when a browser has to be launched; the adapter falls back to `puppeteer.launch`. */
+ launch?: (options?: LaunchOptions) => Promise<Browser>;
7
/** Launch options spread into the launch call. */
+ launchOptions?: LaunchOptions;
8
/** Options merged into `page.goto` when navigating a `url` target. */
+ gotoOptions?: GoToOptions;
9
/** Actions run in the page when capture starts. */
+ triggerActions?: TriggerAction[];
10
+ };
11
+ /**
12
+ * Internal adapter for the capture plugin.
13
+ *
14
+ * Note: this is intentionally NOT published as a separate package.
15
+ */
16
+ export declare class PuppeteerAdapter implements NetworkInterceptorAdapter {
17
/** Fixed adapter name. */
+ readonly name = "puppeteer";
18
/** Capability flags reported by this adapter. */
+ readonly capabilities: {
19
+ canGetResponseBody: boolean;
20
+ canStreamResponseBody: boolean;
21
+ canGetRequestBody: boolean;
22
+ providesResourceType: boolean;
23
+ canWaitForHtml: boolean;
24
+ supportsStagedCapture: boolean;
25
+ };
26
/** Options supplied to the constructor. */
+ private options;
27
/** @param options Optional adapter options. */
+ constructor(options?: PuppeteerAdapterOptions);
28
/**
 * Starts intercepting network events for `target` and reports them through
 * `handlers`; resolves with the session used to drive and stop the capture.
 */
+ start(target: InterceptTarget, handlers: NetworkEventHandlers, _options?: Record<string, unknown>): Promise<InterceptSession>;
29
+ }
@@ -0,0 +1,261 @@
1
+ import puppeteer from "puppeteer";
2
+ import { readDomHtml } from "./utils/dom-html.js";
3
+ import { getEnvString } from "./utils/env.js";
4
+ import { buildMissingChromeHelp, isMissingChromeError, isResponseBodyUnavailableError } from "./utils/errors.js";
5
+ import { getHeaderValue, normalizeHeaders } from "./utils/headers.js";
6
+ import { getFrameId, getInitiator } from "./utils/puppeteer-internals.js";
7
+ import { runTriggerActions } from "./utils/trigger-actions.js";
8
/**
 * Returns the request's resource type, or `undefined` when the request has no
 * `resourceType` method or it yields a falsy value.
 */
const toResourceType = (request) => request.resourceType?.() || undefined;
12
+ /**
13
+ * Internal adapter for the capture plugin.
14
+ *
15
+ * Note: this is intentionally NOT published as a separate package.
16
+ */
17
export class PuppeteerAdapter {
    /**
     * @param options Adapter options (browser/page reuse, launch and goto
     *   options, trigger actions); defaults to `{}`.
     */
    constructor(options = {}) {
        this.name = "puppeteer";
        this.capabilities = {
            canGetResponseBody: true,
            canStreamResponseBody: false,
            canGetRequestBody: false,
            providesResourceType: true,
            canWaitForHtml: true,
            supportsStagedCapture: true
        };
        this.options = options;
    }
    /**
     * Attaches request/response/requestfailed listeners to a page and returns a
     * session object (`navigate`, `waitForHtml`, `getDomHtml`, `startCapture`,
     * `stop`).
     *
     * The page comes from the target (`puppeteer-page`), from `options.page`, or
     * is opened in `options.browser` / a freshly launched browser. Only the
     * page/browser created here are closed by `stop()`.
     *
     * @param target Intercept target of kind `url`, `html` or `puppeteer-page`.
     * @param handlers `onEvent` receives every network event; `onError` (optional)
     *   receives body-read problems.
     * @param _options Unused.
     * @throws Error for unsupported target kinds, when no usable page is
     *   available, or when launching the browser / opening the page fails.
     */
    async start(target, handlers, _options) {
        let browser = this.options.browser;
        let page = this.options.page;
        let ownsBrowser = false;
        let ownsPage = false;
        if (target.kind === "puppeteer-page") {
            page = target.page;
        }
        else if (target.kind === "url" || target.kind === "html") {
            if (!page) {
                if (!browser) {
                    const doLaunch = this.options.launch ?? puppeteer.launch;
                    const envExecutablePath = getEnvString("PUPPETEER_EXECUTABLE_PATH");
                    const launchOptionsFromUser = this.options.launchOptions;
                    // An explicit executablePath wins over the environment variable.
                    const executablePath = launchOptionsFromUser?.executablePath ?? envExecutablePath;
                    try {
                        browser = await doLaunch({
                            ...launchOptionsFromUser,
                            ...(executablePath ? { executablePath } : {}),
                            // NOTE(review): placed after the user spread, so this overrides any
                            // `launchOptions.headless` the caller passed — confirm this is intended.
                            headless: false
                        });
                    }
                    catch (error) {
                        if (!isMissingChromeError(error)) {
                            throw error;
                        }
                        const message = error && typeof error.message === "string"
                            ? error.message
                            : String(error);
                        throw new Error(`${message}\n\n${buildMissingChromeHelp()}`);
                    }
                    ownsBrowser = true;
                }
                try {
                    page = await browser.newPage();
                }
                catch (error) {
                    // Do not leave a browser we launched running when no page could be opened.
                    if (ownsBrowser) {
                        await browser.close().catch(() => undefined);
                    }
                    throw error;
                }
                ownsPage = true;
            }
        }
        else {
            throw new Error("PuppeteerAdapter only supports url, html, or puppeteer-page targets.");
        }
        if (!page || typeof page.on !== "function") {
            throw new Error("PuppeteerAdapter requires a Puppeteer Page instance.");
        }
        // Request objects are keyed weakly so ids live only as long as the request.
        const requestIds = new WeakMap();
        let requestSequence = 0;
        const getRequestId = (request) => {
            const existing = requestIds.get(request);
            if (existing) {
                return existing;
            }
            const requestId = `pptr-${Date.now()}-${requestSequence++}`;
            requestIds.set(request, requestId);
            return requestId;
        };
        const emitRequest = (request, timestamp) => {
            const requestEvent = {
                type: "request",
                requestId: getRequestId(request),
                url: request.url(),
                method: request.method(),
                headers: normalizeHeaders(request.headers()),
                frameId: getFrameId(request),
                resourceType: toResourceType(request),
                initiator: getInitiator(request),
                timestamp
            };
            handlers.onEvent(requestEvent);
        };
        const onRequest = (request) => {
            const timestamp = Date.now();
            emitRequest(request, timestamp);
        };
        const onResponse = (response) => {
            const timestamp = Date.now();
            const request = response.request();
            // Emit a request event first when none was seen for this request yet.
            if (!requestIds.has(request)) {
                emitRequest(request, timestamp);
            }
            const headers = normalizeHeaders(response.headers());
            // Decides whether an unavailable body is normal for this response
            // (and therefore not worth reporting through onError).
            const isExpectedMissingBody = () => {
                const method = request.method().toUpperCase();
                if (method === "OPTIONS" || method === "HEAD") {
                    return true;
                }
                const status = response.status();
                if (status >= 100 && status < 200) {
                    return true;
                }
                if (status >= 300 && status < 400) {
                    return true;
                }
                if (status === 206) {
                    // Range / streaming responses are frequently not retrievable via CDP.
                    // Treat missing bodies as expected to avoid noisy adapter errors.
                    const contentRange = getHeaderValue(headers, "content-range");
                    const contentType = getHeaderValue(headers, "content-type") ?? "";
                    if (contentRange) {
                        return true;
                    }
                    if (/^video\//i.test(contentType) || /^audio\//i.test(contentType)) {
                        return true;
                    }
                }
                return status === 204 || status === 205 || status === 304;
            };
            const responseEvent = {
                type: "response",
                requestId: getRequestId(request),
                url: response.url(),
                status: response.status(),
                statusText: response.statusText(),
                headers,
                mimeType: getHeaderValue(headers, "content-type"),
                fromDiskCache: response.fromCache(),
                fromServiceWorker: response.fromServiceWorker(),
                timestamp,
                body: {
                    kind: "late",
                    // Reads the body on demand; never rejects — failures yield an empty
                    // byte array and are reported through handlers.onError when unexpected.
                    read: async () => {
                        try {
                            const buffer = await response.buffer();
                            return new Uint8Array(buffer);
                        }
                        catch (error) {
                            if (isResponseBodyUnavailableError(error)) {
                                if (!isExpectedMissingBody()) {
                                    const method = request.method();
                                    const status = response.status();
                                    const url = response.url();
                                    handlers.onError?.(new Error(`Unexpected missing response body (method=${method} status=${status}) for ${url}`));
                                }
                                return new Uint8Array();
                            }
                            handlers.onError?.(error);
                            return new Uint8Array();
                        }
                    }
                }
            };
            handlers.onEvent(responseEvent);
        };
        const onRequestFailed = (request) => {
            const timestamp = Date.now();
            const failure = request.failure?.();
            // Check before getRequestId(), which registers the request as a side effect.
            const hadRequest = requestIds.has(request);
            const requestId = getRequestId(request);
            if (!hadRequest) {
                emitRequest(request, timestamp);
            }
            const failedEvent = {
                type: "failed",
                requestId,
                url: request.url(),
                errorText: failure?.errorText ?? "Request failed",
                timestamp
            };
            handlers.onEvent(failedEvent);
        };
        page.on("request", onRequest);
        page.on("response", onResponse);
        page.on("requestfailed", onRequestFailed);
        // Remembered so whenHtmlLoaded() can await an in-flight navigation instead of starting another.
        let navigationPromise = null;
        const navigate = async (url, options) => {
            navigationPromise = page.goto(url, {
                ...this.options.gotoOptions,
                ...options
            });
            await navigationPromise;
        };
        // Escapes a value for use inside a double-quoted HTML attribute.
        const escapeAttribute = (value) => String(value).replace(/&/g, "&amp;").replace(/"/g, "&quot;");
        const ensureHtmlTargetLoaded = async () => {
            if (target.kind !== "html") {
                return;
            }
            const baseTag = `<base href="${escapeAttribute(target.baseUrl)}">`;
            const alreadyHasBase = /<base\s+/i.test(target.htmlString);
            // NOTE: when the markup has no <head> tag, replace() finds nothing and no
            // <base> is injected.
            const htmlWithBase = alreadyHasBase
                ? target.htmlString
                : target.htmlString.replace(/<head(\s[^>]*)?>/i, (match) => `${match}${baseTag}`);
            await page.setContent(htmlWithBase, { waitUntil: "domcontentloaded" });
        };
        let htmlLoaded = false;
        const whenHtmlLoaded = async () => {
            if (htmlLoaded) {
                return;
            }
            if (target.kind === "url") {
                await (navigationPromise ?? navigate(target.url));
            }
            else if (target.kind === "html") {
                await ensureHtmlTargetLoaded();
            }
            htmlLoaded = true;
        };
        const waitForHtml = async () => {
            // For html targets the caller-supplied markup is returned as-is,
            // without loading it into the page.
            if (target.kind === "html") {
                return {
                    htmlString: target.htmlString,
                    baseUrl: target.baseUrl,
                    url: target.url,
                    contentType: "text/html"
                };
            }
            await whenHtmlLoaded();
            return readDomHtml(page);
        };
        const getDomHtml = async () => {
            await whenHtmlLoaded();
            return readDomHtml(page);
        };
        const startCapture = async () => {
            await whenHtmlLoaded();
            await runTriggerActions(page, this.options.triggerActions);
        };
        return {
            navigate: target.kind === "url" ? navigate : undefined,
            waitForHtml,
            getDomHtml,
            startCapture,
            stop: async () => {
                page.off("request", onRequest);
                page.off("response", onResponse);
                page.off("requestfailed", onRequestFailed);
                try {
                    if (ownsPage) {
                        await page.close();
                    }
                }
                finally {
                    // Close the browser we launched even if closing the page failed.
                    if (ownsBrowser && browser) {
                        await browser.close();
                    }
                }
            }
        };
    }
}
@@ -0,0 +1,13 @@
1
+ import type { Page } from "puppeteer";
2
+ /**
3
+ * Reads the current page DOM as HTML.
4
+ *
5
+ * This is used both for the early HTML milestone (`html@1`) and the post-load
6
+ * DOM snapshot (`dom-html@1`).
7
+ */
8
+ export declare const readDomHtml: (page: Page) => Promise<{
9
/** Serialized page content. */
+ htmlString: string;
10
/** Page URL (same value as `url`). */
+ baseUrl: string;
11
/** Page URL. */
+ url: string;
12
/** Always `"text/html"`. */
+ contentType: string;
13
+ }>;
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Reads the current page DOM as HTML.
3
+ *
4
+ * This is used both for the early HTML milestone (`html@1`) and the post-load
5
+ * DOM snapshot (`dom-html@1`).
6
+ */
7
+ export const readDomHtml = async (page) => {
8
+ const [url, htmlString] = await Promise.all([page.url(), page.content()]);
9
+ return {
10
+ htmlString,
11
+ baseUrl: url,
12
+ url,
13
+ contentType: "text/html"
14
+ };
15
+ };
@@ -0,0 +1 @@
1
/** Returns the trimmed value of environment variable `name`, or `undefined` when it is unset or blank. */
+ export declare const getEnvString: (name: string) => string | undefined;
@@ -0,0 +1,5 @@
1
+ export const getEnvString = (name) => {
2
+ const value = globalThis.process
3
+ ?.env?.[name];
4
+ return value && value.trim().length > 0 ? value.trim() : undefined;
5
+ };
@@ -0,0 +1,10 @@
1
/** True when the error's message says Chrome or Chromium could not be found. */
+ export declare const isMissingChromeError: (error: unknown) => boolean;
2
/** Returns the multi-line help text explaining how to make a Chrome executable available. */
+ export declare const buildMissingChromeHelp: () => string;
3
/** True when the error's message mentions both `Network.getResponseBody` and `No data found`. */
+ export declare const isNoDataForResourceError: (error: unknown) => boolean;
4
+ /**
5
+ * True when Puppeteer/CDP signals the response body is not available.
6
+ *
7
+ * This is not always an error. For example, CORS preflight requests (OPTIONS)
8
+ * or 204/304 responses may legitimately have no retrievable body.
9
+ */
10
+ export declare const isResponseBodyUnavailableError: (error: unknown) => boolean;
@@ -0,0 +1,52 @@
1
/**
 * Tells whether `error` carries a string `message` containing
 * "Could not find Chrome" or "Could not find Chromium".
 * Values without a string message are never a match.
 */
export const isMissingChromeError = (error) => {
    const text = error && typeof error.message === "string" ? error.message : "";
    const markers = ["Could not find Chrome", "Could not find Chromium"];
    return markers.some((marker) => text.includes(marker));
};
7
/**
 * Returns a fixed, multi-line help message for the case where Puppeteer cannot
 * find a Chrome executable: it names the reason and lists two fixes (installing
 * Chrome for Puppeteer, or pointing PUPPETEER_EXECUTABLE_PATH at an existing one).
 */
+ export const buildMissingChromeHelp = () => {
8
+ return `Puppeteer could not find a compatible Chrome installation.
9
+
10
+ Reason:
11
+ This machine does not have a Chrome executable available to Puppeteer.
12
+
13
+ Fix options:
14
+ 1) Install Chrome for Puppeteer (recommended):
15
+ - pnpm dlx puppeteer browsers install chrome
16
+ - or: npx puppeteer browsers install chrome
17
+
18
+ 2) Use an existing system Chrome/Chromium:
19
+ - Set PUPPETEER_EXECUTABLE_PATH=/path/to/chrome
20
+ `;
21
+ };
22
/**
 * Tells whether `error` is an object whose string `message` mentions both
 * `Network.getResponseBody` and `No data found`.
 */
export const isNoDataForResourceError = (error) => {
    const message = error && typeof error === "object" ? error.message : undefined;
    return (typeof message === "string" &&
        message.includes("Network.getResponseBody") &&
        message.includes("No data found"));
};
/**
 * Tells whether Puppeteer/CDP reported that a response body cannot be retrieved.
 *
 * Such a report is not necessarily a failure: CORS preflight requests (OPTIONS)
 * or 204/304 responses may legitimately have no retrievable body.
 */
export const isResponseBodyUnavailableError = (error) => {
    if (error === null || typeof error !== "object") {
        return false;
    }
    const message = error.message;
    if (typeof message !== "string") {
        return false;
    }
    const unavailableMarkers = [
        "Could not load body for this request",
        "No resource with given identifier found",
        "Response body is unavailable for redirect responses"
    ];
    return (isNoDataForResourceError(error) ||
        unavailableMarkers.some((marker) => message.includes(marker)));
};
@@ -0,0 +1,2 @@
1
/** Copies `headers` into a new record with every value converted to a string; `undefined` values are dropped. */
+ export declare const normalizeHeaders: (headers: Record<string, string | number | boolean | undefined>) => Record<string, string>;
2
/** Looks up a header by name, ignoring case; returns `undefined` when no key matches. */
+ export declare const getHeaderValue: (headers: Record<string, string>, name: string) => string | undefined;
@@ -0,0 +1,19 @@
1
/**
 * Builds a new header record in which every value is a string.
 * Entries whose value is `undefined` are left out; other non-string values are
 * converted with `String()`.
 */
export const normalizeHeaders = (headers) => {
    const normalized = {};
    for (const headerName in headers) {
        const raw = headers[headerName];
        if (raw !== undefined) {
            normalized[headerName] = String(raw);
        }
    }
    return normalized;
};
11
/**
 * Finds the value of the first header whose name equals `name` ignoring case.
 * Returns `undefined` when no header matches.
 */
export const getHeaderValue = (headers, name) => {
    const wanted = name.toLowerCase();
    let found;
    for (const headerName in headers) {
        if (headerName.toLowerCase() !== wanted) {
            continue;
        }
        found = headers[headerName];
        break;
    }
    return found;
};
@@ -0,0 +1,6 @@
1
+ import type { HTTPRequest } from "puppeteer";
2
/** Returns an id for the frame that issued `request`, or `undefined` when the request has no frame or no id can be read from it. */
+ export declare const getFrameId: (request: HTTPRequest) => string | undefined;
3
/** Returns the `type` and `url` of the request's initiator, or `undefined` when the request reports none. */
+ export declare const getInitiator: (request: HTTPRequest) => {
4
+ type: string | undefined;
5
+ url: string | undefined;
6
+ } | undefined;
@@ -0,0 +1,23 @@
1
/**
 * Extracts an id from the frame that issued `request`.
 *
 * Tries, in order: a truthy `_id`, a truthy `_frameId`, then the result of an
 * `id()` method. Returns `undefined` when the request has no frame or none of
 * these is present.
 */
export const getFrameId = (request) => {
    const owner = request.frame();
    if (!owner) {
        return undefined;
    }
    return (owner._id ||
        owner._frameId ||
        (typeof owner.id === "function" ? owner.id() : undefined));
};
14
/**
 * Reads the request's initiator through its optional `initiator()` method and
 * reduces it to `{ type, url }`. Returns `undefined` when the method is missing
 * or yields a falsy value.
 */
export const getInitiator = (request) => {
    const info = request.initiator?.();
    return info ? { type: info.type, url: info.url } : undefined;
};
@@ -0,0 +1,3 @@
1
+ import type { TriggerAction } from "@pagepocket/lib";
2
+ import type { Page } from "puppeteer";
3
/** Runs the given trigger actions inside the page; does nothing when `actions` is omitted or empty. */
+ export declare const runTriggerActions: (page: Page, actions?: TriggerAction[]) => Promise<void>;
@@ -0,0 +1,36 @@
1
+ export const runTriggerActions = async (page, actions = []) => {
2
+ if (actions.length === 0)
3
+ return;
4
+ await page.evaluate((actionsArg) => {
5
+ const hoverAll = () => {
6
+ const elements = Array.from(document.querySelectorAll("*"));
7
+ const trigger = (el) => {
8
+ const rect = el.getBoundingClientRect();
9
+ const x = rect.left + rect.width / 2;
10
+ const y = rect.top + rect.height / 2;
11
+ const event = new MouseEvent("mouseover", {
12
+ bubbles: true,
13
+ cancelable: true,
14
+ clientX: x,
15
+ clientY: y
16
+ });
17
+ el.dispatchEvent(event);
18
+ };
19
+ elements.forEach(trigger);
20
+ };
21
+ const scrollToEnd = () => {
22
+ const scrollHeight = document.documentElement?.scrollHeight ?? document.body?.scrollHeight;
23
+ if (typeof scrollHeight === "number") {
24
+ window.scrollTo({ top: scrollHeight, behavior: "instant" });
25
+ }
26
+ };
27
+ for (const action of actionsArg) {
28
+ if (action === "HOVER") {
29
+ hoverAll();
30
+ }
31
+ if (action === "SCROLL_TO_END") {
32
+ scrollToEnd();
33
+ }
34
+ }
35
+ }, actions);
36
+ };
package/package.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "name": "@pagepocket/capture-http-puppeteer-unit",
3
+ "version": "0.8.0",
4
+ "description": "PagePocket plugin: capture HTTP events (puppeteer adapter)",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "files": [
9
+ "dist"
10
+ ],
11
+ "license": "ISC",
12
+ "dependencies": {
13
+ "puppeteer": "^22.12.1",
14
+ "@pagepocket/lib": "0.8.0",
15
+ "@pagepocket/contracts": "0.8.0"
16
+ },
17
+ "devDependencies": {
18
+ "typescript": "^5.4.5"
19
+ },
20
+ "scripts": {
21
+ "build": "tsc -p tsconfig.json",
22
+ "test": "node -e \"process.exit(0)\""
23
+ }
24
+ }