@pagepocket/capture-http-puppeteer-unit 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/capture-http-puppeteer-plugin.d.ts +9 -0
- package/dist/capture-http-puppeteer-plugin.js +146 -0
- package/dist/capture-http-puppeteer-unit.d.ts +19 -0
- package/dist/capture-http-puppeteer-unit.js +137 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +1 -0
- package/dist/internal/puppeteer-adapter.d.ts +29 -0
- package/dist/internal/puppeteer-adapter.js +261 -0
- package/dist/internal/utils/dom-html.d.ts +13 -0
- package/dist/internal/utils/dom-html.js +15 -0
- package/dist/internal/utils/env.d.ts +1 -0
- package/dist/internal/utils/env.js +5 -0
- package/dist/internal/utils/errors.d.ts +10 -0
- package/dist/internal/utils/errors.js +52 -0
- package/dist/internal/utils/headers.d.ts +2 -0
- package/dist/internal/utils/headers.js +19 -0
- package/dist/internal/utils/puppeteer-internals.d.ts +6 -0
- package/dist/internal/utils/puppeteer-internals.js +23 -0
- package/dist/internal/utils/trigger-actions.d.ts +3 -0
- package/dist/internal/utils/trigger-actions.js +36 -0
- package/package.json +24 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { type PagePocketContext, type PagePocketPlugin } from "@pagepocket/lib";
|
|
2
|
+
import { type PuppeteerAdapterOptions } from "./internal/puppeteer-adapter.js";
|
|
3
|
+
/**
 * Options accepted by `CaptureHttpPuppeteerPlugin`.
 *
 * This is a plain alias of the adapter options; the plugin adds no settings of its own.
 */
export type CaptureHttpPuppeteerPluginOptions = PuppeteerAdapterOptions;

/**
 * PagePocket plugin that captures HTTP events through the Puppeteer adapter.
 */
export declare class CaptureHttpPuppeteerPlugin implements PagePocketPlugin {
    /** Fixed plugin identifier. */
    readonly name = "plugin:capture-http-puppeteer";

    /** Adapter options supplied to the constructor. */
    private adapterOptions;

    /**
     * @param options Optional adapter options.
     */
    constructor(options?: CaptureHttpPuppeteerPluginOptions);

    /**
     * Installs the plugin on a PagePocket context.
     *
     * @param ctx Context the plugin attaches itself to.
     */
    apply(ctx: PagePocketContext): void;
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import { createMemoryContentStore } from "@pagepocket/lib";
|
|
2
|
+
import { InflightTracker, networkIdle, normalizeCompletion, timeout } from "@pagepocket/lib";
|
|
3
|
+
import { PuppeteerAdapter } from "./internal/puppeteer-adapter.js";
|
|
4
|
+
/**
 * Turns a header record (`{ name: value }`) into a list of `{ name, value }`
 * pairs, in the record's own key order.
 *
 * A missing (falsy) record produces an empty list.
 */
const headersRecordToList = (headers) =>
    headers
        ? Object.entries(headers).map(([name, value]) => ({ name, value }))
        : [];
|
|
9
|
+
/**
 * PagePocket plugin that records HTTP traffic through a `PuppeteerAdapter` session.
 *
 * `apply` publishes the capture artifacts on `ctx.capture` and registers two hooks:
 * - `onInit` starts the adapter session, navigates for `url` entries and sets the
 *   entry HTML when the context does not have one yet;
 * - `onBeforeNetwork` starts the staged capture, waits for a completion strategy
 *   and stops the session.
 */
export class CaptureHttpPuppeteerPlugin {
    /**
     * @param options Options forwarded unchanged to `PuppeteerAdapter`; defaults to `{}`.
     */
    constructor(options) {
        this.name = "plugin:capture-http-puppeteer";
        this.adapterOptions = options ?? {};
    }
    /**
     * Wires the plugin into `ctx`.
     *
     * @param ctx PagePocket context; `capture` is assigned and the `onInit` /
     *   `onBeforeNetwork` hooks are registered on it.
     */
    apply(ctx) {
        const contentStore = createMemoryContentStore("capture-http-puppeteer");
        const events = [];
        const capabilities = {
            requestHeaders: "approx",
            responseHeaders: "approx",
            requestBodies: false,
            responseBodies: "decoded",
            httpVersion: false,
            remoteIp: false,
            headerOrderPreserved: false
        };
        ctx.capture = {
            events,
            contentStore,
            capabilities
        };
        const inflightTracker = new InflightTracker();
        // Translates one adapter event into a capture event. Response bodies are
        // written to the content store first, so a response event is appended only
        // after `contentStore.put` has resolved.
        const handleNetworkEvent = async (event) => {
            inflightTracker.handleEvent(event);
            ctx.emitNetworkEvent?.(event);
            if (event.type === "request") {
                events.push({
                    type: "http.request",
                    requestId: event.requestId,
                    url: event.url,
                    method: event.method,
                    headers: headersRecordToList(event.headers),
                    timestamp: event.timestamp,
                    frameId: event.frameId,
                    resourceType: event.resourceType,
                    initiator: event.initiator
                });
                return;
            }
            if (event.type === "failed") {
                events.push({
                    type: "http.failed",
                    requestId: event.requestId,
                    url: event.url,
                    errorText: event.errorText,
                    timestamp: event.timestamp
                });
                return;
            }
            const bodyRef = event.body
                ? await contentStore.put(event.body, {
                    url: event.url,
                    mimeType: event.mimeType,
                    sizeHint: undefined
                })
                : undefined;
            events.push({
                type: "http.response",
                requestId: event.requestId,
                url: event.url,
                status: event.status,
                statusText: event.statusText,
                headers: headersRecordToList(event.headers),
                timestamp: event.timestamp,
                mimeType: event.mimeType,
                fromDiskCache: event.fromDiskCache,
                fromServiceWorker: event.fromServiceWorker,
                bodyRef,
                bodySize: undefined
            });
        };
        const stateKey = "captureHttpPuppeteer.session";
        // Cleanup for failure paths: a stop failure is logged instead of thrown so
        // it cannot replace the error that triggered the cleanup.
        const stopQuietly = async (session) => {
            try {
                await session.stop();
            }
            catch (stopError) {
                // eslint-disable-next-line no-console
                console.warn("[pagepocket][capture-http-puppeteer] failed to stop session", stopError);
            }
        };
        ctx.onInit(async () => {
            const entry = ctx.entry;
            let target;
            if (entry.kind === "url") {
                target = { kind: "url", url: entry.url };
            }
            else if (entry.kind === "puppeteer-page") {
                target = { kind: "puppeteer-page", page: entry.page };
            }
            else if (entry.kind === "html-string") {
                const { htmlString } = await ctx.whenHtml();
                target = {
                    kind: "html",
                    htmlString,
                    baseUrl: entry.baseUrl,
                    ...(entry.url ? { url: entry.url } : {})
                };
            }
            else {
                throw new Error(`CaptureHttpPuppeteerPlugin does not support entry kind: ${String(entry.kind)}`);
            }
            const adapter = new PuppeteerAdapter(this.adapterOptions);
            const session = await adapter.start(target, {
                onEvent(event) {
                    // Event handling is fire-and-forget, so failures are caught and
                    // logged here rather than left as unhandled promise rejections.
                    handleNetworkEvent(event).catch((error) => {
                        // eslint-disable-next-line no-console
                        console.warn("[pagepocket][capture-http-puppeteer] failed to record network event", error);
                    });
                },
                onError(error) {
                    // eslint-disable-next-line no-console
                    console.warn("[pagepocket][capture-http-puppeteer] adapter error", error);
                }
            }, {
                timeoutMs: ctx.options.timeoutMs,
                maxDurationMs: ctx.options.maxDurationMs
            });
            ctx.state[stateKey] = session;
            try {
                if (target.kind === "url" && session.navigate) {
                    await session.navigate(target.url);
                }
                if (!ctx.html) {
                    const html = await session.waitForHtml();
                    ctx.setHtml(html);
                }
            }
            catch (error) {
                // The session is already running; release it (and any browser it
                // launched) before surfacing the failure.
                delete ctx.state[stateKey];
                await stopQuietly(session);
                throw error;
            }
        });
        ctx.onBeforeNetwork(async () => {
            const session = ctx.state[stateKey];
            if (!session) {
                throw new Error("CaptureHttpPuppeteerPlugin internal error: missing session");
            }
            try {
                await session.startCapture();
                const completionStrategies = normalizeCompletion(ctx.options.completion);
                const idleMs = ctx.options.timeoutMs ?? 5000;
                const maxDurationMs = ctx.options.maxDurationMs;
                // Explicit strategies win; otherwise wait for network idle, bounded by
                // `maxDurationMs` when one is configured.
                const completion = completionStrategies.length > 0
                    ? completionStrategies
                    : [networkIdle(idleMs), ...(maxDurationMs !== undefined ? [timeout(maxDurationMs)] : [])];
                // The first strategy to finish ends the capture; racing a single
                // strategy is the same as awaiting it.
                await Promise.race(completion.map((strategy) => strategy.wait({
                    now: () => Date.now(),
                    getStats: () => inflightTracker.getStats()
                })));
            }
            catch (error) {
                await stopQuietly(session);
                throw error;
            }
            // NOTE(review): response events whose body write is still pending at this
            // point are appended to `events` after the session stops — confirm that
            // consumers tolerate late appends.
            await session.stop();
        });
    }
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { Unit, type CaptureArtifacts } from "@pagepocket/lib";
|
|
2
|
+
import { type PuppeteerAdapterOptions } from "./internal/puppeteer-adapter.js";
|
|
3
|
+
/**
 * Options accepted by `CaptureHttpPuppeteerUnit`.
 *
 * This is a plain alias of the adapter options; the unit adds no settings of its own.
 */
export type CaptureHttpPuppeteerUnitOptions = PuppeteerAdapterOptions;

/**
 * Unit that captures HTTP events through the Puppeteer adapter.
 */
export declare class CaptureHttpPuppeteerUnit extends Unit {
    /** Fixed unit identifier. */
    readonly id = "captureHttpPuppeteer";

    /** Fixed unit kind. */
    readonly kind = "capture.http.puppeteer";

    /** Adapter options supplied to the constructor. */
    private adapterOptions;

    /**
     * @param options Optional adapter options.
     */
    constructor(options?: CaptureHttpPuppeteerUnitOptions);

    /**
     * Runs the capture.
     *
     * @param ctx Unit context.
     * @param rt Unit runtime.
     * @returns The capture artifacts, the entry HTML and a DOM HTML snapshot.
     */
    run(ctx: import("@pagepocket/lib").UnitContext, rt: import("@pagepocket/lib").UnitRuntime): Promise<{
        capture: CaptureArtifacts;
        html: {};
        domHtml: {
            htmlString: string;
            baseUrl: string;
            url?: string;
            contentType?: string;
        };
    }>;
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { NETWORK } from "@pagepocket/contracts";
|
|
2
|
+
import { Unit, createMemoryContentStore, InflightTracker, mapKind, networkIdle, normalizeCompletion, throwUnsupportedEntryKind, timeout } from "@pagepocket/lib";
|
|
3
|
+
import { PuppeteerAdapter } from "./internal/puppeteer-adapter.js";
|
|
4
|
+
/**
 * Turns a header record (`{ name: value }`) into a list of `{ name, value }`
 * pairs, in the record's own key order.
 *
 * A missing (falsy) record produces an empty list.
 */
const headersRecordToList = (headers) =>
    headers
        ? Object.entries(headers).map(([name, value]) => ({ name, value }))
        : [];
|
|
9
|
+
/**
 * Maps each supported entry kind to a function that builds the matching
 * intercept target.
 *
 * `html-string` entries become `html` targets; their `url` is copied only when
 * it is truthy.
 */
const targetBuilders = {
    url: ({ url }) => ({ kind: "url", url }),
    "puppeteer-page": ({ page }) => ({ kind: "puppeteer-page", page }),
    "html-string": ({ htmlString, baseUrl, url }) => ({
        kind: "html",
        htmlString,
        baseUrl,
        ...(url ? { url } : {})
    })
};
|
|
25
|
+
/**
 * Unit that records HTTP traffic through a `PuppeteerAdapter` session.
 *
 * `run` starts a session for the runtime entry, publishes every adapter event
 * on the `NETWORK` channel, waits for a completion strategy and returns the
 * capture artifacts together with the entry HTML and a DOM HTML snapshot.
 */
export class CaptureHttpPuppeteerUnit extends Unit {
    /**
     * @param options Options forwarded unchanged to `PuppeteerAdapter`; defaults to `{}`.
     */
    constructor(options) {
        super();
        this.id = "captureHttpPuppeteer";
        this.kind = "capture.http.puppeteer";
        this.adapterOptions = options ?? {};
    }
    /**
     * Runs one capture.
     *
     * @param ctx Unit context; `ctx.value.html` is reused as the entry HTML when present.
     * @param rt Unit runtime providing `entry`, `options` and `publish`.
     * @returns `{ capture, html, domHtml }`.
     * @throws Any error raised while starting, navigating, capturing or stopping;
     *   the session is stopped before a capture-time error is rethrown.
     */
    async run(ctx, rt) {
        const contentStore = createMemoryContentStore("capture-http-puppeteer");
        const events = [];
        const capabilities = {
            requestHeaders: "approx",
            responseHeaders: "approx",
            requestBodies: false,
            responseBodies: "decoded",
            httpVersion: false,
            remoteIp: false,
            headerOrderPreserved: false
        };
        const inflightTracker = new InflightTracker();
        // Translates one adapter event into a capture event. Response bodies are
        // written to the content store first, so a response event is appended only
        // after `contentStore.put` has resolved.
        const handleNetworkEvent = async (event) => {
            inflightTracker.handleEvent(event);
            rt.publish(NETWORK, event);
            if (event.type === "request") {
                events.push({
                    type: "http.request",
                    requestId: event.requestId,
                    url: event.url,
                    method: event.method,
                    headers: headersRecordToList(event.headers),
                    timestamp: event.timestamp,
                    frameId: event.frameId,
                    resourceType: event.resourceType,
                    initiator: event.initiator
                });
                return;
            }
            if (event.type === "failed") {
                events.push({
                    type: "http.failed",
                    requestId: event.requestId,
                    url: event.url,
                    errorText: event.errorText,
                    timestamp: event.timestamp
                });
                return;
            }
            const bodyRef = event.body
                ? await contentStore.put(event.body, {
                    url: event.url,
                    mimeType: event.mimeType,
                    sizeHint: undefined
                })
                : undefined;
            events.push({
                type: "http.response",
                requestId: event.requestId,
                url: event.url,
                status: event.status,
                statusText: event.statusText,
                headers: headersRecordToList(event.headers),
                timestamp: event.timestamp,
                mimeType: event.mimeType,
                fromDiskCache: event.fromDiskCache,
                fromServiceWorker: event.fromServiceWorker,
                bodyRef,
                bodySize: undefined
            });
        };
        const capture = { events, contentStore, capabilities };
        const adapter = new PuppeteerAdapter(this.adapterOptions);
        const target = mapKind(rt.entry, targetBuilders, {
            onUnsupportedKind: throwUnsupportedEntryKind("CaptureHttpPuppeteerUnit")
        });
        const session = await adapter.start(target, {
            onEvent: (event) => {
                // Event handling is fire-and-forget, so failures are caught and
                // logged here rather than left as unhandled promise rejections.
                handleNetworkEvent(event).catch((error) => {
                    console.warn("[pagepocket][capture-http-puppeteer] failed to record network event", error);
                });
            },
            onError: (error) => {
                console.warn("[pagepocket][capture-http-puppeteer] adapter error", error);
            }
        }, {
            timeoutMs: rt.options.timeoutMs,
            maxDurationMs: rt.options.maxDurationMs
        });
        let html;
        let domHtml;
        try {
            if (target.kind === "url" && session.navigate) {
                await session.navigate(target.url);
            }
            html = ctx.value.html ?? (await session.waitForHtml());
            await session.startCapture();
            const completionStrategies = normalizeCompletion(rt.options.completion);
            const idleMs = rt.options.timeoutMs ?? 5000;
            const maxDurationMs = rt.options.maxDurationMs;
            // Explicit strategies win; otherwise wait for network idle, bounded by
            // `maxDurationMs` when one is configured.
            const completion = completionStrategies.length > 0
                ? completionStrategies
                : [networkIdle(idleMs), ...(maxDurationMs !== undefined ? [timeout(maxDurationMs)] : [])];
            // The first strategy to finish ends the capture; racing a single
            // strategy is the same as awaiting it.
            await Promise.race(completion.map((strategy) => strategy.wait({
                now: () => Date.now(),
                getStats: () => inflightTracker.getStats()
            })));
            domHtml = session.getDomHtml ? await session.getDomHtml() : await session.waitForHtml();
        }
        catch (error) {
            // Release the session (and any browser it launched) before surfacing the
            // failure. A stop failure is only logged so it cannot mask `error`.
            try {
                await session.stop();
            }
            catch (stopError) {
                console.warn("[pagepocket][capture-http-puppeteer] failed to stop session", stopError);
            }
            throw error;
        }
        // NOTE(review): response events whose body write is still pending at this
        // point are appended to `events` after the result is returned — confirm
        // that consumers tolerate late appends.
        await session.stop();
        return { capture, html, domHtml };
    }
}
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { CaptureHttpPuppeteerUnit } from "./capture-http-puppeteer-unit.js";
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { NetworkEventHandlers, NetworkInterceptorAdapter, TriggerAction, InterceptTarget, InterceptSession } from "@pagepocket/lib";
|
|
2
|
+
import { type Browser, type GoToOptions, type LaunchOptions, type Page } from "puppeteer";
|
|
3
|
+
/**
 * Construction options for `PuppeteerAdapter`. Every field is optional.
 */
export type PuppeteerAdapterOptions = {
    /** Existing browser to open the capture page in. */
    browser?: Browser;
    /** Existing page to capture on. */
    page?: Page;
    /** Custom launcher used instead of Puppeteer's own `launch`. */
    launch?: (options?: LaunchOptions) => Promise<Browser>;
    /** Options passed to the launcher. */
    launchOptions?: LaunchOptions;
    /** Options passed to page navigation. */
    gotoOptions?: GoToOptions;
    /** Actions run on the page when capture starts. */
    triggerActions?: TriggerAction[];
};

/**
 * Internal adapter for the capture plugin.
 *
 * Note: this is intentionally NOT published as a separate package.
 */
export declare class PuppeteerAdapter implements NetworkInterceptorAdapter {
    /** Fixed adapter name. */
    readonly name = "puppeteer";

    /** Feature flags describing what this adapter can provide. */
    readonly capabilities: {
        canGetResponseBody: boolean;
        canStreamResponseBody: boolean;
        canGetRequestBody: boolean;
        providesResourceType: boolean;
        canWaitForHtml: boolean;
        supportsStagedCapture: boolean;
    };

    /** Options supplied to the constructor. */
    private options;

    /**
     * @param options Optional adapter options.
     */
    constructor(options?: PuppeteerAdapterOptions);

    /**
     * Starts an intercept session for `target`.
     *
     * @param target What to capture.
     * @param handlers Callbacks receiving network events and errors.
     * @param _options Additional options; the implementation does not read them.
     * @returns The running session.
     */
    start(target: InterceptTarget, handlers: NetworkEventHandlers, _options?: Record<string, unknown>): Promise<InterceptSession>;
}
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
import puppeteer from "puppeteer";
|
|
2
|
+
import { readDomHtml } from "./utils/dom-html.js";
|
|
3
|
+
import { getEnvString } from "./utils/env.js";
|
|
4
|
+
import { buildMissingChromeHelp, isMissingChromeError, isResponseBodyUnavailableError } from "./utils/errors.js";
|
|
5
|
+
import { getHeaderValue, normalizeHeaders } from "./utils/headers.js";
|
|
6
|
+
import { getFrameId, getInitiator } from "./utils/puppeteer-internals.js";
|
|
7
|
+
import { runTriggerActions } from "./utils/trigger-actions.js";
|
|
8
|
+
/**
 * Reads the resource type of a request.
 *
 * Returns `undefined` when the request has no `resourceType` method or when
 * the method yields a falsy value.
 */
const toResourceType = (request) => request.resourceType?.() || undefined;
|
|
12
|
+
/**
 * Escapes a value for use inside a double-quoted HTML attribute, so that a
 * base URL containing `"`, `<`, `>` or `&` cannot break out of `href="…"`.
 */
const escapeBaseHrefAttribute = (value) => String(value)
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
/**
 * Internal adapter for the capture plugin.
 *
 * Note: this is intentionally NOT published as a separate package.
 */
export class PuppeteerAdapter {
    /**
     * @param options Adapter options (`browser`, `page`, `launch`, `launchOptions`,
     *   `gotoOptions`, `triggerActions`); defaults to `{}`.
     */
    constructor(options = {}) {
        this.name = "puppeteer";
        this.capabilities = {
            canGetResponseBody: true,
            canStreamResponseBody: false,
            canGetRequestBody: false,
            providesResourceType: true,
            canWaitForHtml: true,
            supportsStagedCapture: true
        };
        this.options = options;
    }
    /**
     * Starts an intercept session on a Puppeteer page.
     *
     * For `url` and `html` targets the page comes from `options.page`, otherwise
     * from a new page in `options.browser`, otherwise from a freshly launched
     * browser. Pages and browsers created here are closed again by `stop()`.
     * For `puppeteer-page` targets the supplied page is used and never closed.
     *
     * @param target Intercept target of kind `url`, `html` or `puppeteer-page`.
     * @param handlers `onEvent` receives request/response/failed events; the
     *   optional `onError` receives body-read problems.
     * @param _options Unused.
     * @returns Session with `navigate` (url targets only), `waitForHtml`,
     *   `getDomHtml`, `startCapture` and `stop`.
     * @throws Error for unsupported target kinds, for a missing or invalid page, and
     *   when launching the browser or opening a page fails (a missing-Chrome
     *   failure is rethrown with installation help appended).
     */
    async start(target, handlers, _options) {
        let browser = this.options.browser;
        let page = this.options.page;
        let ownsBrowser = false;
        let ownsPage = false;
        if (target.kind === "puppeteer-page") {
            page = target.page;
        }
        else if (target.kind === "url" || target.kind === "html") {
            if (!page) {
                if (!browser) {
                    const doLaunch = this.options.launch ?? puppeteer.launch;
                    const envExecutablePath = getEnvString("PUPPETEER_EXECUTABLE_PATH");
                    const launchOptionsFromUser = this.options.launchOptions;
                    // An explicit launch option wins over the environment variable.
                    const executablePath = launchOptionsFromUser?.executablePath ?? envExecutablePath;
                    try {
                        browser = await doLaunch({
                            ...launchOptionsFromUser,
                            ...(executablePath ? { executablePath } : {}),
                            // NOTE(review): listed after the user options, so a
                            // caller-supplied `headless` value is overridden — confirm
                            // this is intentional.
                            headless: false
                        });
                    }
                    catch (error) {
                        if (!isMissingChromeError(error)) {
                            throw error;
                        }
                        const message = error && typeof error.message === "string"
                            ? error.message
                            : String(error);
                        throw new Error(`${message}\n\n${buildMissingChromeHelp()}`);
                    }
                    ownsBrowser = true;
                }
                try {
                    page = await browser.newPage();
                }
                catch (error) {
                    // Do not leave a browser we launched running when no page
                    // could be opened in it.
                    if (ownsBrowser) {
                        try {
                            await browser.close();
                        }
                        catch {
                            // Ignored on purpose: the newPage() failure is the error to report.
                        }
                    }
                    throw error;
                }
                ownsPage = true;
            }
        }
        else {
            throw new Error("PuppeteerAdapter only supports url, html, or puppeteer-page targets.");
        }
        if (!page || typeof page.on !== "function") {
            throw new Error("PuppeteerAdapter requires a Puppeteer Page instance.");
        }
        // Request ids are synthetic: one id per request object, generated on first
        // use and remembered so the response/failure events reuse it.
        const requestIds = new WeakMap();
        let requestSequence = 0;
        const getRequestId = (request) => {
            const existing = requestIds.get(request);
            if (existing) {
                return existing;
            }
            const requestId = `pptr-${Date.now()}-${requestSequence++}`;
            requestIds.set(request, requestId);
            return requestId;
        };
        const emitRequest = (request, timestamp) => {
            const requestEvent = {
                type: "request",
                requestId: getRequestId(request),
                url: request.url(),
                method: request.method(),
                headers: normalizeHeaders(request.headers()),
                frameId: getFrameId(request),
                resourceType: toResourceType(request),
                initiator: getInitiator(request),
                timestamp
            };
            handlers.onEvent(requestEvent);
        };
        const onRequest = (request) => {
            const timestamp = Date.now();
            emitRequest(request, timestamp);
        };
        const onResponse = (response) => {
            const timestamp = Date.now();
            const request = response.request();
            // A response for a request we never saw still gets a request event first.
            if (!requestIds.has(request)) {
                emitRequest(request, timestamp);
            }
            const headers = normalizeHeaders(response.headers());
            // Decides whether an unavailable body is normal for this exchange, so
            // that only surprising cases are reported through `onError`.
            const isExpectedMissingBody = () => {
                const method = request.method().toUpperCase();
                if (method === "OPTIONS" || method === "HEAD") {
                    return true;
                }
                const status = response.status();
                if (status >= 100 && status < 200) {
                    return true;
                }
                if (status >= 300 && status < 400) {
                    return true;
                }
                if (status === 206) {
                    // Range / streaming responses are frequently not retrievable via CDP.
                    // Treat missing bodies as expected to avoid noisy adapter errors.
                    const contentRange = getHeaderValue(headers, "content-range");
                    const contentType = getHeaderValue(headers, "content-type") ?? "";
                    if (contentRange) {
                        return true;
                    }
                    if (/^video\//i.test(contentType) || /^audio\//i.test(contentType)) {
                        return true;
                    }
                }
                return status === 204 || status === 205 || status === 304;
            };
            const responseEvent = {
                type: "response",
                requestId: getRequestId(request),
                url: response.url(),
                status: response.status(),
                statusText: response.statusText(),
                headers,
                mimeType: getHeaderValue(headers, "content-type"),
                fromDiskCache: response.fromCache(),
                fromServiceWorker: response.fromServiceWorker(),
                timestamp,
                body: {
                    kind: "late",
                    // The body is read only when the consumer asks for it. Any read
                    // failure yields an empty body; it is reported via `onError`
                    // unless the missing body was expected.
                    read: async () => {
                        try {
                            const buffer = await response.buffer();
                            return new Uint8Array(buffer);
                        }
                        catch (error) {
                            if (isResponseBodyUnavailableError(error)) {
                                if (!isExpectedMissingBody()) {
                                    const method = request.method();
                                    const status = response.status();
                                    const url = response.url();
                                    handlers.onError?.(new Error(`Unexpected missing response body (method=${method} status=${status}) for ${url}`));
                                }
                                return new Uint8Array();
                            }
                            handlers.onError?.(error);
                            return new Uint8Array();
                        }
                    }
                }
            };
            handlers.onEvent(responseEvent);
        };
        const onRequestFailed = (request) => {
            const timestamp = Date.now();
            const failure = request.failure?.();
            // Must be checked before getRequestId(), which registers the request.
            const hadRequest = requestIds.has(request);
            const requestId = getRequestId(request);
            if (!hadRequest) {
                emitRequest(request, timestamp);
            }
            const failedEvent = {
                type: "failed",
                requestId,
                url: request.url(),
                errorText: failure?.errorText ?? "Request failed",
                timestamp
            };
            handlers.onEvent(failedEvent);
        };
        page.on("request", onRequest);
        page.on("response", onResponse);
        page.on("requestfailed", onRequestFailed);
        let navigationPromise = null;
        // Per-call options override the adapter-level `gotoOptions`.
        const navigate = async (url, options) => {
            navigationPromise = page.goto(url, {
                ...this.options.gotoOptions,
                ...options
            });
            await navigationPromise;
        };
        // Loads an `html` target into the page, injecting `<base href>` so relative
        // URLs resolve against `target.baseUrl` unless the markup already has a base.
        const ensureHtmlTargetLoaded = async () => {
            if (target.kind !== "html") {
                return;
            }
            const baseTag = `<base href="${escapeBaseHrefAttribute(target.baseUrl)}">`;
            const alreadyHasBase = /<base\s+/i.test(target.htmlString);
            let htmlWithBase = target.htmlString;
            if (!alreadyHasBase) {
                const headOpenTag = /<head(\s[^>]*)?>/i;
                htmlWithBase = headOpenTag.test(target.htmlString)
                    ? target.htmlString.replace(headOpenTag, (match) => `${match}${baseTag}`)
                    // No explicit <head>: place the tag after a leading doctype and
                    // <html> tag (when present) instead of silently dropping it.
                    : target.htmlString.replace(/^(\s*(?:<!doctype[^>]*>)?\s*(?:<html(?:\s[^>]*)?>)?)/i, (prefix) => `${prefix}${baseTag}`);
            }
            await page.setContent(htmlWithBase, { waitUntil: "domcontentloaded" });
        };
        let htmlLoaded = false;
        // Ensures the target content is in the page: reuses a navigation already
        // started via `navigate`, starts one otherwise, or sets the HTML content.
        const whenHtmlLoaded = async () => {
            if (htmlLoaded) {
                return;
            }
            if (target.kind === "url") {
                await (navigationPromise ?? navigate(target.url));
            }
            else if (target.kind === "html") {
                await ensureHtmlTargetLoaded();
            }
            htmlLoaded = true;
        };
        // `html` targets return the supplied markup as-is, without touching the page.
        const waitForHtml = async () => {
            if (target.kind === "html") {
                return {
                    htmlString: target.htmlString,
                    baseUrl: target.baseUrl,
                    url: target.url,
                    contentType: "text/html"
                };
            }
            await whenHtmlLoaded();
            return readDomHtml(page);
        };
        const getDomHtml = async () => {
            await whenHtmlLoaded();
            return readDomHtml(page);
        };
        const startCapture = async () => {
            await whenHtmlLoaded();
            await runTriggerActions(page, this.options.triggerActions);
        };
        return {
            navigate: target.kind === "url" ? navigate : undefined,
            waitForHtml,
            getDomHtml,
            startCapture,
            stop: async () => {
                page.off("request", onRequest);
                page.off("response", onResponse);
                page.off("requestfailed", onRequestFailed);
                try {
                    if (ownsPage) {
                        await page.close();
                    }
                }
                finally {
                    // Close a browser we launched even when closing the page failed.
                    if (ownsBrowser && browser) {
                        await browser.close();
                    }
                }
            }
        };
    }
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Page } from "puppeteer";
|
|
2
|
+
/**
 * Reads the current page DOM as HTML.
 *
 * This is used both for the early HTML milestone (`html@1`) and the post-load
 * DOM snapshot (`dom-html@1`).
 *
 * @param page Puppeteer page to read from.
 * @returns The serialized markup together with the page URL, which is reported
 *   as both `url` and `baseUrl`.
 */
export declare const readDomHtml: (page: Page) => Promise<{
    htmlString: string;
    baseUrl: string;
    url: string;
    contentType: string;
}>;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reads the current page DOM as HTML.
|
|
3
|
+
*
|
|
4
|
+
* This is used both for the early HTML milestone (`html@1`) and the post-load
|
|
5
|
+
* DOM snapshot (`dom-html@1`).
|
|
6
|
+
*/
|
|
7
|
+
export const readDomHtml = async (page) => {
|
|
8
|
+
const [url, htmlString] = await Promise.all([page.url(), page.content()]);
|
|
9
|
+
return {
|
|
10
|
+
htmlString,
|
|
11
|
+
baseUrl: url,
|
|
12
|
+
url,
|
|
13
|
+
contentType: "text/html"
|
|
14
|
+
};
|
|
15
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Looks up a string setting by name; `undefined` means no value is available.
 *
 * NOTE(review): presumably reads the process environment variable `name` (the
 * adapter passes "PUPPETEER_EXECUTABLE_PATH") — the implementation is not part
 * of this view, confirm against env.js.
 */
export declare const getEnvString: (name: string) => string | undefined;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * True when the error message says Puppeteer could not find Chrome or Chromium.
 */
export declare const isMissingChromeError: (error: unknown) => boolean;

/**
 * Builds the multi-line help text explaining how to make a Chrome executable
 * available to Puppeteer.
 */
export declare const buildMissingChromeHelp: () => string;

/**
 * True when the error message reports that `Network.getResponseBody` found no data.
 */
export declare const isNoDataForResourceError: (error: unknown) => boolean;

/**
 * True when Puppeteer/CDP signals the response body is not available.
 *
 * This is not always an error. For example, CORS preflight requests (OPTIONS)
 * or 204/304 responses may legitimately have no retrievable body.
 */
export declare const isResponseBodyUnavailableError: (error: unknown) => boolean;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
 * Tells whether an error reports that Puppeteer could not locate a browser.
 *
 * Only a string `message` property is inspected; anything else counts as
 * "not a missing-Chrome error".
 */
export const isMissingChromeError = (error) => {
    const text = error && typeof error.message === "string" ? error.message : "";
    const markers = ["Could not find Chrome", "Could not find Chromium"];
    return markers.some((marker) => text.includes(marker));
};
|
|
7
|
+
/**
 * Builds the help text shown when Puppeteer cannot find a Chrome executable.
 *
 * The text states the reason and lists two fixes: installing Chrome for
 * Puppeteer (via pnpm or npx) or setting PUPPETEER_EXECUTABLE_PATH to an
 * existing Chrome/Chromium binary.
 */
export const buildMissingChromeHelp = () => {
|
|
8
|
+
return `Puppeteer could not find a compatible Chrome installation.
|
|
9
|
+
|
|
10
|
+
Reason:
|
|
11
|
+
This machine does not have a Chrome executable available to Puppeteer.
|
|
12
|
+
|
|
13
|
+
Fix options:
|
|
14
|
+
1) Install Chrome for Puppeteer (recommended):
|
|
15
|
+
- pnpm dlx puppeteer browsers install chrome
|
|
16
|
+
- or: npx puppeteer browsers install chrome
|
|
17
|
+
|
|
18
|
+
2) Use an existing system Chrome/Chromium:
|
|
19
|
+
- Set PUPPETEER_EXECUTABLE_PATH=/path/to/chrome
|
|
20
|
+
`;
|
|
21
|
+
};
|
|
22
|
+
/**
 * Tells whether an error is the "no data found" failure of
 * `Network.getResponseBody`.
 *
 * Only non-null objects with a string `message` can match; the message must
 * contain both "Network.getResponseBody" and "No data found".
 */
export const isNoDataForResourceError = (error) => {
    const text = typeof error === "object" && error !== null ? error.message : undefined;
    return typeof text === "string"
        && text.includes("Network.getResponseBody")
        && text.includes("No data found");
};
|
|
32
|
+
/**
 * True when Puppeteer/CDP signals the response body is not available.
 *
 * This is not always an error. For example, CORS preflight requests (OPTIONS)
 * or 204/304 responses may legitimately have no retrievable body.
 *
 * Only non-null objects with a string `message` can match. A match is either
 * the "no data found" case or one of the known message fragments below.
 */
export const isResponseBodyUnavailableError = (error) => {
    const text = typeof error === "object" && error !== null ? error.message : undefined;
    if (typeof text !== "string") {
        return false;
    }
    const knownFragments = [
        "Could not load body for this request",
        "No resource with given identifier found",
        "Response body is unavailable for redirect responses"
    ];
    return isNoDataForResourceError(error)
        || knownFragments.some((fragment) => text.includes(fragment));
};
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Copies a header record into a new object whose values are all strings.
 *
 * Entries whose value is `undefined` are left out; every other value is
 * converted with `String(...)`.
 */
export const normalizeHeaders = (headers) => {
    const normalized = {};
    for (const headerName in headers) {
        const rawValue = headers[headerName];
        if (rawValue !== undefined) {
            normalized[headerName] = String(rawValue);
        }
    }
    return normalized;
};
|
|
11
|
+
/**
 * Looks up a header by name, ignoring case.
 *
 * Returns the value of the first key that matches, or `undefined` when no key
 * matches.
 */
export const getHeaderValue = (headers, name) => {
    const wanted = name.toLowerCase();
    for (const candidate in headers) {
        if (candidate.toLowerCase() !== wanted) {
            continue;
        }
        return headers[candidate];
    }
    return undefined;
};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Extracts a frame id from the frame a request belongs to.
 *
 * Candidates are tried in order: the `_id` field, the `_frameId` field, then
 * an `id()` method. Returns `undefined` when the request has no frame or no
 * candidate yields a value.
 */
export const getFrameId = (request) => {
    const owner = request.frame();
    if (!owner) {
        return undefined;
    }
    return owner._id
        || owner._frameId
        || (typeof owner.id === "function" ? owner.id() : undefined);
};
|
|
14
|
+
/**
 * Reads the initiator of a request and keeps only its `type` and `url`.
 *
 * Returns `undefined` when the request has no `initiator` method or the
 * method yields a falsy value.
 */
export const getInitiator = (request) => {
    const details = request.initiator?.();
    return details
        ? { type: details.type, url: details.url }
        : undefined;
};
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
export const runTriggerActions = async (page, actions = []) => {
|
|
2
|
+
if (actions.length === 0)
|
|
3
|
+
return;
|
|
4
|
+
await page.evaluate((actionsArg) => {
|
|
5
|
+
const hoverAll = () => {
|
|
6
|
+
const elements = Array.from(document.querySelectorAll("*"));
|
|
7
|
+
const trigger = (el) => {
|
|
8
|
+
const rect = el.getBoundingClientRect();
|
|
9
|
+
const x = rect.left + rect.width / 2;
|
|
10
|
+
const y = rect.top + rect.height / 2;
|
|
11
|
+
const event = new MouseEvent("mouseover", {
|
|
12
|
+
bubbles: true,
|
|
13
|
+
cancelable: true,
|
|
14
|
+
clientX: x,
|
|
15
|
+
clientY: y
|
|
16
|
+
});
|
|
17
|
+
el.dispatchEvent(event);
|
|
18
|
+
};
|
|
19
|
+
elements.forEach(trigger);
|
|
20
|
+
};
|
|
21
|
+
const scrollToEnd = () => {
|
|
22
|
+
const scrollHeight = document.documentElement?.scrollHeight ?? document.body?.scrollHeight;
|
|
23
|
+
if (typeof scrollHeight === "number") {
|
|
24
|
+
window.scrollTo({ top: scrollHeight, behavior: "instant" });
|
|
25
|
+
}
|
|
26
|
+
};
|
|
27
|
+
for (const action of actionsArg) {
|
|
28
|
+
if (action === "HOVER") {
|
|
29
|
+
hoverAll();
|
|
30
|
+
}
|
|
31
|
+
if (action === "SCROLL_TO_END") {
|
|
32
|
+
scrollToEnd();
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}, actions);
|
|
36
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@pagepocket/capture-http-puppeteer-unit",
|
|
3
|
+
"version": "0.8.0",
|
|
4
|
+
"description": "PagePocket plugin: capture HTTP events (puppeteer adapter)",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"files": [
|
|
9
|
+
"dist"
|
|
10
|
+
],
|
|
11
|
+
"license": "ISC",
|
|
12
|
+
"dependencies": {
|
|
13
|
+
"puppeteer": "^22.12.1",
|
|
14
|
+
"@pagepocket/lib": "0.8.0",
|
|
15
|
+
"@pagepocket/contracts": "0.8.0"
|
|
16
|
+
},
|
|
17
|
+
"devDependencies": {
|
|
18
|
+
"typescript": "^5.4.5"
|
|
19
|
+
},
|
|
20
|
+
"scripts": {
|
|
21
|
+
"build": "tsc -p tsconfig.json",
|
|
22
|
+
"test": "node -e \"process.exit(0)\""
|
|
23
|
+
}
|
|
24
|
+
}
|