@d-zero/beholder 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/LICENSE +21 -0
- package/README.md +4 -0
- package/dist/debug.d.ts +9 -0
- package/dist/debug.js +9 -0
- package/dist/dom-evaluation.d.ts +109 -0
- package/dist/dom-evaluation.js +273 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +16 -0
- package/dist/is-error.d.ts +8 -0
- package/dist/is-error.js +10 -0
- package/dist/keyword-check.d.ts +8 -0
- package/dist/keyword-check.js +17 -0
- package/dist/network-disconnection.d.ts +28 -0
- package/dist/network-disconnection.js +30 -0
- package/dist/parse-url.d.ts +14 -0
- package/dist/parse-url.js +23 -0
- package/dist/scraper.d.ts +41 -0
- package/dist/scraper.js +712 -0
- package/dist/types.d.ts +348 -0
- package/dist/types.js +7 -0
- package/package.json +5 -4
- package/src/network-disconnection.spec.ts +68 -0
- package/src/network-disconnection.ts +33 -0
- package/src/scraper.ts +72 -13
- package/src/types.ts +4 -2
- package/tsconfig.tsbuildinfo +1 -0
package/dist/scraper.js
ADDED
|
@@ -0,0 +1,712 @@
|
|
|
1
|
+
/**
 * Runs a list of initializer functions against `thisArg`.
 *
 * When a third argument is supplied (even `undefined`), each initializer
 * receives the value produced by the previous one and the final value is
 * returned. When it is omitted, the initializers are called without
 * arguments and the result is always `undefined`.
 *
 * @param thisArg - Object used as `this` for every initializer.
 * @param initializers - Functions to invoke in array order.
 * @param value - Optional seed value threaded through the initializers.
 * @returns The threaded value, or `undefined` when no seed was passed.
 */
var __runInitializers = (this && this.__runInitializers) || function (thisArg, initializers, value) {
	// `arguments.length` (not `value === undefined`) decides the mode, so an
	// explicitly passed `undefined` still counts as a seed value.
	const hasSeed = arguments.length > 2;
	if (!hasSeed) {
		for (let index = 0; index < initializers.length; index += 1) {
			initializers[index].call(thisArg);
		}
		return undefined;
	}
	let current = value;
	for (let index = 0; index < initializers.length; index += 1) {
		current = initializers[index].call(thisArg, current);
	}
	return current;
};
|
|
8
|
+
/**
 * Applies a list of decorators to one class element.
 *
 * Decorators run from last to first. Each one receives the current element
 * value (or a `{ get, set }` pair for accessors) and a fresh context object
 * copied from `contextIn`. Function results replace the element on the
 * descriptor; for fields (and accessor `init`) they are prepended to
 * `initializers` instead.
 *
 * @param ctor - The class; used to locate the element when no descriptor is given.
 * @param descriptorIn - Explicit descriptor to decorate, or a falsy value.
 * @param decorators - Decorator functions for the element.
 * @param contextIn - Template for the decorator context (`kind`, `name`, `static`, `access`, ...).
 * @param initializers - Receives field / accessor initializers (prepended).
 * @param extraInitializers - Receives functions registered through `context.addInitializer`.
 * @throws {TypeError} When a decorator returns a value of the wrong type, or
 *   `addInitializer` is called after decoration has completed.
 */
var __esDecorate = (this && this.__esDecorate) || function (ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {
	// Passes `undefined` and functions through; rejects everything else.
	const requireCallable = (candidate) => {
		if (candidate !== undefined && typeof candidate !== "function") {
			throw new TypeError("Function expected");
		}
		return candidate;
	};
	const kind = contextIn.kind;
	let slot = "value";
	if (kind === "getter") {
		slot = "get";
	}
	else if (kind === "setter") {
		slot = "set";
	}
	// Only resolve a definition target when no descriptor was handed in.
	let target = null;
	if (!descriptorIn && ctor) {
		target = contextIn["static"] ? ctor : ctor.prototype;
	}
	const descriptor = descriptorIn || (target ? Object.getOwnPropertyDescriptor(target, contextIn.name) : {});
	let finished = false;
	for (let position = decorators.length - 1; position >= 0; position -= 1) {
		// Each decorator gets its own context with its own copy of `access`.
		const context = {};
		for (const prop in contextIn) {
			context[prop] = prop === "access" ? {} : contextIn[prop];
		}
		for (const prop in contextIn.access) {
			context.access[prop] = contextIn.access[prop];
		}
		context.addInitializer = function (f) {
			if (finished) {
				throw new TypeError("Cannot add initializers after decoration has completed");
			}
			// `f || null` makes a missing initializer fail the callable check.
			extraInitializers.push(requireCallable(f || null));
		};
		const decorate = decorators[position];
		const input = kind === "accessor"
			? { get: descriptor.get, set: descriptor.set }
			: descriptor[slot];
		const result = decorate(input, context);
		if (kind === "accessor") {
			if (result === undefined) {
				continue;
			}
			if (result === null || typeof result !== "object") {
				throw new TypeError("Object expected");
			}
			const nextGet = requireCallable(result.get);
			if (nextGet) {
				descriptor.get = nextGet;
			}
			const nextSet = requireCallable(result.set);
			if (nextSet) {
				descriptor.set = nextSet;
			}
			const nextInit = requireCallable(result.init);
			if (nextInit) {
				initializers.unshift(nextInit);
			}
			continue;
		}
		const replacement = requireCallable(result);
		if (!replacement) {
			continue;
		}
		if (kind === "field") {
			initializers.unshift(replacement);
		}
		else {
			descriptor[slot] = replacement;
		}
	}
	if (target) {
		Object.defineProperty(target, contextIn.name, descriptor);
	}
	finished = true;
};
|
|
35
|
+
/**
 * Overwrites the `name` property of a function.
 *
 * Symbol names are rendered as `[description]` (or an empty string when the
 * symbol has no description). When `prefix` is truthy the final name is
 * `"<prefix> <name>"`.
 *
 * @param f - Function whose `name` is redefined.
 * @param name - New name (string or symbol).
 * @param prefix - Optional prefix such as `get` / `set`.
 * @returns The same function `f`.
 */
var __setFunctionName = (this && this.__setFunctionName) || function (f, name, prefix) {
	let label = name;
	if (typeof label === "symbol") {
		label = label.description ? `[${label.description}]` : "";
	}
	const finalName = prefix ? `${prefix} ${label}` : label;
	return Object.defineProperty(f, "name", { configurable: true, value: finalName });
};
|
|
39
|
+
import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
40
|
+
import { detectCDN } from '@d-zero/shared/detect-cdn';
|
|
41
|
+
import { detectCompress } from '@d-zero/shared/detect-compress';
|
|
42
|
+
import { retry as retryable } from '@d-zero/shared/retry';
|
|
43
|
+
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
44
|
+
import { resourceLog, scraperLog } from './debug.js';
|
|
45
|
+
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
46
|
+
import { isError } from './is-error.js';
|
|
47
|
+
import { keywordCheck } from './keyword-check.js';
|
|
48
|
+
import { findDisconnectionFailures } from './network-disconnection.js';
|
|
49
|
+
import { parseUrl } from './parse-url.js';
|
|
50
|
+
// The current process id as a string; used as the namespace suffix of both debug loggers.
const pid = String(process.pid);
// Logger for scraper-level messages, namespaced by process id.
const log = scraperLog.extend(pid);
// Logger for sub-resource (request/response) messages, namespaced by process id.
const rLog = resourceLog.extend(pid);
|
|
53
|
+
// Compiled form of the decorated `Scraper` class: the IIFE holds the decorator
// bookkeeping (`_private_*` variables) and returns the class itself.
let Scraper = (() => {
	let _classSuper = EventEmitter;
	let _instanceExtraInitializers = [];
	let _private_fetchData_decorators;
	let _private_fetchData_descriptor;
	let _private_fetchImages_decorators;
	let _private_fetchImages_descriptor;
	/**
	 * Builds the `onWait` / `onGiveUp` hooks shared by both `retryable` configurations.
	 *
	 * Both hooks emit a `changePhase` event on `this`.
	 * NOTE(review): `this` is supplied by the retry decorator — presumably the
	 * Scraper instance; confirm against `@d-zero/shared/retry`.
	 * @param retryLabelSuffix - Text appended after the retry number in the `retryWait` message.
	 * @returns An object with `onWait` and `onGiveUp` methods.
	 */
	const createRetryPhaseHooks = (retryLabelSuffix) => ({
		onWait(determinedInterval, retryCount, methodName, error) {
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'retryWait',
				url: null,
				isExternal: false,
				message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1}${retryLabelSuffix})`,
			});
		},
		onGiveUp(retryCount, error, methodName) {
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'retryExhausted',
				url: null,
				isExternal: false,
				message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
			});
		},
	});
	return class Scraper extends _classSuper {
		static {
			const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
			_private_fetchData_decorators = [retryable({
					timeout: 3 * 60 * 1000,
					...createRetryPhaseHooks(''),
				})];
			_private_fetchImages_decorators = [retryable({
					timeout: 5 * 60 * 1000,
					fallback: [],
					...createRetryPhaseHooks(' / images'),
				})];
			__esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, captureImages, imageLoadTimeout, resources, failedRequests, options) {
					const parseOpts = options?.disableQueries == null
						? undefined
						: { disableQueries: options.disableQueries };
					// Request metadata keyed by the raw request URL (see lookup in `onResponse`).
					const networkLogs = new Map();
					// Clear stale state from previous retries (@retryable may re-invoke this method
					// with the same page and mutable arrays, so we must reset to avoid accumulation)
					this.#cleanupPageListeners();
					failedRequests.length = 0;
					resources.length = 0;
					// Define named listeners so they can be individually removed on retry/cleanup
					const onDialog = async (dialog) => {
						log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
						try {
							await dialog.accept();
							// Only report acceptance when `accept()` actually succeeded.
							log(`Accept ${dialog.type()} dialog`);
						}
						catch (error) {
							log(`Error: ${error}`);
						}
					};
					page.on('dialog', onDialog);
					let onRequest = null;
					let onResponse = null;
					let onRequestFailed = null;
					// Sub-resource tracking is registered for internal pages only.
					if (!isExternal) {
						onRequest = (request) => {
							const requestUrl = request.url();
							networkLogs.set(requestUrl, {
								url: parseUrl(requestUrl, parseOpts),
								status: null,
								contentLength: 0,
								contentType: '',
								isError: false,
								request: {
									ts: Date.now(),
									headers: request.headers(),
									method: request.method(),
								},
							});
						};
						const uniqueRes = new Set();
						onResponse = (response) => {
							const responseUrl = response.url();
							const resURL = parseUrl(responseUrl, parseOpts);
							// Report each sub-resource once, and never the page itself.
							if (uniqueRes.has(resURL.withoutHash)) {
								return;
							}
							if (resURL.withoutHash === url.withoutHash) {
								return;
							}
							uniqueRes.add(resURL.withoutHash);
							const headers = response.headers();
							const status = response.status();
							const statusText = response.statusText();
							const contentType = headers['content-type']?.split(';')[0] || null;
							const contentLength = Number.parseInt(headers['content-length'] ?? '', 10) || null;
							// Look up by the raw URL, i.e. the same key `onRequest` stored under.
							// Looking up by the parsed `withoutHash` form misses whenever parsing
							// changes the URL, which silently drops the request metadata.
							const requestLog = networkLogs.get(responseUrl);
							const networkLog = {
								...requestLog,
								response: {
									ts: Date.now(),
									status,
									statusText,
									fromCache: response.fromCache(),
									headers,
								},
								status,
								isError: isError(status),
								contentType: contentType || '',
								contentLength: contentLength || 0,
							};
							const referredLink = {
								url: resURL,
								isExternal: resURL.hostname !== url.hostname,
								isError: networkLog.isError,
								status,
								statusText,
								contentType,
								contentLength,
								compress: detectCompress(headers),
								cdn: detectCDN(headers),
								headers: headers,
							};
							rLog('Fetched: %s', resURL.href);
							// Collect resource into the results array
							resources.push({ log: networkLog, resource: referredLink, pageUrl: url.withoutHash });
							// Also emit for streaming consumers
							void this.emit('resourceResponse', {
								pid: process.pid,
								url,
								log: networkLog,
								resource: referredLink,
							});
						};
						onRequestFailed = (request) => {
							const errorText = request.failure()?.errorText ?? 'Unknown error';
							rLog('Request failed: %s (%s)', request.url(), errorText);
							failedRequests.push({ url: request.url(), errorText });
						};
						page.on('request', onRequest);
						page.on('response', onResponse);
						page.on('requestfailed', onRequestFailed);
					}
					// Store cleanup function for retry/post-fetch removal
					this.#pageListenerCleanup = () => {
						page.off('dialog', onDialog);
						if (onRequest) {
							page.off('request', onRequest);
						}
						if (onResponse) {
							page.off('response', onResponse);
						}
						if (onRequestFailed) {
							page.off('requestfailed', onRequestFailed);
						}
					};
					const navigationTimeout = options?.navigationTimeout ?? 60_000;
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'openPage',
						url,
						isExternal,
						message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
					});
					// Credentials embedded in the URL are sent as a Basic auth header;
					// navigation itself uses the URL without them.
					if (url.username && url.password) {
						await page.setExtraHTTPHeaders({
							Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
						});
					}
					const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
					if (!res) {
						throw new Error('The method Page.goto returned null');
					}
					const destUrl = parseUrl(page.url(), parseOpts);
					const redirectPaths = new Set();
					if (url.withoutHash !== destUrl.withoutHash) {
						const redirectChain = res
							.request()
							.redirectChain()
							.map((req) => req.url());
						for (const redirectPath of redirectChain) {
							redirectPaths.add(redirectPath);
						}
						redirectPaths.add(destUrl.withoutHash);
					}
					// A redirect to another host turns the page into an external one.
					if (destUrl.hostname !== url.hostname) {
						isExternal = true;
					}
					const status = res.status();
					const statusText = res.statusText();
					const responseHeaders = res.headers();
					const contentType = responseHeaders['content-type']?.split(';')[0] || null;
					const _contentLength = Number.parseInt(responseHeaders['content-length'] ?? '', 10);
					const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
					// Non-HTML responses: return the response metadata without touching the DOM.
					if (contentType !== 'text/html') {
						return {
							url,
							isTarget: false,
							isExternal,
							redirectPaths: [...redirectPaths],
							status,
							statusText,
							contentType,
							contentLength,
							responseHeaders,
							meta: {
								title: '',
							},
							imageList: [],
							anchorList: [],
							html: '',
							isSkipped: false,
						};
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'loadDOMContent',
						url,
						isExternal,
						message: '',
					});
					// Best-effort wait: a timeout or failure here is deliberately ignored.
					await page
						.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 })
						.catch(() => { });
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'getHTML',
						url,
						isExternal,
						message: '',
					});
					const { title, html } = await page.evaluate(() => {
						/* global document */
						return {
							title: document.title,
							html: document.documentElement.outerHTML,
						};
					});
					// External pages: title and HTML only, no anchors/meta/images.
					if (isExternal) {
						return {
							url,
							isTarget: false,
							isExternal,
							redirectPaths: [...redirectPaths],
							status,
							statusText,
							contentType,
							contentLength,
							responseHeaders,
							meta: {
								title,
							},
							imageList: [],
							anchorList: [],
							html,
							isSkipped: false,
						};
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'waitNetworkIdle',
						url,
						isExternal,
						message: '',
					});
					// Best-effort wait: a timeout or failure here is deliberately ignored.
					await page
						.waitForNavigation({ waitUntil: 'networkidle0', timeout: 5000 })
						.catch(() => { });
					// Check for network disconnection errors in failed requests
					const disconnectionFailures = findDisconnectionFailures(failedRequests);
					if (disconnectionFailures.length > 0) {
						const errorSummary = disconnectionFailures
							.map((r) => `${r.url} (${r.errorText})`)
							.join(', ');
						throw new Error(`Network disconnection detected during page load: ${errorSummary}`);
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'getAnchors',
						url,
						isExternal,
						message: '',
					});
					const anchorList = await getAnchorList(page, parseOpts);
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'getMeta',
						url,
						isExternal,
						message: '',
					});
					const meta = await getMeta(page);
					let imageList = [];
					if (captureImages) {
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'extractImages',
							url,
							isExternal,
							message: '',
						});
						imageList = await this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
					}
					return {
						url,
						isTarget: true,
						isExternal,
						redirectPaths: [...redirectPaths],
						status,
						statusText,
						contentType,
						contentLength,
						responseHeaders,
						meta,
						anchorList,
						imageList,
						html,
						isSkipped: false,
					};
				}, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
			__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
					const listener = this.#createPageScanListener(isExternal);
					const devices = [
						{ key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
						{ key: 'mobile-small', preset: devicePresets['mobile-small'] },
					];
					const imageList = [];
					// Presets are processed one after another because they share the same page.
					for (const { key, preset } of devices) {
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'setViewport',
							url: null,
							isExternal,
							message: `📷 ${key} ↔️ ${preset.width}px`,
						});
						await beforePageScan(page, url, {
							name: key,
							width: preset.width,
							resolution: preset.resolution,
							listener,
							timeout: 5000,
						});
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'waitImageLoad',
							url: null,
							isExternal,
							message: `📷 ${key}: Waiting for images%dots%`,
						});
						// Best-effort wait: if images do not finish within the timeout,
						// extraction proceeds with whatever has loaded.
						await page
							.waitForFunction(() => [...document.images].every((img) => img.complete), {
							timeout: imageLoadTimeout,
						})
							.catch(() => { });
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'getImages',
							url: null,
							isExternal,
							message: `📸 ${key}: Extracting images%dots%`,
						});
						const images = await getImageList(page, preset.width);
						imageList.push(...images);
					}
					return imageList;
				}, "#fetchImages") }, _private_fetchImages_decorators, { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: obj => #fetchImages in obj, get: obj => obj.#fetchImages }, metadata: _metadata }, null, _instanceExtraInitializers);
			if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
		}
		/** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
		retries = __runInitializers(this, _instanceExtraInitializers);
		/** Cleanup function to remove page listeners registered by `#fetchData`. */
		#pageListenerCleanup = null;
		/**
		 * Begins the scraping process for a given URL on the provided Puppeteer page.
		 *
		 * Returns a `ScrapeResult` containing the outcome:
		 * - `type: "success"` with `pageData` on success
		 * - `type: "skipped"` with `ignored` details when the page is excluded
		 * - `type: "error"` with `error` details when scraping fails
		 *
		 * Sub-resources captured while the page loads are returned in
		 * `ScrapeResult.resources` and are also emitted as `resourceResponse` events.
		 * @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
		 * @param url - The extended URL to scrape.
		 * @param options - Optional scraper configuration overriding defaults.
		 * @param isSkip - When `true`, the page is immediately skipped without any network requests.
		 * @returns The scrape result containing the outcome and captured resources.
		 */
		async scrapeStart(page, url, options, isSkip = false) {
			this.retries = options?.retries;
			const isExternal = options?.isExternal ?? false;
			const captureImages = options?.captureImages ?? true;
			const excludeKeywords = options?.excludeKeywords ?? [];
			const metadataOnly = options?.metadataOnly ?? false;
			const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
			const resources = [];
			const failedRequests = [];
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'scrapeStart',
				url,
				isExternal,
				message: '',
			});
			// Path-excluded: return SkippedPageData
			if (isSkip) {
				void this.emit('changePhase', {
					pid: process.pid,
					name: 'pageSkipped',
					url,
					isExternal,
					message: 'Matched: excluded path',
				});
				return {
					type: 'skipped',
					resources,
					ignored: {
						url,
						matchedText: url.pathname || '',
						excludeKeywords,
					},
				};
			}
			// Non-HTTP protocol: return minimal PageData
			if (!url.isHTTP) {
				const result = {
					url,
					isTarget: false,
					isExternal,
					redirectPaths: [],
					status: -1,
					statusText: '__THIS_IS_NOT_HTTP_PROTOCOL__',
					contentType: null,
					contentLength: null,
					responseHeaders: {},
					meta: {
						title: '',
					},
					imageList: [],
					anchorList: [],
					html: '',
					isSkipped: false,
				};
				void this.emit('changePhase', {
					pid: process.pid,
					name: 'scrapeEnd',
					url,
					isExternal,
					message: '',
				});
				return { type: 'success', pageData: result, resources };
			}
			let headResult = options?.headCheckResult ?? null;
			// Metadata-only mode: a supplied HEAD result is returned as-is, without opening the page.
			if (headResult && metadataOnly) {
				void this.emit('changePhase', {
					pid: process.pid,
					name: 'scrapeEnd',
					url,
					isExternal,
					message: '',
				});
				return {
					type: 'success',
					pageData: {
						...headResult,
						isTarget: false,
					},
					resources,
				};
			}
			// Open the page when there is no HEAD result, or the HEAD result says it is HTML.
			if (headResult === null || headResult.contentType === 'text/html') {
				const fetchResult = await this.#fetchData(page, url, isExternal, captureImages, imageLoadTimeout, resources, failedRequests, options).catch((error) => {
					if (error instanceof Error) {
						return error;
					}
					// `String()` also handles values (e.g. symbols) that `new Error(value)` cannot convert.
					return new Error(String(error), { cause: error });
				});
				// Listeners are removed on both the success and the failure path.
				this.#cleanupPageListeners();
				if (fetchResult instanceof Error) {
					log('Error(FETCH_DATA): %s', url.href);
					return {
						type: 'error',
						resources,
						failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
						error: {
							name: fetchResult.name,
							message: fetchResult.message,
							stack: fetchResult.stack,
							shutdown: true,
						},
					};
				}
				headResult = fetchResult;
				if (!headResult.isSkipped) {
					const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
					if (checkedKeyword) {
						headResult = {
							url,
							isSkipped: true,
							matched: {
								type: 'keyword',
								text: checkedKeyword,
								excludeKeywords,
							},
						};
					}
				}
				if (headResult.isSkipped) {
					if (headResult.matched.type === 'path') {
						return {
							type: 'skipped',
							resources,
							ignored: {
								url,
								matchedText: url.pathname || '',
								excludeKeywords,
							},
						};
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'pageSkipped',
						url,
						isExternal,
						message: `Matched: "${headResult.matched.text}"`,
					});
					return {
						type: 'skipped',
						resources,
						ignored: {
							url,
							matchedText: headResult.matched.text,
							excludeKeywords,
						},
					};
				}
			}
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'scrapeEnd',
				url,
				isExternal,
				message: '',
			});
			return {
				type: 'success',
				pageData: headResult,
				resources,
				failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
			};
		}
		/**
		 * Removes the page listeners registered by the last `#fetchData` call, if any.
		 * Safe to call repeatedly: the stored cleanup function is cleared after use.
		 */
		#cleanupPageListeners() {
			if (this.#pageListenerCleanup) {
				this.#pageListenerCleanup();
				this.#pageListenerCleanup = null;
			}
		}
		/**
		 * Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
		 *
		 * WHY a separate factory: The listener must capture `isExternal` for phase events
		 * while conforming to the `beforePageScan` listener signature.
		 * Currently only handles the `scroll` phase to report scroll progress.
		 * @param isExternal - Whether the current page is external to the crawl scope
		 * @returns A listener function compatible with `beforePageScan`'s `listener` option
		 */
		#createPageScanListener(isExternal) {
			return (phase, data) => {
				if (phase !== 'scroll') {
					return;
				}
				const { scrollY, scrollHeight, message } = data;
				let scrollMsg;
				if (Number.isNaN(scrollHeight)) {
					scrollMsg = `%propeller% ${message}`;
				}
				else {
					// Guard the division so a zero height reports 0% instead of NaN%/Infinity%.
					const percent = scrollHeight > 0 ? Math.round((scrollY / scrollHeight) * 100) : 0;
					scrollMsg = `%propeller% ${scrollY}px/${scrollHeight}px (${percent}%) ${message}`;
				}
				void this.emit('changePhase', {
					pid: process.pid,
					name: 'scrollToBottom',
					url: null,
					isExternal,
					message: scrollMsg,
				});
			};
		}
		/**
		 * Navigates the page to the target URL and extracts full page data.
		 *
		 * WHY retryable with 3-min timeout: Page navigation can fail due to transient
		 * network issues or slow-loading pages. The decorator retries automatically,
		 * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
		 *
		 * Flow:
		 * 1. Register request/response/requestfailed listeners to capture sub-resources (internal pages only)
		 * 2. Navigate to URL via `page.goto()` and track redirect chain
		 * 3. Return early with minimal data for non-HTML responses and external pages
		 * 4. Wait for DOM content and network idle
		 * 5. Check for network disconnection errors and throw to trigger retry
		 * 6. Extract anchors, meta, and optionally images
		 *
		 * Keyword exclusion is not performed here; `scrapeStart` applies it to the returned HTML.
		 * @param page - Puppeteer page instance
		 * @param url - Target URL to navigate to
		 * @param isExternal - Whether the URL is external to the crawl scope
		 * @param captureImages - Whether to run the image extraction pipeline
		 * @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
		 * @param resources - Mutable array to collect captured sub-resources into
		 * @param failedRequests - Mutable array to collect failed sub-resource requests into
		 * @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
		 * @returns Page data with `isSkipped: false`
		 */
		get #fetchData() { return _private_fetchData_descriptor.value; }
		/**
		 * Extracts image data from the page across multiple device presets.
		 *
		 * WHY multiple device presets: Images may differ between desktop and mobile
		 * due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
		 * and `mobile-small` viewports reveals responsive image issues.
		 *
		 * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
		 * best-effort. If all retries fail, an empty array is returned rather than
		 * failing the entire page scrape.
		 * @param page - Puppeteer page instance
		 * @param url - The page URL string (without hash and auth)
		 * @param isExternal - Whether the page is external
		 * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
		 * @returns Array of image elements from all device presets
		 */
		get #fetchImages() { return _private_fetchImages_descriptor.value; }
	};
})();
|
|
695
|
+
/**
|
|
696
|
+
* Page-level scraper that extracts data from a single browser page.
|
|
697
|
+
*
|
|
698
|
+
* The scraper returns results as values from `scrapeStart()` rather than
|
|
699
|
+
* emitting them as events. Only streaming events (changePhase, resourceResponse)
|
|
700
|
+
* are emitted for progress monitoring.
|
|
701
|
+
*
|
|
702
|
+
* The Puppeteer `Page` object is injected externally, and page lifecycle
|
|
703
|
+
* (including `page.close()`) is managed by the caller.
|
|
704
|
+
* @example
|
|
705
|
+
* ```ts
|
|
706
|
+
* const scraper = new Scraper();
|
|
707
|
+
* scraper.on('changePhase', (e) => console.log(e.name));
|
|
708
|
+
* const result = await scraper.scrapeStart(page, url, { isExternal: false });
|
|
709
|
+
* ```
|
|
710
|
+
*/
|
|
711
|
+
// eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
|
|
712
|
+
export default Scraper;
|