@d-zero/beholder 0.1.29 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/README.md +172 -477
- package/dist/debug.d.ts +4 -1
- package/dist/debug.js +5 -2
- package/dist/dom-evaluation.d.ts +72 -14
- package/dist/dom-evaluation.js +169 -43
- package/dist/index.d.ts +20 -3
- package/dist/index.js +15 -3
- package/dist/is-error.d.ts +8 -0
- package/dist/is-error.js +10 -0
- package/dist/keyword-check.d.ts +5 -3
- package/dist/keyword-check.js +5 -3
- package/dist/parse-url.d.ts +14 -0
- package/dist/parse-url.js +23 -0
- package/dist/scraper.d.ts +39 -13
- package/dist/scraper.js +300 -263
- package/dist/types.d.ts +286 -214
- package/dist/types.js +6 -0
- package/package.json +7 -10
- package/src/debug.ts +5 -2
- package/src/dom-evaluation.ts +195 -65
- package/src/index.ts +27 -3
- package/src/is-error.spec.ts +33 -0
- package/src/is-error.ts +10 -0
- package/src/keyword-check.spec.ts +45 -4
- package/src/keyword-check.ts +5 -3
- package/src/parse-url.spec.ts +35 -0
- package/src/parse-url.ts +26 -0
- package/src/scraper.ts +338 -300
- package/src/types.ts +345 -258
- package/tsconfig.tsbuildinfo +1 -1
- package/dist/events.d.ts +0 -32
- package/dist/events.js +0 -15
- package/dist/fetch-destination.d.ts +0 -8
- package/dist/fetch-destination.js +0 -145
- package/dist/net-timeout-error.d.ts +0 -3
- package/dist/net-timeout-error.js +0 -3
- package/dist/sub-process-runner.d.ts +0 -12
- package/dist/sub-process-runner.js +0 -180
- package/dist/sub-process.d.ts +0 -1
- package/dist/sub-process.js +0 -67
- package/dist/utils.d.ts +0 -16
- package/dist/utils.js +0 -69
- package/src/events.ts +0 -21
- package/src/fetch-destination.ts +0 -173
- package/src/net-timeout-error.ts +0 -3
- package/src/sub-process-runner.ts +0 -220
- package/src/sub-process.ts +0 -86
- package/src/utils.ts +0 -89
package/dist/scraper.js
CHANGED
|
@@ -36,106 +36,76 @@ var __setFunctionName = (this && this.__setFunctionName) || function (f, name, p
|
|
|
36
36
|
if (typeof name === "symbol") name = name.description ? "[".concat(name.description, "]") : "";
|
|
37
37
|
return Object.defineProperty(f, "name", { configurable: true, value: prefix ? "".concat(prefix, " ", name) : name });
|
|
38
38
|
};
|
|
39
|
-
import { beforePageScan } from '@d-zero/puppeteer-page-scan';
|
|
40
|
-
import {
|
|
41
|
-
import {
|
|
42
|
-
import {
|
|
43
|
-
import {
|
|
39
|
+
import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
40
|
+
import { detectCDN } from '@d-zero/shared/detect-cdn';
|
|
41
|
+
import { detectCompress } from '@d-zero/shared/detect-compress';
|
|
42
|
+
import { retry as retryable } from '@d-zero/shared/retry';
|
|
43
|
+
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
44
44
|
import { resourceLog, scraperLog } from './debug.js';
|
|
45
45
|
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
46
|
-
import {
|
|
46
|
+
import { isError } from './is-error.js';
|
|
47
47
|
import { keywordCheck } from './keyword-check.js';
|
|
48
|
-
import {
|
|
48
|
+
import { parseUrl } from './parse-url.js';
|
|
49
49
|
const pid = `${process.pid}`;
|
|
50
50
|
const log = scraperLog.extend(pid);
|
|
51
51
|
const rLog = resourceLog.extend(pid);
|
|
52
|
-
const LAUNCH_BROWSER_TIMEOUT = 1000 * 30;
|
|
53
52
|
let Scraper = (() => {
|
|
54
|
-
let _classSuper =
|
|
53
|
+
let _classSuper = EventEmitter;
|
|
55
54
|
let _instanceExtraInitializers = [];
|
|
56
|
-
let _private_bootBrowser_decorators;
|
|
57
|
-
let _private_bootBrowser_descriptor;
|
|
58
|
-
let _private_createPage_decorators;
|
|
59
|
-
let _private_createPage_descriptor;
|
|
60
55
|
let _private_fetchData_decorators;
|
|
61
56
|
let _private_fetchData_descriptor;
|
|
62
|
-
let _private_fetchHead_decorators;
|
|
63
|
-
let _private_fetchHead_descriptor;
|
|
64
57
|
let _private_fetchImages_decorators;
|
|
65
58
|
let _private_fetchImages_descriptor;
|
|
66
59
|
return class Scraper extends _classSuper {
|
|
67
60
|
static {
|
|
68
61
|
const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
62
|
+
_private_fetchData_decorators = [retryable({
|
|
63
|
+
timeout: 3 * 60 * 1000,
|
|
64
|
+
onWait(determinedInterval, retryCount, methodName, error) {
|
|
65
|
+
void this.emit('changePhase', {
|
|
66
|
+
pid: process.pid,
|
|
67
|
+
name: 'retryWait',
|
|
68
|
+
url: null,
|
|
69
|
+
isExternal: false,
|
|
70
|
+
message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1})`,
|
|
71
|
+
});
|
|
72
|
+
},
|
|
73
|
+
onGiveUp(retryCount, error, methodName) {
|
|
74
|
+
void this.emit('changePhase', {
|
|
75
|
+
pid: process.pid,
|
|
76
|
+
name: 'retryExhausted',
|
|
77
|
+
url: null,
|
|
78
|
+
isExternal: false,
|
|
79
|
+
message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
|
|
80
|
+
});
|
|
81
|
+
},
|
|
73
82
|
})];
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
timeout: 5 * 60 * 1000, // 5 min
|
|
83
|
+
_private_fetchImages_decorators = [retryable({
|
|
84
|
+
timeout: 5 * 60 * 1000,
|
|
77
85
|
fallback: [],
|
|
78
|
-
|
|
79
|
-
__esDecorate(this, _private_bootBrowser_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
|
|
80
|
-
if (!this.#browser) {
|
|
86
|
+
onWait(determinedInterval, retryCount, methodName, error) {
|
|
81
87
|
void this.emit('changePhase', {
|
|
82
88
|
pid: process.pid,
|
|
83
|
-
name: '
|
|
84
|
-
url:
|
|
85
|
-
isExternal,
|
|
86
|
-
message:
|
|
89
|
+
name: 'retryWait',
|
|
90
|
+
url: null,
|
|
91
|
+
isExternal: false,
|
|
92
|
+
message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1} / images)`,
|
|
87
93
|
});
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
'--ignore-certificate-errors',
|
|
97
|
-
],
|
|
98
|
-
}).catch((error) => {
|
|
99
|
-
if (error instanceof Error) {
|
|
100
|
-
return error;
|
|
101
|
-
}
|
|
102
|
-
throw error;
|
|
94
|
+
},
|
|
95
|
+
onGiveUp(retryCount, error, methodName) {
|
|
96
|
+
void this.emit('changePhase', {
|
|
97
|
+
pid: process.pid,
|
|
98
|
+
name: 'retryExhausted',
|
|
99
|
+
url: null,
|
|
100
|
+
isExternal: false,
|
|
101
|
+
message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
|
|
103
102
|
});
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
});
|
|
111
|
-
throw browser;
|
|
112
|
-
}
|
|
113
|
-
this.#browser = browser;
|
|
114
|
-
}
|
|
115
|
-
else if (!this.#browser.isConnected()) {
|
|
116
|
-
await this.#browser.close();
|
|
117
|
-
}
|
|
118
|
-
return this.#browser;
|
|
119
|
-
}, "#bootBrowser") }, _private_bootBrowser_decorators, { kind: "method", name: "#bootBrowser", static: false, private: true, access: { has: obj => #bootBrowser in obj, get: obj => obj.#bootBrowser }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
120
|
-
__esDecorate(this, _private_createPage_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
|
|
121
|
-
const browser = await this.#bootBrowser(isExternal, executablePath, headless);
|
|
122
|
-
void this.emit('changePhase', {
|
|
123
|
-
pid: process.pid,
|
|
124
|
-
name: 'newPage',
|
|
125
|
-
url: this.#url,
|
|
126
|
-
isExternal,
|
|
127
|
-
message: '',
|
|
128
|
-
});
|
|
129
|
-
const page = await browser.newPage();
|
|
130
|
-
page.setDefaultNavigationTimeout(0);
|
|
131
|
-
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36');
|
|
132
|
-
await page.setExtraHTTPHeaders({
|
|
133
|
-
// TODO: Optional lang
|
|
134
|
-
'Accept-Language': 'ja-JP',
|
|
135
|
-
});
|
|
136
|
-
return page;
|
|
137
|
-
}, "#createPage") }, _private_createPage_decorators, { kind: "method", name: "#createPage", static: false, private: true, access: { has: obj => #createPage in obj, get: obj => obj.#createPage }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
138
|
-
__esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, isGettingImages, options) {
|
|
103
|
+
},
|
|
104
|
+
})];
|
|
105
|
+
__esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, captureImages, imageLoadTimeout, resources, options) {
|
|
106
|
+
const parseOpts = options?.disableQueries == null
|
|
107
|
+
? undefined
|
|
108
|
+
: { disableQueries: options.disableQueries };
|
|
139
109
|
const networkLogs = {};
|
|
140
110
|
page.on('dialog', async (dialog) => {
|
|
141
111
|
log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
|
|
@@ -149,7 +119,7 @@ let Scraper = (() => {
|
|
|
149
119
|
});
|
|
150
120
|
if (!isExternal) {
|
|
151
121
|
page.on('request', (request) => {
|
|
152
|
-
const url = parseUrl(request.url(),
|
|
122
|
+
const url = parseUrl(request.url(), parseOpts);
|
|
153
123
|
networkLogs[request.url()] = {
|
|
154
124
|
url,
|
|
155
125
|
status: null,
|
|
@@ -165,7 +135,7 @@ let Scraper = (() => {
|
|
|
165
135
|
});
|
|
166
136
|
const uniqueRes = new Set();
|
|
167
137
|
page.on('response', (response) => {
|
|
168
|
-
const resURL = parseUrl(response.url(),
|
|
138
|
+
const resURL = parseUrl(response.url(), parseOpts);
|
|
169
139
|
if (uniqueRes.has(resURL.withoutHash)) {
|
|
170
140
|
return;
|
|
171
141
|
}
|
|
@@ -206,6 +176,9 @@ let Scraper = (() => {
|
|
|
206
176
|
headers: headers,
|
|
207
177
|
};
|
|
208
178
|
rLog('Fetched: %s', resURL.href);
|
|
179
|
+
// Collect resource into the results array
|
|
180
|
+
resources.push({ log, resource: referredLink, pageUrl: url.withoutHash });
|
|
181
|
+
// Also emit for streaming consumers
|
|
209
182
|
void this.emit('resourceResponse', {
|
|
210
183
|
pid: process.pid,
|
|
211
184
|
url,
|
|
@@ -214,29 +187,34 @@ let Scraper = (() => {
|
|
|
214
187
|
});
|
|
215
188
|
});
|
|
216
189
|
}
|
|
190
|
+
const navigationTimeout = options?.navigationTimeout ?? 60_000;
|
|
217
191
|
void this.emit('changePhase', {
|
|
218
192
|
pid: process.pid,
|
|
219
193
|
name: 'openPage',
|
|
220
|
-
url
|
|
194
|
+
url,
|
|
221
195
|
isExternal,
|
|
222
|
-
message:
|
|
196
|
+
message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
|
|
223
197
|
});
|
|
224
198
|
if (url.username && url.password) {
|
|
225
199
|
await page.setExtraHTTPHeaders({
|
|
226
200
|
Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
|
|
227
201
|
});
|
|
228
202
|
}
|
|
229
|
-
const res = await page.goto(url.withoutHashAndAuth);
|
|
203
|
+
const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
|
|
230
204
|
if (!res) {
|
|
231
205
|
throw new Error('The method Page.goto returned null');
|
|
232
206
|
}
|
|
233
|
-
const destUrl = parseUrl(page.url(),
|
|
234
|
-
const redirectPaths =
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
207
|
+
const destUrl = parseUrl(page.url(), parseOpts);
|
|
208
|
+
const redirectPaths = new Set();
|
|
209
|
+
if (url.withoutHash !== destUrl.withoutHash) {
|
|
210
|
+
const redirectChain = res
|
|
211
|
+
.request()
|
|
212
|
+
.redirectChain()
|
|
213
|
+
.map((req) => req.url());
|
|
214
|
+
for (const redirectPath of redirectChain) {
|
|
215
|
+
redirectPaths.add(redirectPath);
|
|
216
|
+
}
|
|
217
|
+
redirectPaths.add(destUrl.withoutHash);
|
|
240
218
|
}
|
|
241
219
|
if (destUrl.hostname !== url.hostname) {
|
|
242
220
|
isExternal = true;
|
|
@@ -252,7 +230,7 @@ let Scraper = (() => {
|
|
|
252
230
|
url,
|
|
253
231
|
isTarget: false,
|
|
254
232
|
isExternal,
|
|
255
|
-
redirectPaths,
|
|
233
|
+
redirectPaths: [...redirectPaths],
|
|
256
234
|
status,
|
|
257
235
|
statusText,
|
|
258
236
|
contentType,
|
|
@@ -270,7 +248,7 @@ let Scraper = (() => {
|
|
|
270
248
|
void this.emit('changePhase', {
|
|
271
249
|
pid: process.pid,
|
|
272
250
|
name: 'loadDOMContent',
|
|
273
|
-
url
|
|
251
|
+
url,
|
|
274
252
|
isExternal,
|
|
275
253
|
message: '',
|
|
276
254
|
});
|
|
@@ -280,7 +258,7 @@ let Scraper = (() => {
|
|
|
280
258
|
void this.emit('changePhase', {
|
|
281
259
|
pid: process.pid,
|
|
282
260
|
name: 'getHTML',
|
|
283
|
-
url
|
|
261
|
+
url,
|
|
284
262
|
isExternal,
|
|
285
263
|
message: '',
|
|
286
264
|
});
|
|
@@ -296,7 +274,7 @@ let Scraper = (() => {
|
|
|
296
274
|
url,
|
|
297
275
|
isTarget: false,
|
|
298
276
|
isExternal,
|
|
299
|
-
redirectPaths,
|
|
277
|
+
redirectPaths: [...redirectPaths],
|
|
300
278
|
status,
|
|
301
279
|
statusText,
|
|
302
280
|
contentType,
|
|
@@ -313,8 +291,8 @@ let Scraper = (() => {
|
|
|
313
291
|
}
|
|
314
292
|
void this.emit('changePhase', {
|
|
315
293
|
pid: process.pid,
|
|
316
|
-
name: '
|
|
317
|
-
url
|
|
294
|
+
name: 'waitNetworkIdle',
|
|
295
|
+
url,
|
|
318
296
|
isExternal,
|
|
319
297
|
message: '',
|
|
320
298
|
});
|
|
@@ -324,25 +302,36 @@ let Scraper = (() => {
|
|
|
324
302
|
void this.emit('changePhase', {
|
|
325
303
|
pid: process.pid,
|
|
326
304
|
name: 'getAnchors',
|
|
327
|
-
url
|
|
305
|
+
url,
|
|
328
306
|
isExternal,
|
|
329
307
|
message: '',
|
|
330
308
|
});
|
|
331
|
-
const anchorList = await getAnchorList(page,
|
|
309
|
+
const anchorList = await getAnchorList(page, parseOpts);
|
|
332
310
|
void this.emit('changePhase', {
|
|
333
311
|
pid: process.pid,
|
|
334
312
|
name: 'getMeta',
|
|
335
|
-
url
|
|
313
|
+
url,
|
|
336
314
|
isExternal,
|
|
337
315
|
message: '',
|
|
338
316
|
});
|
|
339
317
|
const meta = await getMeta(page);
|
|
340
|
-
const imageList =
|
|
318
|
+
const imageList = captureImages
|
|
319
|
+
? await (async () => {
|
|
320
|
+
void this.emit('changePhase', {
|
|
321
|
+
pid: process.pid,
|
|
322
|
+
name: 'extractImages',
|
|
323
|
+
url,
|
|
324
|
+
isExternal,
|
|
325
|
+
message: '',
|
|
326
|
+
});
|
|
327
|
+
return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
|
|
328
|
+
})()
|
|
329
|
+
: [];
|
|
341
330
|
return {
|
|
342
331
|
url,
|
|
343
332
|
isTarget: true,
|
|
344
333
|
isExternal,
|
|
345
|
-
redirectPaths,
|
|
334
|
+
redirectPaths: [...redirectPaths],
|
|
346
335
|
status,
|
|
347
336
|
statusText,
|
|
348
337
|
contentType,
|
|
@@ -355,126 +344,110 @@ let Scraper = (() => {
|
|
|
355
344
|
isSkipped: false,
|
|
356
345
|
};
|
|
357
346
|
}, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
358
|
-
__esDecorate(this,
|
|
359
|
-
|
|
360
|
-
}, "#fetchHead") }, _private_fetchHead_decorators, { kind: "method", name: "#fetchHead", static: false, private: true, access: { has: obj => #fetchHead in obj, get: obj => obj.#fetchHead }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
361
|
-
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, isExternal) {
|
|
362
|
-
const url = this.#url.withoutHashAndAuth;
|
|
363
|
-
const imageList = [];
|
|
347
|
+
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
|
|
348
|
+
const listener = this.#createPageScanListener(isExternal);
|
|
364
349
|
const devices = [
|
|
365
|
-
{
|
|
366
|
-
{
|
|
350
|
+
{ key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
|
|
351
|
+
{ key: 'mobile-small', preset: devicePresets['mobile-small'] },
|
|
367
352
|
];
|
|
368
|
-
|
|
353
|
+
const imageList = [];
|
|
354
|
+
for (const { key, preset } of devices) {
|
|
369
355
|
void this.emit('changePhase', {
|
|
370
356
|
pid: process.pid,
|
|
371
357
|
name: 'setViewport',
|
|
372
|
-
url:
|
|
358
|
+
url: null,
|
|
373
359
|
isExternal,
|
|
374
|
-
message:
|
|
360
|
+
message: `📷 ${key} ↔️ ${preset.width}px`,
|
|
375
361
|
});
|
|
376
362
|
await beforePageScan(page, url, {
|
|
377
|
-
name:
|
|
378
|
-
width:
|
|
379
|
-
resolution:
|
|
363
|
+
name: key,
|
|
364
|
+
width: preset.width,
|
|
365
|
+
resolution: preset.resolution,
|
|
366
|
+
listener,
|
|
380
367
|
timeout: 5000,
|
|
381
368
|
});
|
|
369
|
+
void this.emit('changePhase', {
|
|
370
|
+
pid: process.pid,
|
|
371
|
+
name: 'waitImageLoad',
|
|
372
|
+
url: null,
|
|
373
|
+
isExternal,
|
|
374
|
+
message: `📷 ${key}: Waiting for images%dots%`,
|
|
375
|
+
});
|
|
376
|
+
await page
|
|
377
|
+
.waitForFunction(() => [...document.images].every((img) => img.complete), {
|
|
378
|
+
timeout: imageLoadTimeout,
|
|
379
|
+
})
|
|
380
|
+
.catch(() => { });
|
|
382
381
|
void this.emit('changePhase', {
|
|
383
382
|
pid: process.pid,
|
|
384
383
|
name: 'getImages',
|
|
385
|
-
url:
|
|
384
|
+
url: null,
|
|
386
385
|
isExternal,
|
|
387
|
-
message:
|
|
386
|
+
message: `📸 ${key}: Extracting images%dots%`,
|
|
388
387
|
});
|
|
389
|
-
const images = await getImageList(page,
|
|
388
|
+
const images = await getImageList(page, preset.width);
|
|
390
389
|
imageList.push(...images);
|
|
391
390
|
}
|
|
392
391
|
return imageList;
|
|
393
392
|
}, "#fetchImages") }, _private_fetchImages_decorators, { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: obj => #fetchImages in obj, get: obj => obj.#fetchImages }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
394
393
|
if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
|
|
395
394
|
}
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
while (!this.#browser.isConnected()) {
|
|
417
|
-
log('Browser closes all pages');
|
|
418
|
-
const pages = await this.#browser.pages();
|
|
419
|
-
for (const page of pages) {
|
|
420
|
-
page.removeAllListeners();
|
|
421
|
-
if (!page.isClosed) {
|
|
422
|
-
await page.close();
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
log('Browser closes self');
|
|
426
|
-
await this.#browser.close();
|
|
427
|
-
log('Browser disconnects');
|
|
428
|
-
await this.#browser.disconnect();
|
|
429
|
-
}
|
|
430
|
-
log('Scraper discards browser');
|
|
431
|
-
this.#browser = null;
|
|
432
|
-
void this.emit('destroyed', {
|
|
433
|
-
pid: process.pid,
|
|
434
|
-
});
|
|
435
|
-
void this.emit('changePhase', {
|
|
436
|
-
pid: process.pid,
|
|
437
|
-
name: 'destroyed',
|
|
438
|
-
url: this.#url,
|
|
439
|
-
isExternal,
|
|
440
|
-
message: '',
|
|
441
|
-
});
|
|
442
|
-
}
|
|
443
|
-
async scrapeStart(url, options, isSkip = false) {
|
|
395
|
+
/** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
|
|
396
|
+
retries = __runInitializers(this, _instanceExtraInitializers);
|
|
397
|
+
/**
|
|
398
|
+
* Begins the scraping process for a given URL on the provided Puppeteer page.
|
|
399
|
+
*
|
|
400
|
+
* Returns a `ScrapeResult` containing the outcome:
|
|
401
|
+
* - `type: "success"` with `pageData` on success
|
|
402
|
+
* - `type: "skipped"` with `ignored` details when the page is excluded
|
|
403
|
+
* - `type: "error"` with `error` details when scraping fails
|
|
404
|
+
*
|
|
405
|
+
* Sub-resources captured from network responses (each also emitted as a `resourceResponse` event) are
|
|
406
|
+
* included in the returned `ScrapeResult.resources`.
|
|
407
|
+
* @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
|
|
408
|
+
* @param url - The extended URL to scrape.
|
|
409
|
+
* @param options - Optional scraper configuration overriding defaults.
|
|
410
|
+
* @param isSkip - When `true`, the page is immediately skipped without any network requests.
|
|
411
|
+
* @returns The scrape result containing the outcome and captured resources.
|
|
412
|
+
*/
|
|
413
|
+
async scrapeStart(page, url, options, isSkip = false) {
|
|
414
|
+
this.retries = options?.retries;
|
|
444
415
|
const isExternal = options?.isExternal ?? false;
|
|
445
|
-
const
|
|
416
|
+
const captureImages = options?.captureImages ?? true;
|
|
446
417
|
const excludeKeywords = options?.excludeKeywords ?? [];
|
|
447
|
-
const
|
|
448
|
-
const
|
|
449
|
-
|
|
418
|
+
const metadataOnly = options?.metadataOnly ?? false;
|
|
419
|
+
const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
|
|
420
|
+
const resources = [];
|
|
450
421
|
void this.emit('changePhase', {
|
|
451
422
|
pid: process.pid,
|
|
452
423
|
name: 'scrapeStart',
|
|
453
|
-
url
|
|
424
|
+
url,
|
|
454
425
|
isExternal,
|
|
455
426
|
message: '',
|
|
456
427
|
});
|
|
428
|
+
// Path-excluded: return SkippedPageData
|
|
457
429
|
if (isSkip) {
|
|
458
|
-
void this.emit('ignoreAndSkip', {
|
|
459
|
-
pid: process.pid,
|
|
460
|
-
url: this.#url,
|
|
461
|
-
reason: {
|
|
462
|
-
matchedText: this.#url.pathname || '',
|
|
463
|
-
excludeKeywords,
|
|
464
|
-
},
|
|
465
|
-
});
|
|
466
430
|
void this.emit('changePhase', {
|
|
467
431
|
pid: process.pid,
|
|
468
|
-
name: '
|
|
469
|
-
url
|
|
432
|
+
name: 'pageSkipped',
|
|
433
|
+
url,
|
|
470
434
|
isExternal,
|
|
471
435
|
message: 'Matched: excluded path',
|
|
472
436
|
});
|
|
473
|
-
return
|
|
437
|
+
return {
|
|
438
|
+
type: 'skipped',
|
|
439
|
+
resources,
|
|
440
|
+
ignored: {
|
|
441
|
+
url,
|
|
442
|
+
matchedText: url.pathname || '',
|
|
443
|
+
excludeKeywords,
|
|
444
|
+
},
|
|
445
|
+
};
|
|
474
446
|
}
|
|
475
|
-
|
|
447
|
+
// Non-HTTP protocol: return minimal PageData
|
|
448
|
+
if (!url.isHTTP) {
|
|
476
449
|
const result = {
|
|
477
|
-
url
|
|
450
|
+
url,
|
|
478
451
|
isTarget: false,
|
|
479
452
|
isExternal,
|
|
480
453
|
redirectPaths: [],
|
|
@@ -491,79 +464,60 @@ let Scraper = (() => {
|
|
|
491
464
|
html: '',
|
|
492
465
|
isSkipped: false,
|
|
493
466
|
};
|
|
494
|
-
void this.emit('scrapeEnd', {
|
|
495
|
-
pid: process.pid,
|
|
496
|
-
url: this.#url,
|
|
497
|
-
timestamp: Date.now(),
|
|
498
|
-
result,
|
|
499
|
-
});
|
|
500
467
|
void this.emit('changePhase', {
|
|
501
468
|
pid: process.pid,
|
|
502
469
|
name: 'scrapeEnd',
|
|
503
|
-
url
|
|
470
|
+
url,
|
|
504
471
|
isExternal,
|
|
505
472
|
message: '',
|
|
506
473
|
});
|
|
507
|
-
return;
|
|
474
|
+
return { type: 'success', pageData: result, resources };
|
|
508
475
|
}
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
url: this.#url,
|
|
513
|
-
isExternal,
|
|
514
|
-
message: '',
|
|
515
|
-
});
|
|
516
|
-
let result = await this.#fetchHead(url, isExternal);
|
|
517
|
-
if (result instanceof Error) {
|
|
518
|
-
log('Error(FETCH_HEAD): %s', url.href);
|
|
519
|
-
void this.emit('error', {
|
|
476
|
+
let headResult = options?.headCheckResult ?? null;
|
|
477
|
+
if (headResult && metadataOnly) {
|
|
478
|
+
void this.emit('changePhase', {
|
|
520
479
|
pid: process.pid,
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
480
|
+
name: 'scrapeEnd',
|
|
481
|
+
url,
|
|
482
|
+
isExternal,
|
|
483
|
+
message: '',
|
|
524
484
|
});
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
pid: process.pid,
|
|
530
|
-
url: this.#url,
|
|
531
|
-
timestamp: Date.now(),
|
|
532
|
-
result: {
|
|
533
|
-
...result,
|
|
485
|
+
return {
|
|
486
|
+
type: 'success',
|
|
487
|
+
pageData: {
|
|
488
|
+
...headResult,
|
|
534
489
|
isTarget: false,
|
|
535
490
|
},
|
|
536
|
-
|
|
537
|
-
|
|
491
|
+
resources,
|
|
492
|
+
};
|
|
538
493
|
}
|
|
539
|
-
if (
|
|
540
|
-
const
|
|
541
|
-
const page = await this.#createPage(isExternal, executablePath, headlessMode);
|
|
542
|
-
result = await this.#fetchData(page, url, isExternal, isGettingImages, options).catch((error) => {
|
|
494
|
+
if (headResult === null || headResult.contentType === 'text/html') {
|
|
495
|
+
const fetchResult = await this.#fetchData(page, url, isExternal, captureImages, imageLoadTimeout, resources, options).catch((error) => {
|
|
543
496
|
if (error instanceof Error) {
|
|
544
497
|
return error;
|
|
545
498
|
}
|
|
546
499
|
return new Error(error);
|
|
547
500
|
});
|
|
548
|
-
if (
|
|
501
|
+
if (fetchResult instanceof Error) {
|
|
549
502
|
log('Error(FETCH_DATA): %s', url.href);
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
error:
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
503
|
+
page.removeAllListeners();
|
|
504
|
+
return {
|
|
505
|
+
type: 'error',
|
|
506
|
+
resources,
|
|
507
|
+
error: {
|
|
508
|
+
name: fetchResult.name,
|
|
509
|
+
message: fetchResult.message,
|
|
510
|
+
stack: fetchResult.stack,
|
|
511
|
+
shutdown: true,
|
|
512
|
+
},
|
|
513
|
+
};
|
|
558
514
|
}
|
|
559
515
|
page.removeAllListeners();
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
if (!result.isSkipped) {
|
|
564
|
-
const checkedKeyword = keywordCheck(result.html, excludeKeywords);
|
|
516
|
+
headResult = fetchResult;
|
|
517
|
+
if (!headResult.isSkipped) {
|
|
518
|
+
const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
|
|
565
519
|
if (checkedKeyword) {
|
|
566
|
-
|
|
520
|
+
headResult = {
|
|
567
521
|
url,
|
|
568
522
|
isSkipped: true,
|
|
569
523
|
matched: {
|
|
@@ -574,48 +528,131 @@ let Scraper = (() => {
|
|
|
574
528
|
};
|
|
575
529
|
}
|
|
576
530
|
}
|
|
577
|
-
if (
|
|
578
|
-
if (
|
|
579
|
-
return
|
|
531
|
+
if (headResult.isSkipped) {
|
|
532
|
+
if (headResult.matched.type === 'path') {
|
|
533
|
+
return {
|
|
534
|
+
type: 'skipped',
|
|
535
|
+
resources,
|
|
536
|
+
ignored: {
|
|
537
|
+
url,
|
|
538
|
+
matchedText: url.pathname || '',
|
|
539
|
+
excludeKeywords,
|
|
540
|
+
},
|
|
541
|
+
};
|
|
580
542
|
}
|
|
581
|
-
void this.emit('ignoreAndSkip', {
|
|
582
|
-
pid: process.pid,
|
|
583
|
-
url: this.#url,
|
|
584
|
-
reason: {
|
|
585
|
-
matchedText: result.matched.text,
|
|
586
|
-
excludeKeywords,
|
|
587
|
-
},
|
|
588
|
-
});
|
|
589
543
|
void this.emit('changePhase', {
|
|
590
544
|
pid: process.pid,
|
|
591
|
-
name: '
|
|
592
|
-
url
|
|
545
|
+
name: 'pageSkipped',
|
|
546
|
+
url,
|
|
593
547
|
isExternal,
|
|
594
|
-
message: `Matched: "${
|
|
548
|
+
message: `Matched: "${headResult.matched.text}"`,
|
|
595
549
|
});
|
|
596
|
-
return
|
|
550
|
+
return {
|
|
551
|
+
type: 'skipped',
|
|
552
|
+
resources,
|
|
553
|
+
ignored: {
|
|
554
|
+
url,
|
|
555
|
+
matchedText: headResult.matched.text,
|
|
556
|
+
excludeKeywords,
|
|
557
|
+
},
|
|
558
|
+
};
|
|
597
559
|
}
|
|
598
560
|
}
|
|
599
|
-
void this.emit('scrapeEnd', {
|
|
600
|
-
pid: process.pid,
|
|
601
|
-
url: this.#url,
|
|
602
|
-
timestamp: Date.now(),
|
|
603
|
-
result,
|
|
604
|
-
});
|
|
605
561
|
void this.emit('changePhase', {
|
|
606
562
|
pid: process.pid,
|
|
607
563
|
name: 'scrapeEnd',
|
|
608
|
-
url
|
|
564
|
+
url,
|
|
609
565
|
isExternal,
|
|
610
566
|
message: '',
|
|
611
567
|
});
|
|
612
|
-
return
|
|
568
|
+
return { type: 'success', pageData: headResult, resources };
|
|
569
|
+
}
|
|
570
|
+
/**
|
|
571
|
+
* Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
|
|
572
|
+
*
|
|
573
|
+
* WHY a separate factory: The listener must capture `isExternal` for phase events
|
|
574
|
+
* while conforming to the `beforePageScan` listener signature.
|
|
575
|
+
* Currently only handles the `scroll` phase to report scroll progress.
|
|
576
|
+
* @param isExternal - Whether the current page is external to the crawl scope
|
|
577
|
+
* @returns A listener function compatible with `beforePageScan`'s `listener` option
|
|
578
|
+
*/
|
|
579
|
+
#createPageScanListener(isExternal) {
|
|
580
|
+
return (phase, data) => {
|
|
581
|
+
switch (phase) {
|
|
582
|
+
case 'scroll': {
|
|
583
|
+
const d = data;
|
|
584
|
+
const scrollMsg = Number.isNaN(d.scrollHeight)
|
|
585
|
+
? `%propeller% ${d.message}`
|
|
586
|
+
: `%propeller% ${d.scrollY}px/${d.scrollHeight}px (${Math.round((d.scrollY / d.scrollHeight) * 100)}%) ${d.message}`;
|
|
587
|
+
void this.emit('changePhase', {
|
|
588
|
+
pid: process.pid,
|
|
589
|
+
name: 'scrollToBottom',
|
|
590
|
+
url: null,
|
|
591
|
+
isExternal,
|
|
592
|
+
message: scrollMsg,
|
|
593
|
+
});
|
|
594
|
+
break;
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
};
|
|
613
598
|
}
|
|
614
|
-
|
|
615
|
-
|
|
599
|
+
/**
|
|
600
|
+
* Navigates the page to the target URL and extracts full page data.
|
|
601
|
+
*
|
|
602
|
+
* WHY retryable with 3-min timeout: Page navigation can fail due to transient
|
|
603
|
+
* network issues or slow-loading pages. The decorator retries automatically,
|
|
604
|
+
* emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
|
|
605
|
+
*
|
|
606
|
+
* Flow:
|
|
607
|
+
* 1. Register request/response listeners to capture sub-resources (internal pages only)
|
|
608
|
+
* 2. Navigate to URL via `page.goto()` and track redirect chain
|
|
609
|
+
* 3. Wait for DOM content and network idle
|
|
610
|
+
* 4. Extract anchors, meta, and optionally images
|
|
611
|
+
* 5. Check for keyword exclusion in HTML content
|
|
612
|
+
* @param page - Puppeteer page instance
|
|
613
|
+
* @param url - Target URL to navigate to
|
|
614
|
+
* @param isExternal - Whether the URL is external to the crawl scope
|
|
615
|
+
* @param captureImages - Whether to run the image extraction pipeline
|
|
616
|
+
* @param imageLoadTimeout - Timeout (ms) to wait for lazy-loaded images to finish loading
|
|
617
|
+
* @param resources - Mutable array to collect captured sub-resources into
|
|
618
|
+
* @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
|
|
619
|
+
* @returns Full page data or skipped page data if an exclusion rule matched
|
|
620
|
+
*/
|
|
616
621
|
get #fetchData() { return _private_fetchData_descriptor.value; }
|
|
617
|
-
|
|
622
|
+
/**
|
|
623
|
+
* Extracts image data from the page across multiple device presets.
|
|
624
|
+
*
|
|
625
|
+
* WHY multiple device presets: Images may differ between desktop and mobile
|
|
626
|
+
* due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
|
|
627
|
+
* and `mobile-small` viewports reveals responsive image issues.
|
|
628
|
+
*
|
|
629
|
+
* WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
|
|
630
|
+
* best-effort. If all retries fail, an empty array is returned rather than
|
|
631
|
+
* failing the entire page scrape.
|
|
632
|
+
* @param page - Puppeteer page instance
|
|
633
|
+
* @param url - The page URL string (without hash and auth)
|
|
634
|
+
* @param isExternal - Whether the page is external
|
|
635
|
+
* @param imageLoadTimeout - Timeout (ms) to wait for images to finish loading
|
|
636
|
+
* @returns Array of image elements from all device presets
|
|
637
|
+
*/
|
|
618
638
|
get #fetchImages() { return _private_fetchImages_descriptor.value; }
|
|
619
639
|
};
|
|
620
640
|
})();
|
|
641
|
+
/**
|
|
642
|
+
* Page-level scraper that extracts data from a single browser page.
|
|
643
|
+
*
|
|
644
|
+
* The scraper returns results as values from `scrapeStart()` rather than
|
|
645
|
+
* emitting them as events. Only streaming events (changePhase, resourceResponse)
|
|
646
|
+
* are emitted for progress monitoring.
|
|
647
|
+
*
|
|
648
|
+
* The Puppeteer `Page` object is injected externally, and page lifecycle
|
|
649
|
+
* (including `page.close()`) is managed by the caller.
|
|
650
|
+
* @example
|
|
651
|
+
* ```ts
|
|
652
|
+
* const scraper = new Scraper();
|
|
653
|
+
* scraper.on('changePhase', (e) => console.log(e.name));
|
|
654
|
+
* const result = await scraper.scrapeStart(page, url, { isExternal: false });
|
|
655
|
+
* ```
|
|
656
|
+
*/
|
|
657
|
+
// eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
|
|
621
658
|
export default Scraper;
|