@d-zero/beholder 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/LICENSE +21 -0
- package/README.md +5 -0
- package/dist/debug.d.ts +6 -0
- package/dist/debug.js +6 -0
- package/dist/dom-evaluation.d.ts +24 -0
- package/dist/dom-evaluation.js +114 -0
- package/dist/events.d.ts +32 -0
- package/dist/events.js +15 -0
- package/dist/fetch-destination.d.ts +2 -0
- package/dist/fetch-destination.js +132 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +4 -0
- package/dist/keyword-check.d.ts +1 -0
- package/dist/keyword-check.js +10 -0
- package/dist/net-timeout-error.d.ts +3 -0
- package/dist/net-timeout-error.js +3 -0
- package/dist/network.d.ts +2 -0
- package/dist/network.js +132 -0
- package/dist/scraper.d.ts +15 -0
- package/dist/scraper.js +678 -0
- package/dist/sub-process-runner.d.ts +12 -0
- package/dist/sub-process-runner.js +180 -0
- package/dist/sub-process.d.ts +1 -0
- package/dist/sub-process.js +67 -0
- package/dist/types.d.ts +271 -0
- package/dist/types.js +1 -0
- package/dist/utils.d.ts +5 -0
- package/dist/utils.js +142 -0
- package/package.json +34 -0
- package/src/debug.ts +7 -0
- package/src/dom-evaluation.ts +175 -0
- package/src/events.ts +21 -0
- package/src/fetch-destination.ts +160 -0
- package/src/index.ts +4 -0
- package/src/keyword-check.spec.ts +8 -0
- package/src/keyword-check.ts +12 -0
- package/src/net-timeout-error.ts +3 -0
- package/src/scraper.ts +733 -0
- package/src/sub-process-runner.ts +220 -0
- package/src/sub-process.ts +86 -0
- package/src/types.ts +341 -0
- package/src/utils.ts +171 -0
- package/tsconfig.json +15 -0
- package/tsconfig.tsbuildinfo +1 -0
package/src/scraper.ts
ADDED
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
ScrapeEventTypes,
|
|
3
|
+
ImageElement,
|
|
4
|
+
NetworkLog,
|
|
5
|
+
PageData,
|
|
6
|
+
ParseURLOptions,
|
|
7
|
+
Resource,
|
|
8
|
+
SkippedPageData,
|
|
9
|
+
ExURL,
|
|
10
|
+
} from './types.js';
|
|
11
|
+
import type { Browser, Page } from 'puppeteer';
|
|
12
|
+
|
|
13
|
+
import { retry } from '@d-zero/shared/retry';
|
|
14
|
+
import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
15
|
+
import puppeteer from 'puppeteer';
|
|
16
|
+
|
|
17
|
+
import { resourceLog, scraperLog } from './debug.js';
|
|
18
|
+
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
19
|
+
import { fetchDestination } from './fetch-destination.js';
|
|
20
|
+
import { keywordCheck } from './keyword-check.js';
|
|
21
|
+
import { detectCDN, detectCompress, isError, parseUrl } from './utils.js';
|
|
22
|
+
|
|
23
|
+
// Both debug loggers are extended with this process's id.
const pid = String(process.pid);
const log = scraperLog.extend(pid);
const rLog = resourceLog.extend(pid);

/** Value handed to `puppeteer.launch` as its `timeout` option (30,000). */
const LAUNCH_BROWSER_TIMEOUT = 30 * 1000;
|
|
28
|
+
|
|
29
|
+
/**
 * Options accepted by `Scraper#scrapeStart`, combined with the URL parsing
 * options that are forwarded to `parseUrl` and `getAnchorList`.
 */
export type ScraperOptions = ParseURLOptions & {
	/** Whether the URL is treated as external; `scrapeStart` falls back to `false`. */
	isExternal: boolean;
	/** Whether the image list is collected; `scrapeStart` falls back to `true`. */
	isGettingImages: boolean;
	/** Keywords checked against the fetched HTML; `scrapeStart` falls back to `[]`. */
	excludeKeywords: string[];
	/** Browser executable handed to `puppeteer.launch`; `null` leaves it unset. */
	executablePath: string | null;
	/** When `true`, `scrapeStart` finishes right after the head request succeeds. */
	isTitleOnly: boolean;
	/** NOTE(review): not read anywhere in this file; confirm the intended use. */
	screenshot: string | null;
};
|
|
37
|
+
|
|
38
|
+
/**
 * Scrapes one URL at a time with Puppeteer and reports progress as events.
 *
 * Events emitted from this class: `changePhase`, `ignoreAndSkip`,
 * `scrapeEnd`, `resourceResponse`, `error` and `destroyed`.
 */
export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
	/** Browser launched lazily by `#bootBrowser`; reset to `null` by `destroy`. */
	#browser: Browser | null = null;
	/** URL given to the most recent `scrapeStart` call; `null` before the first call. */
	#url: ExURL | null = null;

	/**
	 * Closes all pages and the browser (when one was launched), then emits
	 * `destroyed` followed by a `changePhase` event named `destroyed`.
	 *
	 * @param isExternal Forwarded unchanged to the `changePhase` payload.
	 * @throws Error when no URL has been stored by `scrapeStart` yet.
	 */
	async destroy(isExternal: boolean) {
		log('Scraper destroys self');
		const url = this.#url;
		if (!url) {
			throw new Error('The instance is already destroyed.');
		}

		const browser = this.#browser;
		if (browser) {
			if (browser.isConnected()) {
				log('Browser closes all pages');
				for (const page of await browser.pages()) {
					page.removeAllListeners();
					// `isClosed` is a method; testing the bare reference never closed anything.
					if (!page.isClosed()) {
						await page.close();
					}
				}
				log('Browser closes self');
				await browser.close();
				log('Browser disconnects');
				await browser.disconnect();
			} else {
				// The connection is already gone; closing is best effort only.
				try {
					await browser.close();
				} catch (error) {
					log(`Error: ${error}`);
				}
			}
			log('Scraper discards browser');
			this.#browser = null;
		}

		void this.emit('destroyed', {
			pid: process.pid,
		});
		void this.emit('changePhase', {
			pid: process.pid,
			name: 'destroyed',
			url,
			isExternal,
			message: '',
		});
	}

	/**
	 * Scrapes `url` and reports the outcome through events.
	 *
	 * Flow: emit `scrapeStart`; stop early for skipped or non-HTTP URLs; issue
	 * a head request; when that fails or reports `text/html`, open the page in
	 * the browser, collect its data and apply the keyword exclusion.
	 *
	 * @param url URL to scrape.
	 * @param options Scraper options; missing members fall back to the defaults below.
	 * @param isSkip When `true`, only `ignoreAndSkip` is reported and nothing is fetched.
	 * @returns The page data when the scrape reaches its regular end, otherwise `undefined`.
	 */
	async scrapeStart(url: ExURL, options?: Partial<ScraperOptions>, isSkip = false) {
		const isExternal = options?.isExternal ?? false;
		const isGettingImages = options?.isGettingImages ?? true;
		const excludeKeywords = options?.excludeKeywords ?? [];
		const executablePath = options?.executablePath ?? null;
		const isTitleOnly = options?.isTitleOnly ?? false;

		this.#url = url;
		void this.emit('changePhase', {
			pid: process.pid,
			name: 'scrapeStart',
			url,
			isExternal,
			message: '',
		});

		if (isSkip) {
			void this.emit('ignoreAndSkip', {
				pid: process.pid,
				url,
				reason: {
					matchedText: url.pathname || '',
					excludeKeywords,
				},
			});
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'ignoreAndSkip',
				url,
				isExternal,
				message: 'Matched: excluded path',
			});
			return;
		}

		if (!url.isHTTP) {
			// Non-HTTP URLs are reported with a placeholder result and never fetched.
			const result: PageData = {
				url,
				isTarget: false,
				isExternal,
				redirectPaths: [],
				status: -1,
				statusText: '__THIS_IS_NOT_HTTP_PROTOCOL__',
				contentType: null,
				contentLength: null,
				responseHeaders: {},
				meta: {
					title: '',
				},
				imageList: [],
				anchorList: [],
				html: '',
				isSkipped: false,
			};

			void this.emit('scrapeEnd', {
				pid: process.pid,
				url,
				timestamp: Date.now(),
				result,
			});
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'scrapeEnd',
				url,
				isExternal,
				message: '',
			});
			return;
		}

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'touchHead',
			url,
			isExternal,
			message: '',
		});

		let result: PageData | SkippedPageData | Error | null = await this.#fetchHead(
			url,
			isExternal,
		);

		if (result instanceof Error) {
			// A failed head request is reported but not fatal: the browser is tried next.
			log('Error(FETCH_HEAD): %s', url.href);
			void this.emit('error', {
				pid: process.pid,
				url,
				shutdown: false,
				error: result,
			});
			result = null;
		}

		if (result && isTitleOnly) {
			void this.emit('scrapeEnd', {
				pid: process.pid,
				url,
				timestamp: Date.now(),
				result: {
					...result,
					isTarget: false,
				},
			});
			return;
		}

		if (result === null || result.contentType === 'text/html') {
			const headlessMode: true | 'shell' = url.isSecure ? true : 'shell';
			const page = await this.#createPage(isExternal, executablePath, headlessMode);

			try {
				result = await this.#fetchData(page, url, isExternal, isGettingImages, options);
			} catch (error) {
				// Normalise whatever was thrown so the `error` event always carries an Error.
				result = error instanceof Error ? error : new Error(String(error));
			}

			if (result instanceof Error) {
				log('Error(FETCH_DATA): %s', url.href);
				void this.emit('error', {
					pid: process.pid,
					url,
					shutdown: true,
					error: result,
				});
				await this.destroy(isExternal);
				return;
			}

			page.removeAllListeners();
			// `isClosed` must be called; the bare method reference is always truthy.
			if (!page.isClosed()) {
				await page.close();
			}

			if (!result.isSkipped) {
				const checkedKeyword = keywordCheck(result.html, excludeKeywords);

				if (checkedKeyword) {
					result = {
						url,
						isSkipped: true,
						matched: {
							type: 'keyword',
							text: checkedKeyword,
							excludeKeywords,
						},
					};
				}
			}

			if (result.isSkipped) {
				if (result.matched.type === 'path') {
					return;
				}
				void this.emit('ignoreAndSkip', {
					pid: process.pid,
					url,
					reason: {
						matchedText: result.matched.text,
						excludeKeywords,
					},
				});
				void this.emit('changePhase', {
					pid: process.pid,
					name: 'ignoreAndSkip',
					url,
					isExternal,
					message: `Matched: "${result.matched.text}"`,
				});
				return;
			}
		}

		void this.emit('scrapeEnd', {
			pid: process.pid,
			url,
			timestamp: Date.now(),
			result,
		});
		void this.emit('changePhase', {
			pid: process.pid,
			name: 'scrapeEnd',
			url,
			isExternal,
			message: '',
		});

		return result;
	}

	/**
	 * Returns a usable browser, launching one when none is held or when the
	 * held one has lost its connection.
	 *
	 * @param isExternal Forwarded to the `changePhase` payload.
	 * @param executablePath Browser executable; `null` leaves the option unset.
	 * @param headless Headless mode handed to `puppeteer.launch`.
	 * @returns The connected browser instance.
	 * @throws Whatever `puppeteer.launch` rejects with (after an `error` event for `Error` values).
	 */
	// eslint-disable-next-line no-restricted-syntax
	@retry()
	async #bootBrowser(
		isExternal: boolean,
		executablePath: string | null,
		headless: boolean | 'shell',
	) {
		let browser = this.#browser;

		if (browser && !browser.isConnected()) {
			// A disconnected browser cannot open pages: drop it and launch a new one
			// instead of handing the dead instance back to the caller.
			try {
				await browser.close();
			} catch (error) {
				log(`Error: ${error}`);
			}
			browser = null;
			this.#browser = null;
		}

		if (!browser) {
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'launchBrowser',
				url: this.#url,
				isExternal,
				message: executablePath || '(executablePath is default)',
			});

			try {
				browser = await puppeteer.launch({
					headless,
					timeout: LAUNCH_BROWSER_TIMEOUT,
					executablePath: executablePath ?? undefined,
					args: [
						// TODO: Optional lang
						'--lang=ja',
						'--no-zygote',
						'--ignore-certificate-errors',
					],
				});
			} catch (error) {
				if (error instanceof Error) {
					void this.emit('error', {
						pid: process.pid,
						// Set by `scrapeStart` before any browser work begins.
						url: this.#url!,
						shutdown: false,
						error,
					});
				}
				throw error;
			}

			this.#browser = browser;
		}

		return browser;
	}

	/**
	 * Opens a new page with navigation timeouts disabled, a fixed desktop
	 * Chrome user agent and a Japanese `Accept-Language` header.
	 *
	 * @param isExternal Forwarded to `#bootBrowser` and the `changePhase` payload.
	 * @param executablePath Forwarded to `#bootBrowser`.
	 * @param headless Forwarded to `#bootBrowser`.
	 * @returns The configured page.
	 */
	// eslint-disable-next-line no-restricted-syntax
	@retry()
	async #createPage(
		isExternal: boolean,
		executablePath: string | null,
		headless: boolean | 'shell',
	) {
		const browser = await this.#bootBrowser(isExternal, executablePath, headless);

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'newPage',
			url: this.#url,
			isExternal,
			message: '',
		});

		const page = await browser.newPage();
		// 0 disables the navigation timeout.
		page.setDefaultNavigationTimeout(0);
		await page.setUserAgent(
			'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
		);
		await page.setExtraHTTPHeaders({
			// TODO: Optional lang
			'Accept-Language': 'ja-JP',
		});

		return page;
	}

	/**
	 * Navigates `page` to `url` and collects the page data.
	 *
	 * For non-external URLs every sub-resource response is reported through a
	 * `resourceResponse` event. Non-HTML responses and external pages return
	 * early with empty anchor and image lists.
	 *
	 * @param page Page created by `#createPage`.
	 * @param url URL to open.
	 * @param isExternal Whether the URL is treated as external by the caller.
	 * @param isGettingImages Whether the image list is collected.
	 * @param options URL parsing options forwarded to `parseUrl` and `getAnchorList`.
	 * @returns The collected page data.
	 * @throws Error when `page.goto` yields no response or the final URL cannot be parsed.
	 */
	// eslint-disable-next-line no-restricted-syntax
	@retry({
		timeout: 1 * 60 * 1000, // 60,000 (one minute if the option is in milliseconds)
		// retries: 1,
	})
	async #fetchData(
		page: Page,
		url: ExURL,
		isExternal: boolean,
		isGettingImages: boolean,
		options?: ParseURLOptions,
	): Promise<PageData | SkippedPageData> {
		// Request records, keyed by the parsed URL without hash so the response
		// handler looks them up under exactly the same key.
		const networkLogs = new Map<string, NetworkLog>();

		page.on('dialog', async (dialog) => {
			log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
			try {
				await dialog.accept();
				log(`Accept ${dialog.type()} dialog`);
			} catch (error) {
				log(`Error: ${error}`);
			}
		});

		if (!isExternal) {
			page.on('request', (request) => {
				const reqURL = parseUrl(request.url(), options);
				if (!reqURL) {
					return;
				}
				networkLogs.set(reqURL.withoutHash, {
					url: reqURL,
					status: null,
					contentLength: 0,
					contentType: '',
					isError: false,
					request: {
						ts: Date.now(),
						headers: request.headers(),
						method: request.method(),
					},
				});
			});

			const reported = new Set<string>();
			page.on('response', (response) => {
				const resURL = parseUrl(response.url(), options);
				if (!resURL) {
					return;
				}

				const key = resURL.withoutHash;
				// Report each resource once and never the main document itself.
				if (reported.has(key) || key === url.withoutHash) {
					return;
				}
				reported.add(key);

				const headers = response.headers();
				const status = response.status();
				const statusText = response.statusText();
				const contentType = headers['content-type']?.split(';')[0] || null;
				const contentLength =
					Number.parseInt(headers['content-length'] ?? '', 10) || null;

				// Fall back to the response's own request when no record was stored.
				const requested: NetworkLog = networkLogs.get(key) ?? {
					url: resURL,
					status: null,
					contentLength: 0,
					contentType: '',
					isError: false,
					request: {
						ts: Date.now(),
						headers: response.request().headers(),
						method: response.request().method(),
					},
				};

				const networkLog: NetworkLog = {
					...requested,
					response: {
						ts: Date.now(),
						status,
						statusText,
						fromCache: response.fromCache(),
						headers,
					},
					status,
					isError: isError(status),
					contentType: contentType || '',
					contentLength: contentLength || 0,
				};

				const referredLink: Omit<Resource, 'uid'> = {
					url: resURL,
					isExternal: resURL.hostname !== url.hostname,
					isError: networkLog.isError,
					status,
					statusText,
					contentType,
					contentLength,
					compress: detectCompress(headers),
					cdn: detectCDN(headers),
					headers,
				};

				rLog('Fetched: %s', resURL.href);
				void this.emit('resourceResponse', {
					pid: process.pid,
					url,
					log: networkLog,
					resource: referredLink,
				});
			});
		}

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'openPage',
			url: this.#url,
			isExternal,
			message: '',
		});

		if (url.username && url.password) {
			// await page.authenticate({ username: url.username, password: url.password });
			// NOTE(review): this call may replace the headers set in `#createPage`
			// (Accept-Language) — confirm against the Puppeteer documentation.
			await page.setExtraHTTPHeaders({
				Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
			});
		}

		const res = await page.goto(url.withoutHashAndAuth);

		if (!res) {
			throw new Error('The method Page.goto returned null');
		}

		const destUrl = parseUrl(page.url(), options);
		if (!destUrl) {
			throw new Error(`Failed to parse the destination URL: ${page.url()}`);
		}

		const redirectPaths = res
			.request()
			.redirectChain()
			.map((req) => req.url());
		if (destUrl.withoutHash !== url.withoutHash) {
			redirectPaths.push(destUrl.withoutHash);
		}

		// Landing on another host makes the page external from here on.
		const isExternalDest = isExternal || destUrl.hostname !== url.hostname;

		const status = res.status();
		const statusText = res.statusText();
		const responseHeaders = res.headers();
		const contentType = responseHeaders['content-type']?.split(';')[0] || null;
		const parsedLength = Number.parseInt(responseHeaders['content-length'] ?? '', 10);
		const contentLength = Number.isFinite(parsedLength) ? parsedLength : null;

		// Members shared by every result returned below.
		const base = {
			url,
			isExternal: isExternalDest,
			redirectPaths,
			status,
			statusText,
			contentType,
			contentLength,
			responseHeaders,
		};

		if (contentType !== 'text/html') {
			return {
				...base,
				isTarget: false,
				meta: {
					title: '',
				},
				imageList: [],
				anchorList: [],
				html: '',
				isSkipped: false,
			};
		}

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'loadDOMContent',
			url: this.#url,
			isExternal: isExternalDest,
			message: '',
		});

		// Best-effort wait: a timeout or any other failure is deliberately ignored.
		await page
			.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 })
			.catch(() => {});

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'getHTML',
			url: this.#url,
			isExternal: isExternalDest,
			message: '',
		});

		const { title, html } = await page.evaluate(() => {
			/* global document */
			return {
				title: document.title,
				html: document.documentElement.outerHTML,
			};
		});

		if (isExternalDest) {
			return {
				...base,
				isTarget: false,
				meta: {
					title,
				},
				imageList: [],
				anchorList: [],
				html,
				isSkipped: false,
			};
		}

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'waitNetworkIdleZero',
			url: this.#url,
			isExternal: isExternalDest,
			message: '',
		});

		// Best-effort wait: a timeout or any other failure is deliberately ignored.
		await page
			.waitForNavigation({ waitUntil: 'networkidle0', timeout: 5000 })
			.catch(() => {});

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'getAnchors',
			url: this.#url,
			isExternal: isExternalDest,
			message: '',
		});
		const anchorList = await getAnchorList(page, options);

		void this.emit('changePhase', {
			pid: process.pid,
			name: 'getMeta',
			url: this.#url,
			isExternal: isExternalDest,
			message: '',
		});
		const meta = await getMeta(page);

		const imageList = isGettingImages
			? await this.#fetchImages(page, isExternalDest)
			: [];

		return {
			...base,
			isTarget: true,
			meta,
			anchorList,
			imageList,
			html,
			isSkipped: false,
		};
	}

	/**
	 * Delegates the head request to `fetchDestination`.
	 *
	 * @param url URL to request.
	 * @param isExternal Forwarded to `fetchDestination`.
	 * @returns Whatever `fetchDestination` resolves to.
	 */
	// eslint-disable-next-line no-restricted-syntax
	@retry()
	async #fetchHead(url: ExURL, isExternal: boolean) {
		return await fetchDestination(url, isExternal);
	}

	/**
	 * Collects the image list twice: with a 1280x800 viewport and with a
	 * 320x568 touch viewport. Each pass sets the viewport, scrolls to the
	 * bottom and reads the images, emitting a `changePhase` event per step.
	 *
	 * @param page Page whose images are collected.
	 * @param isExternal Forwarded to the `changePhase` payloads.
	 * @returns Desktop images followed by mobile images.
	 */
	// eslint-disable-next-line no-restricted-syntax
	@retry({
		timeout: 5 * 60 * 1000, // 300,000 (five minutes if the option is in milliseconds)
		fallback: [],
	})
	async #fetchImages(page: Page, isExternal: boolean): Promise<ImageElement[]> {
		const viewports = [
			{ width: 1280, height: 800 },
			{ width: 320, height: 568, deviceScaleFactor: 2, isMobile: true, hasTouch: true },
		] as const;

		const imageList: ImageElement[] = [];

		for (const viewport of viewports) {
			const message = `${viewport.width}x${viewport.height}`;

			void this.emit('changePhase', {
				pid: process.pid,
				name: 'setViewport',
				url: this.#url,
				isExternal,
				message,
			});
			await page.setViewport(viewport);

			void this.emit('changePhase', {
				pid: process.pid,
				name: 'scrollToBottom',
				url: this.#url,
				isExternal,
				message,
			});
			await autoScroll(page, viewport.height);

			void this.emit('changePhase', {
				pid: process.pid,
				name: 'getImages',
				url: this.#url,
				isExternal,
				message,
			});
			imageList.push(...(await getImageList(page, viewport.width)));
		}

		return imageList;
	}
}
|
|
714
|
+
|
|
715
|
+
/**
 * Scrolls the page down by `height` pixels every 100 ms and resolves once the
 * accumulated distance reaches the body's `scrollHeight` or 50,000 pixels.
 *
 * @param page Page whose window is scrolled.
 * @param height Number of pixels scrolled on each step.
 */
async function autoScroll(page: Page, height: number) {
	await page.evaluate(async (height: number) => {
		/* global window */
		const step = height;
		let scrolled = 0;

		await new Promise<void>((done) => {
			const intervalId = setInterval(() => {
				// The limit is read before this step's scroll is applied.
				const limit = document.body.scrollHeight;
				window.scrollBy(0, step);
				scrolled += step;

				const finished = scrolled >= limit || scrolled >= 50_000;
				if (finished) {
					clearInterval(intervalId);
					done();
				}
			}, 100);
		});
	}, height);
}
|