@d-zero/beholder 0.1.29 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +172 -477
- package/package.json +7 -11
- package/src/debug.ts +5 -2
- package/src/dom-evaluation.ts +195 -65
- package/src/index.ts +27 -3
- package/src/is-error.spec.ts +33 -0
- package/src/is-error.ts +10 -0
- package/src/keyword-check.spec.ts +45 -4
- package/src/keyword-check.ts +5 -3
- package/src/parse-url.spec.ts +35 -0
- package/src/parse-url.ts +26 -0
- package/src/scraper.ts +338 -300
- package/src/types.ts +345 -258
- package/LICENSE +0 -21
- package/dist/debug.d.ts +0 -6
- package/dist/debug.js +0 -6
- package/dist/dom-evaluation.d.ts +0 -51
- package/dist/dom-evaluation.js +0 -147
- package/dist/events.d.ts +0 -32
- package/dist/events.js +0 -15
- package/dist/fetch-destination.d.ts +0 -8
- package/dist/fetch-destination.js +0 -145
- package/dist/index.d.ts +0 -4
- package/dist/index.js +0 -4
- package/dist/keyword-check.d.ts +0 -6
- package/dist/keyword-check.js +0 -15
- package/dist/net-timeout-error.d.ts +0 -3
- package/dist/net-timeout-error.js +0 -3
- package/dist/scraper.d.ts +0 -15
- package/dist/scraper.js +0 -621
- package/dist/sub-process-runner.d.ts +0 -12
- package/dist/sub-process-runner.js +0 -180
- package/dist/sub-process.d.ts +0 -1
- package/dist/sub-process.js +0 -67
- package/dist/types.d.ts +0 -271
- package/dist/types.js +0 -1
- package/dist/utils.d.ts +0 -16
- package/dist/utils.js +0 -69
- package/src/events.ts +0 -21
- package/src/fetch-destination.ts +0 -173
- package/src/net-timeout-error.ts +0 -3
- package/src/sub-process-runner.ts +0 -220
- package/src/sub-process.ts +0 -86
- package/src/utils.ts +0 -89
- package/tsconfig.tsbuildinfo +0 -1
package/dist/scraper.js
DELETED
|
@@ -1,621 +0,0 @@
|
|
|
1
|
-
var __runInitializers = (this && this.__runInitializers) || function (thisArg, initializers, value) {
|
|
2
|
-
var useValue = arguments.length > 2;
|
|
3
|
-
for (var i = 0; i < initializers.length; i++) {
|
|
4
|
-
value = useValue ? initializers[i].call(thisArg, value) : initializers[i].call(thisArg);
|
|
5
|
-
}
|
|
6
|
-
return useValue ? value : void 0;
|
|
7
|
-
};
|
|
8
|
-
var __esDecorate = (this && this.__esDecorate) || function (ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {
|
|
9
|
-
function accept(f) { if (f !== void 0 && typeof f !== "function") throw new TypeError("Function expected"); return f; }
|
|
10
|
-
var kind = contextIn.kind, key = kind === "getter" ? "get" : kind === "setter" ? "set" : "value";
|
|
11
|
-
var target = !descriptorIn && ctor ? contextIn["static"] ? ctor : ctor.prototype : null;
|
|
12
|
-
var descriptor = descriptorIn || (target ? Object.getOwnPropertyDescriptor(target, contextIn.name) : {});
|
|
13
|
-
var _, done = false;
|
|
14
|
-
for (var i = decorators.length - 1; i >= 0; i--) {
|
|
15
|
-
var context = {};
|
|
16
|
-
for (var p in contextIn) context[p] = p === "access" ? {} : contextIn[p];
|
|
17
|
-
for (var p in contextIn.access) context.access[p] = contextIn.access[p];
|
|
18
|
-
context.addInitializer = function (f) { if (done) throw new TypeError("Cannot add initializers after decoration has completed"); extraInitializers.push(accept(f || null)); };
|
|
19
|
-
var result = (0, decorators[i])(kind === "accessor" ? { get: descriptor.get, set: descriptor.set } : descriptor[key], context);
|
|
20
|
-
if (kind === "accessor") {
|
|
21
|
-
if (result === void 0) continue;
|
|
22
|
-
if (result === null || typeof result !== "object") throw new TypeError("Object expected");
|
|
23
|
-
if (_ = accept(result.get)) descriptor.get = _;
|
|
24
|
-
if (_ = accept(result.set)) descriptor.set = _;
|
|
25
|
-
if (_ = accept(result.init)) initializers.unshift(_);
|
|
26
|
-
}
|
|
27
|
-
else if (_ = accept(result)) {
|
|
28
|
-
if (kind === "field") initializers.unshift(_);
|
|
29
|
-
else descriptor[key] = _;
|
|
30
|
-
}
|
|
31
|
-
}
|
|
32
|
-
if (target) Object.defineProperty(target, contextIn.name, descriptor);
|
|
33
|
-
done = true;
|
|
34
|
-
};
|
|
35
|
-
var __setFunctionName = (this && this.__setFunctionName) || function (f, name, prefix) {
|
|
36
|
-
if (typeof name === "symbol") name = name.description ? "[".concat(name.description, "]") : "";
|
|
37
|
-
return Object.defineProperty(f, "name", { configurable: true, value: prefix ? "".concat(prefix, " ", name) : name });
|
|
38
|
-
};
|
|
39
|
-
import { beforePageScan } from '@d-zero/puppeteer-page-scan';
|
|
40
|
-
import { parseUrl } from '@d-zero/shared/parse-url';
|
|
41
|
-
import { retry } from '@d-zero/shared/retry';
|
|
42
|
-
import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
43
|
-
import { launch } from 'puppeteer';
|
|
44
|
-
import { resourceLog, scraperLog } from './debug.js';
|
|
45
|
-
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
46
|
-
import { fetchDestination } from './fetch-destination.js';
|
|
47
|
-
import { keywordCheck } from './keyword-check.js';
|
|
48
|
-
import { detectCDN, detectCompress, isError } from './utils.js';
|
|
49
|
-
const pid = `${process.pid}`;
|
|
50
|
-
const log = scraperLog.extend(pid);
|
|
51
|
-
const rLog = resourceLog.extend(pid);
|
|
52
|
-
const LAUNCH_BROWSER_TIMEOUT = 1000 * 30;
|
|
53
|
-
let Scraper = (() => {
|
|
54
|
-
let _classSuper = TypedAwaitEventEmitter;
|
|
55
|
-
let _instanceExtraInitializers = [];
|
|
56
|
-
let _private_bootBrowser_decorators;
|
|
57
|
-
let _private_bootBrowser_descriptor;
|
|
58
|
-
let _private_createPage_decorators;
|
|
59
|
-
let _private_createPage_descriptor;
|
|
60
|
-
let _private_fetchData_decorators;
|
|
61
|
-
let _private_fetchData_descriptor;
|
|
62
|
-
let _private_fetchHead_decorators;
|
|
63
|
-
let _private_fetchHead_descriptor;
|
|
64
|
-
let _private_fetchImages_decorators;
|
|
65
|
-
let _private_fetchImages_descriptor;
|
|
66
|
-
return class Scraper extends _classSuper {
|
|
67
|
-
static {
|
|
68
|
-
const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
|
|
69
|
-
_private_bootBrowser_decorators = [retry()];
|
|
70
|
-
_private_createPage_decorators = [retry()];
|
|
71
|
-
_private_fetchData_decorators = [retry({
|
|
72
|
-
timeout: 1 * 60 * 1000, // 1sec,
|
|
73
|
-
})];
|
|
74
|
-
_private_fetchHead_decorators = [retry()];
|
|
75
|
-
_private_fetchImages_decorators = [retry({
|
|
76
|
-
timeout: 5 * 60 * 1000, // 5sec
|
|
77
|
-
fallback: [],
|
|
78
|
-
})];
|
|
79
|
-
__esDecorate(this, _private_bootBrowser_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
|
|
80
|
-
if (!this.#browser) {
|
|
81
|
-
void this.emit('changePhase', {
|
|
82
|
-
pid: process.pid,
|
|
83
|
-
name: 'launchBrowser',
|
|
84
|
-
url: this.#url,
|
|
85
|
-
isExternal,
|
|
86
|
-
message: executablePath || '(executablePath is default)',
|
|
87
|
-
});
|
|
88
|
-
const browser = await launch({
|
|
89
|
-
headless,
|
|
90
|
-
timeout: LAUNCH_BROWSER_TIMEOUT,
|
|
91
|
-
executablePath: executablePath ?? undefined,
|
|
92
|
-
args: [
|
|
93
|
-
// TODO: Optional lang
|
|
94
|
-
'--lang=ja',
|
|
95
|
-
'--no-zygote',
|
|
96
|
-
'--ignore-certificate-errors',
|
|
97
|
-
],
|
|
98
|
-
}).catch((error) => {
|
|
99
|
-
if (error instanceof Error) {
|
|
100
|
-
return error;
|
|
101
|
-
}
|
|
102
|
-
throw error;
|
|
103
|
-
});
|
|
104
|
-
if (browser instanceof Error) {
|
|
105
|
-
void this.emit('error', {
|
|
106
|
-
pid: process.pid,
|
|
107
|
-
url: this.#url,
|
|
108
|
-
shutdown: false,
|
|
109
|
-
error: browser,
|
|
110
|
-
});
|
|
111
|
-
throw browser;
|
|
112
|
-
}
|
|
113
|
-
this.#browser = browser;
|
|
114
|
-
}
|
|
115
|
-
else if (!this.#browser.isConnected()) {
|
|
116
|
-
await this.#browser.close();
|
|
117
|
-
}
|
|
118
|
-
return this.#browser;
|
|
119
|
-
}, "#bootBrowser") }, _private_bootBrowser_decorators, { kind: "method", name: "#bootBrowser", static: false, private: true, access: { has: obj => #bootBrowser in obj, get: obj => obj.#bootBrowser }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
120
|
-
__esDecorate(this, _private_createPage_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
|
|
121
|
-
const browser = await this.#bootBrowser(isExternal, executablePath, headless);
|
|
122
|
-
void this.emit('changePhase', {
|
|
123
|
-
pid: process.pid,
|
|
124
|
-
name: 'newPage',
|
|
125
|
-
url: this.#url,
|
|
126
|
-
isExternal,
|
|
127
|
-
message: '',
|
|
128
|
-
});
|
|
129
|
-
const page = await browser.newPage();
|
|
130
|
-
page.setDefaultNavigationTimeout(0);
|
|
131
|
-
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36');
|
|
132
|
-
await page.setExtraHTTPHeaders({
|
|
133
|
-
// TODO: Optional lang
|
|
134
|
-
'Accept-Language': 'ja-JP',
|
|
135
|
-
});
|
|
136
|
-
return page;
|
|
137
|
-
}, "#createPage") }, _private_createPage_decorators, { kind: "method", name: "#createPage", static: false, private: true, access: { has: obj => #createPage in obj, get: obj => obj.#createPage }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
138
|
-
__esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, isGettingImages, options) {
|
|
139
|
-
const networkLogs = {};
|
|
140
|
-
page.on('dialog', async (dialog) => {
|
|
141
|
-
log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
|
|
142
|
-
try {
|
|
143
|
-
await dialog.accept();
|
|
144
|
-
}
|
|
145
|
-
catch (error) {
|
|
146
|
-
log(`Error: ${error}`);
|
|
147
|
-
}
|
|
148
|
-
log(`Accept ${dialog.type()} dialog`);
|
|
149
|
-
});
|
|
150
|
-
if (!isExternal) {
|
|
151
|
-
page.on('request', (request) => {
|
|
152
|
-
const url = parseUrl(request.url(), options);
|
|
153
|
-
networkLogs[request.url()] = {
|
|
154
|
-
url,
|
|
155
|
-
status: null,
|
|
156
|
-
contentLength: 0,
|
|
157
|
-
contentType: '',
|
|
158
|
-
isError: false,
|
|
159
|
-
request: {
|
|
160
|
-
ts: Date.now(),
|
|
161
|
-
headers: request.headers(),
|
|
162
|
-
method: request.method(),
|
|
163
|
-
},
|
|
164
|
-
};
|
|
165
|
-
});
|
|
166
|
-
const uniqueRes = new Set();
|
|
167
|
-
page.on('response', (response) => {
|
|
168
|
-
const resURL = parseUrl(response.url(), options);
|
|
169
|
-
if (uniqueRes.has(resURL.withoutHash)) {
|
|
170
|
-
return;
|
|
171
|
-
}
|
|
172
|
-
if (resURL.withoutHash === url.withoutHash) {
|
|
173
|
-
return;
|
|
174
|
-
}
|
|
175
|
-
uniqueRes.add(resURL.withoutHash);
|
|
176
|
-
const headers = response.headers();
|
|
177
|
-
const status = response.status();
|
|
178
|
-
const statusText = response.statusText();
|
|
179
|
-
const contentType = headers['content-type']?.split(';')[0] || null;
|
|
180
|
-
const contentLength = Number.parseInt(headers['content-length'] ?? '', 10) || null;
|
|
181
|
-
const request = networkLogs[resURL.withoutHash];
|
|
182
|
-
const log = {
|
|
183
|
-
...request,
|
|
184
|
-
response: {
|
|
185
|
-
ts: Date.now(),
|
|
186
|
-
status,
|
|
187
|
-
statusText,
|
|
188
|
-
fromCache: response.fromCache(),
|
|
189
|
-
headers,
|
|
190
|
-
},
|
|
191
|
-
status,
|
|
192
|
-
isError: isError(status),
|
|
193
|
-
contentType: contentType || '',
|
|
194
|
-
contentLength: contentLength || 0,
|
|
195
|
-
};
|
|
196
|
-
const referredLink = {
|
|
197
|
-
url: resURL,
|
|
198
|
-
isExternal: resURL.hostname !== url.hostname,
|
|
199
|
-
isError: log.isError,
|
|
200
|
-
status,
|
|
201
|
-
statusText,
|
|
202
|
-
contentType,
|
|
203
|
-
contentLength,
|
|
204
|
-
compress: detectCompress(headers),
|
|
205
|
-
cdn: detectCDN(headers),
|
|
206
|
-
headers: headers,
|
|
207
|
-
};
|
|
208
|
-
rLog('Fetched: %s', resURL.href);
|
|
209
|
-
void this.emit('resourceResponse', {
|
|
210
|
-
pid: process.pid,
|
|
211
|
-
url,
|
|
212
|
-
log,
|
|
213
|
-
resource: referredLink,
|
|
214
|
-
});
|
|
215
|
-
});
|
|
216
|
-
}
|
|
217
|
-
void this.emit('changePhase', {
|
|
218
|
-
pid: process.pid,
|
|
219
|
-
name: 'openPage',
|
|
220
|
-
url: this.#url,
|
|
221
|
-
isExternal,
|
|
222
|
-
message: '',
|
|
223
|
-
});
|
|
224
|
-
if (url.username && url.password) {
|
|
225
|
-
await page.setExtraHTTPHeaders({
|
|
226
|
-
Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
|
|
227
|
-
});
|
|
228
|
-
}
|
|
229
|
-
const res = await page.goto(url.withoutHashAndAuth);
|
|
230
|
-
if (!res) {
|
|
231
|
-
throw new Error('The method Page.goto returned null');
|
|
232
|
-
}
|
|
233
|
-
const destUrl = parseUrl(page.url(), options);
|
|
234
|
-
const redirectPaths = res
|
|
235
|
-
.request()
|
|
236
|
-
.redirectChain()
|
|
237
|
-
.map((req) => req.url());
|
|
238
|
-
if (destUrl.withoutHash !== url.withoutHash) {
|
|
239
|
-
redirectPaths.push(destUrl.withoutHash);
|
|
240
|
-
}
|
|
241
|
-
if (destUrl.hostname !== url.hostname) {
|
|
242
|
-
isExternal = true;
|
|
243
|
-
}
|
|
244
|
-
const status = res.status();
|
|
245
|
-
const statusText = res.statusText();
|
|
246
|
-
const responseHeaders = res.headers();
|
|
247
|
-
const contentType = responseHeaders['content-type']?.split(';')[0] || null;
|
|
248
|
-
const _contentLength = Number.parseInt(responseHeaders['content-length'] ?? '');
|
|
249
|
-
const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
|
|
250
|
-
if (contentType !== 'text/html') {
|
|
251
|
-
return {
|
|
252
|
-
url,
|
|
253
|
-
isTarget: false,
|
|
254
|
-
isExternal,
|
|
255
|
-
redirectPaths,
|
|
256
|
-
status,
|
|
257
|
-
statusText,
|
|
258
|
-
contentType,
|
|
259
|
-
contentLength,
|
|
260
|
-
responseHeaders,
|
|
261
|
-
meta: {
|
|
262
|
-
title: '',
|
|
263
|
-
},
|
|
264
|
-
imageList: [],
|
|
265
|
-
anchorList: [],
|
|
266
|
-
html: '',
|
|
267
|
-
isSkipped: false,
|
|
268
|
-
};
|
|
269
|
-
}
|
|
270
|
-
void this.emit('changePhase', {
|
|
271
|
-
pid: process.pid,
|
|
272
|
-
name: 'loadDOMContent',
|
|
273
|
-
url: this.#url,
|
|
274
|
-
isExternal,
|
|
275
|
-
message: '',
|
|
276
|
-
});
|
|
277
|
-
await page
|
|
278
|
-
.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 })
|
|
279
|
-
.catch(() => { });
|
|
280
|
-
void this.emit('changePhase', {
|
|
281
|
-
pid: process.pid,
|
|
282
|
-
name: 'getHTML',
|
|
283
|
-
url: this.#url,
|
|
284
|
-
isExternal,
|
|
285
|
-
message: '',
|
|
286
|
-
});
|
|
287
|
-
const { title, html } = await page.evaluate(() => {
|
|
288
|
-
/* global document */
|
|
289
|
-
return {
|
|
290
|
-
title: document.title,
|
|
291
|
-
html: document.documentElement.outerHTML,
|
|
292
|
-
};
|
|
293
|
-
});
|
|
294
|
-
if (isExternal) {
|
|
295
|
-
return {
|
|
296
|
-
url,
|
|
297
|
-
isTarget: false,
|
|
298
|
-
isExternal,
|
|
299
|
-
redirectPaths,
|
|
300
|
-
status,
|
|
301
|
-
statusText,
|
|
302
|
-
contentType,
|
|
303
|
-
contentLength,
|
|
304
|
-
responseHeaders,
|
|
305
|
-
meta: {
|
|
306
|
-
title,
|
|
307
|
-
},
|
|
308
|
-
imageList: [],
|
|
309
|
-
anchorList: [],
|
|
310
|
-
html,
|
|
311
|
-
isSkipped: false,
|
|
312
|
-
};
|
|
313
|
-
}
|
|
314
|
-
void this.emit('changePhase', {
|
|
315
|
-
pid: process.pid,
|
|
316
|
-
name: 'waitNetworkIdleZero',
|
|
317
|
-
url: this.#url,
|
|
318
|
-
isExternal,
|
|
319
|
-
message: '',
|
|
320
|
-
});
|
|
321
|
-
await page
|
|
322
|
-
.waitForNavigation({ waitUntil: 'networkidle0', timeout: 5000 })
|
|
323
|
-
.catch(() => { });
|
|
324
|
-
void this.emit('changePhase', {
|
|
325
|
-
pid: process.pid,
|
|
326
|
-
name: 'getAnchors',
|
|
327
|
-
url: this.#url,
|
|
328
|
-
isExternal,
|
|
329
|
-
message: '',
|
|
330
|
-
});
|
|
331
|
-
const anchorList = await getAnchorList(page, options);
|
|
332
|
-
void this.emit('changePhase', {
|
|
333
|
-
pid: process.pid,
|
|
334
|
-
name: 'getMeta',
|
|
335
|
-
url: this.#url,
|
|
336
|
-
isExternal,
|
|
337
|
-
message: '',
|
|
338
|
-
});
|
|
339
|
-
const meta = await getMeta(page);
|
|
340
|
-
const imageList = isGettingImages ? await this.#fetchImages(page, isExternal) : [];
|
|
341
|
-
return {
|
|
342
|
-
url,
|
|
343
|
-
isTarget: true,
|
|
344
|
-
isExternal,
|
|
345
|
-
redirectPaths,
|
|
346
|
-
status,
|
|
347
|
-
statusText,
|
|
348
|
-
contentType,
|
|
349
|
-
contentLength,
|
|
350
|
-
responseHeaders,
|
|
351
|
-
meta,
|
|
352
|
-
anchorList,
|
|
353
|
-
imageList,
|
|
354
|
-
html,
|
|
355
|
-
isSkipped: false,
|
|
356
|
-
};
|
|
357
|
-
}, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
358
|
-
__esDecorate(this, _private_fetchHead_descriptor = { value: __setFunctionName(async function (url, isExternal) {
|
|
359
|
-
return await fetchDestination(url, isExternal);
|
|
360
|
-
}, "#fetchHead") }, _private_fetchHead_decorators, { kind: "method", name: "#fetchHead", static: false, private: true, access: { has: obj => #fetchHead in obj, get: obj => obj.#fetchHead }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
361
|
-
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, isExternal) {
|
|
362
|
-
const url = this.#url.withoutHashAndAuth;
|
|
363
|
-
const imageList = [];
|
|
364
|
-
const devices = [
|
|
365
|
-
{ name: 'desktop', width: 1280 },
|
|
366
|
-
{ name: 'mobile', width: 320, resolution: 2 },
|
|
367
|
-
];
|
|
368
|
-
for (const device of devices) {
|
|
369
|
-
void this.emit('changePhase', {
|
|
370
|
-
pid: process.pid,
|
|
371
|
-
name: 'setViewport',
|
|
372
|
-
url: this.#url,
|
|
373
|
-
isExternal,
|
|
374
|
-
message: device.name,
|
|
375
|
-
});
|
|
376
|
-
await beforePageScan(page, url, {
|
|
377
|
-
name: device.name,
|
|
378
|
-
width: device.width,
|
|
379
|
-
resolution: device.resolution,
|
|
380
|
-
timeout: 5000,
|
|
381
|
-
});
|
|
382
|
-
void this.emit('changePhase', {
|
|
383
|
-
pid: process.pid,
|
|
384
|
-
name: 'getImages',
|
|
385
|
-
url: this.#url,
|
|
386
|
-
isExternal,
|
|
387
|
-
message: device.name,
|
|
388
|
-
});
|
|
389
|
-
const images = await getImageList(page, device.width);
|
|
390
|
-
imageList.push(...images);
|
|
391
|
-
}
|
|
392
|
-
return imageList;
|
|
393
|
-
}, "#fetchImages") }, _private_fetchImages_decorators, { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: obj => #fetchImages in obj, get: obj => obj.#fetchImages }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
394
|
-
if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
|
|
395
|
-
}
|
|
396
|
-
#browser = (__runInitializers(this, _instanceExtraInitializers), null);
|
|
397
|
-
#url = null;
|
|
398
|
-
async destroy(isExternal) {
|
|
399
|
-
log('Scraper destroys self');
|
|
400
|
-
if (!this.#url) {
|
|
401
|
-
throw new Error('The instance is already destroyed.');
|
|
402
|
-
}
|
|
403
|
-
if (!this.#browser) {
|
|
404
|
-
void this.emit('destroyed', {
|
|
405
|
-
pid: process.pid,
|
|
406
|
-
});
|
|
407
|
-
void this.emit('changePhase', {
|
|
408
|
-
pid: process.pid,
|
|
409
|
-
name: 'destroyed',
|
|
410
|
-
url: this.#url,
|
|
411
|
-
isExternal,
|
|
412
|
-
message: '',
|
|
413
|
-
});
|
|
414
|
-
return;
|
|
415
|
-
}
|
|
416
|
-
while (!this.#browser.isConnected()) {
|
|
417
|
-
log('Browser closes all pages');
|
|
418
|
-
const pages = await this.#browser.pages();
|
|
419
|
-
for (const page of pages) {
|
|
420
|
-
page.removeAllListeners();
|
|
421
|
-
if (!page.isClosed) {
|
|
422
|
-
await page.close();
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
log('Browser closes self');
|
|
426
|
-
await this.#browser.close();
|
|
427
|
-
log('Browser disconnects');
|
|
428
|
-
await this.#browser.disconnect();
|
|
429
|
-
}
|
|
430
|
-
log('Scraper discards browser');
|
|
431
|
-
this.#browser = null;
|
|
432
|
-
void this.emit('destroyed', {
|
|
433
|
-
pid: process.pid,
|
|
434
|
-
});
|
|
435
|
-
void this.emit('changePhase', {
|
|
436
|
-
pid: process.pid,
|
|
437
|
-
name: 'destroyed',
|
|
438
|
-
url: this.#url,
|
|
439
|
-
isExternal,
|
|
440
|
-
message: '',
|
|
441
|
-
});
|
|
442
|
-
}
|
|
443
|
-
async scrapeStart(url, options, isSkip = false) {
|
|
444
|
-
const isExternal = options?.isExternal ?? false;
|
|
445
|
-
const isGettingImages = options?.isGettingImages ?? true;
|
|
446
|
-
const excludeKeywords = options?.excludeKeywords ?? [];
|
|
447
|
-
const executablePath = options?.executablePath ?? null;
|
|
448
|
-
const isTitleOnly = options?.isTitleOnly ?? false;
|
|
449
|
-
this.#url = url;
|
|
450
|
-
void this.emit('changePhase', {
|
|
451
|
-
pid: process.pid,
|
|
452
|
-
name: 'scrapeStart',
|
|
453
|
-
url: this.#url,
|
|
454
|
-
isExternal,
|
|
455
|
-
message: '',
|
|
456
|
-
});
|
|
457
|
-
if (isSkip) {
|
|
458
|
-
void this.emit('ignoreAndSkip', {
|
|
459
|
-
pid: process.pid,
|
|
460
|
-
url: this.#url,
|
|
461
|
-
reason: {
|
|
462
|
-
matchedText: this.#url.pathname || '',
|
|
463
|
-
excludeKeywords,
|
|
464
|
-
},
|
|
465
|
-
});
|
|
466
|
-
void this.emit('changePhase', {
|
|
467
|
-
pid: process.pid,
|
|
468
|
-
name: 'ignoreAndSkip',
|
|
469
|
-
url: this.#url,
|
|
470
|
-
isExternal,
|
|
471
|
-
message: 'Matched: excluded path',
|
|
472
|
-
});
|
|
473
|
-
return;
|
|
474
|
-
}
|
|
475
|
-
if (!this.#url.isHTTP) {
|
|
476
|
-
const result = {
|
|
477
|
-
url: this.#url,
|
|
478
|
-
isTarget: false,
|
|
479
|
-
isExternal,
|
|
480
|
-
redirectPaths: [],
|
|
481
|
-
status: -1,
|
|
482
|
-
statusText: '__THIS_IS_NOT_HTTP_PROTOCOL__',
|
|
483
|
-
contentType: null,
|
|
484
|
-
contentLength: null,
|
|
485
|
-
responseHeaders: {},
|
|
486
|
-
meta: {
|
|
487
|
-
title: '',
|
|
488
|
-
},
|
|
489
|
-
imageList: [],
|
|
490
|
-
anchorList: [],
|
|
491
|
-
html: '',
|
|
492
|
-
isSkipped: false,
|
|
493
|
-
};
|
|
494
|
-
void this.emit('scrapeEnd', {
|
|
495
|
-
pid: process.pid,
|
|
496
|
-
url: this.#url,
|
|
497
|
-
timestamp: Date.now(),
|
|
498
|
-
result,
|
|
499
|
-
});
|
|
500
|
-
void this.emit('changePhase', {
|
|
501
|
-
pid: process.pid,
|
|
502
|
-
name: 'scrapeEnd',
|
|
503
|
-
url: this.#url,
|
|
504
|
-
isExternal,
|
|
505
|
-
message: '',
|
|
506
|
-
});
|
|
507
|
-
return;
|
|
508
|
-
}
|
|
509
|
-
void this.emit('changePhase', {
|
|
510
|
-
pid: process.pid,
|
|
511
|
-
name: 'touchHead',
|
|
512
|
-
url: this.#url,
|
|
513
|
-
isExternal,
|
|
514
|
-
message: '',
|
|
515
|
-
});
|
|
516
|
-
let result = await this.#fetchHead(url, isExternal);
|
|
517
|
-
if (result instanceof Error) {
|
|
518
|
-
log('Error(FETCH_HEAD): %s', url.href);
|
|
519
|
-
void this.emit('error', {
|
|
520
|
-
pid: process.pid,
|
|
521
|
-
url: this.#url,
|
|
522
|
-
shutdown: false,
|
|
523
|
-
error: result,
|
|
524
|
-
});
|
|
525
|
-
result = null;
|
|
526
|
-
}
|
|
527
|
-
if (result && isTitleOnly) {
|
|
528
|
-
void this.emit('scrapeEnd', {
|
|
529
|
-
pid: process.pid,
|
|
530
|
-
url: this.#url,
|
|
531
|
-
timestamp: Date.now(),
|
|
532
|
-
result: {
|
|
533
|
-
...result,
|
|
534
|
-
isTarget: false,
|
|
535
|
-
},
|
|
536
|
-
});
|
|
537
|
-
return;
|
|
538
|
-
}
|
|
539
|
-
if (result === null || result.contentType === 'text/html') {
|
|
540
|
-
const headlessMode = url.isSecure ? true : 'shell';
|
|
541
|
-
const page = await this.#createPage(isExternal, executablePath, headlessMode);
|
|
542
|
-
result = await this.#fetchData(page, url, isExternal, isGettingImages, options).catch((error) => {
|
|
543
|
-
if (error instanceof Error) {
|
|
544
|
-
return error;
|
|
545
|
-
}
|
|
546
|
-
return new Error(error);
|
|
547
|
-
});
|
|
548
|
-
if (result instanceof Error) {
|
|
549
|
-
log('Error(FETCH_DATA): %s', url.href);
|
|
550
|
-
void this.emit('error', {
|
|
551
|
-
pid: process.pid,
|
|
552
|
-
url: this.#url,
|
|
553
|
-
shutdown: true,
|
|
554
|
-
error: result,
|
|
555
|
-
});
|
|
556
|
-
await this.destroy(isExternal);
|
|
557
|
-
return;
|
|
558
|
-
}
|
|
559
|
-
page.removeAllListeners();
|
|
560
|
-
if (!page.isClosed) {
|
|
561
|
-
await page.close();
|
|
562
|
-
}
|
|
563
|
-
if (!result.isSkipped) {
|
|
564
|
-
const checkedKeyword = keywordCheck(result.html, excludeKeywords);
|
|
565
|
-
if (checkedKeyword) {
|
|
566
|
-
result = {
|
|
567
|
-
url,
|
|
568
|
-
isSkipped: true,
|
|
569
|
-
matched: {
|
|
570
|
-
type: 'keyword',
|
|
571
|
-
text: checkedKeyword,
|
|
572
|
-
excludeKeywords,
|
|
573
|
-
},
|
|
574
|
-
};
|
|
575
|
-
}
|
|
576
|
-
}
|
|
577
|
-
if (result.isSkipped) {
|
|
578
|
-
if (result.matched.type === 'path') {
|
|
579
|
-
return;
|
|
580
|
-
}
|
|
581
|
-
void this.emit('ignoreAndSkip', {
|
|
582
|
-
pid: process.pid,
|
|
583
|
-
url: this.#url,
|
|
584
|
-
reason: {
|
|
585
|
-
matchedText: result.matched.text,
|
|
586
|
-
excludeKeywords,
|
|
587
|
-
},
|
|
588
|
-
});
|
|
589
|
-
void this.emit('changePhase', {
|
|
590
|
-
pid: process.pid,
|
|
591
|
-
name: 'ignoreAndSkip',
|
|
592
|
-
url: this.#url,
|
|
593
|
-
isExternal,
|
|
594
|
-
message: `Matched: "${result.matched.text}"`,
|
|
595
|
-
});
|
|
596
|
-
return;
|
|
597
|
-
}
|
|
598
|
-
}
|
|
599
|
-
void this.emit('scrapeEnd', {
|
|
600
|
-
pid: process.pid,
|
|
601
|
-
url: this.#url,
|
|
602
|
-
timestamp: Date.now(),
|
|
603
|
-
result,
|
|
604
|
-
});
|
|
605
|
-
void this.emit('changePhase', {
|
|
606
|
-
pid: process.pid,
|
|
607
|
-
name: 'scrapeEnd',
|
|
608
|
-
url: this.#url,
|
|
609
|
-
isExternal,
|
|
610
|
-
message: '',
|
|
611
|
-
});
|
|
612
|
-
return result;
|
|
613
|
-
}
|
|
614
|
-
get #bootBrowser() { return _private_bootBrowser_descriptor.value; }
|
|
615
|
-
get #createPage() { return _private_createPage_descriptor.value; }
|
|
616
|
-
get #fetchData() { return _private_fetchData_descriptor.value; }
|
|
617
|
-
get #fetchHead() { return _private_fetchHead_descriptor.value; }
|
|
618
|
-
get #fetchImages() { return _private_fetchImages_descriptor.value; }
|
|
619
|
-
};
|
|
620
|
-
})();
|
|
621
|
-
export default Scraper;
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import type { ScraperOptions } from './scraper.js';
|
|
2
|
-
import type { SubProcessRunnerEventTypes, ExURL } from './types.js';
|
|
3
|
-
import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
4
|
-
export default class SubProcessRunner extends TypedAwaitEventEmitter<SubProcessRunnerEventTypes> {
|
|
5
|
-
#private;
|
|
6
|
-
get state(): "waiting" | "running";
|
|
7
|
-
constructor(resetTime: number);
|
|
8
|
-
destory(): void;
|
|
9
|
-
getUndeadPid(): number[];
|
|
10
|
-
kill(): void;
|
|
11
|
-
start(url: ExURL, options: ScraperOptions, isSkip: boolean, interval: number): void;
|
|
12
|
-
}
|