@d-zero/beholder 0.1.29 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CHANGELOG.md +11 -0
  2. package/README.md +172 -477
  3. package/dist/debug.d.ts +4 -1
  4. package/dist/debug.js +5 -2
  5. package/dist/dom-evaluation.d.ts +72 -14
  6. package/dist/dom-evaluation.js +169 -43
  7. package/dist/index.d.ts +20 -3
  8. package/dist/index.js +15 -3
  9. package/dist/is-error.d.ts +8 -0
  10. package/dist/is-error.js +10 -0
  11. package/dist/keyword-check.d.ts +5 -3
  12. package/dist/keyword-check.js +5 -3
  13. package/dist/parse-url.d.ts +14 -0
  14. package/dist/parse-url.js +23 -0
  15. package/dist/scraper.d.ts +39 -13
  16. package/dist/scraper.js +300 -263
  17. package/dist/types.d.ts +286 -214
  18. package/dist/types.js +6 -0
  19. package/package.json +7 -10
  20. package/src/debug.ts +5 -2
  21. package/src/dom-evaluation.ts +195 -65
  22. package/src/index.ts +27 -3
  23. package/src/is-error.spec.ts +33 -0
  24. package/src/is-error.ts +10 -0
  25. package/src/keyword-check.spec.ts +45 -4
  26. package/src/keyword-check.ts +5 -3
  27. package/src/parse-url.spec.ts +35 -0
  28. package/src/parse-url.ts +26 -0
  29. package/src/scraper.ts +338 -300
  30. package/src/types.ts +345 -258
  31. package/tsconfig.tsbuildinfo +1 -1
  32. package/dist/events.d.ts +0 -32
  33. package/dist/events.js +0 -15
  34. package/dist/fetch-destination.d.ts +0 -8
  35. package/dist/fetch-destination.js +0 -145
  36. package/dist/net-timeout-error.d.ts +0 -3
  37. package/dist/net-timeout-error.js +0 -3
  38. package/dist/sub-process-runner.d.ts +0 -12
  39. package/dist/sub-process-runner.js +0 -180
  40. package/dist/sub-process.d.ts +0 -1
  41. package/dist/sub-process.js +0 -67
  42. package/dist/utils.d.ts +0 -16
  43. package/dist/utils.js +0 -69
  44. package/src/events.ts +0 -21
  45. package/src/fetch-destination.ts +0 -173
  46. package/src/net-timeout-error.ts +0 -3
  47. package/src/sub-process-runner.ts +0 -220
  48. package/src/sub-process.ts +0 -86
  49. package/src/utils.ts +0 -89
@@ -1,180 +0,0 @@
1
- import childProcess from 'node:child_process';
2
- import path from 'node:path';
3
- import { delay } from '@d-zero/shared/delay';
4
- import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
5
- import { isType } from 'typescript-fsa';
6
- import { scraperLog } from './debug.js';
7
- import { scraperEvent, subProcessEvent } from './events.js';
8
// NOTE(review): `URL#pathname` stays percent-encoded, so this path is wrong when
// the package lives under a directory containing spaces or non-ASCII characters.
// `fileURLToPath` from `node:url` is the robust conversion — adopting it needs a
// new import, so it is only flagged here.
const __filename = new globalThis.URL(import.meta.url).pathname;
const __dirname = path.dirname(__filename);
const SUB_PROCESS_PATH = path.resolve(__dirname, 'sub-process');

/**
 * Drives a forked `sub-process` module: sends it scrape jobs over IPC, relays
 * the actions it reports back as events, and recycles the child after a fixed
 * number of finished scrapes.
 *
 * Events emitted by this class: `changePhase`, `scrapeEvent`, `reset`, `error`.
 */
export default class SubProcessRunner extends TypedAwaitEventEmitter {
	/** Number of finished scrapes after which the child process is destroyed. */
	#resetTime;
	/** Scrapes finished by the current child process. */
	#scrapedTimes = 0;
	/** Either `'waiting'` or `'running'`. */
	#state = 'waiting';
	/** The forked child process, or `null` when none is alive. */
	#subProcess = null;
	/** PIDs of children that were released without having been killed. */
	#undeadPid = new Set();

	/** Current runner state: `'waiting'` or `'running'`. */
	get state() {
		return this.#state;
	}

	/**
	 * @param resetTime Number of finished scrapes after which the child is recycled.
	 */
	constructor(resetTime) {
		super();
		this.#resetTime = resetTime;
	}

	/**
	 * Correctly spelled alias of {@link SubProcessRunner#destory}.
	 */
	destroy() {
		this.destory();
	}

	/**
	 * Asks the child process to destroy itself, announces the reset and drops
	 * the reference to it. Safe to call when no child exists.
	 *
	 * The misspelled name is kept because it is the published method name.
	 */
	destory() {
		const subProcess = this.#subProcess;
		if (subProcess) {
			const { pid } = subProcess;
			scraperLog('Destroys child_process (%d)', pid);
			subProcess.send(subProcessEvent.destroy());
			if (pid) {
				void this.emit('reset', {
					pid,
				});
				void this.emit('changePhase', {
					pid,
					name: 'reset',
					url: null,
					isExternal: false,
					message: 'Reseting sub-process',
				});
			}
		}
		this.#destroyed();
	}

	/**
	 * @returns A copy of the PIDs that were released while not yet killed.
	 */
	getUndeadPid() {
		return [...this.#undeadPid];
	}

	/**
	 * Sends SIGKILL to the child process unless it is absent or already killed.
	 */
	kill() {
		const subProcess = this.#subProcess;
		if (!subProcess) {
			return;
		}
		if (subProcess.killed) {
			scraperLog('child_process(%d) is already killed', subProcess.pid);
			return;
		}
		scraperLog('Kills(SIGKILL) child_process (%d) ', subProcess.pid);
		subProcess.kill('SIGKILL');
	}

	/**
	 * Starts a scrape job without waiting for it to finish.
	 *
	 * @param url Scrape target.
	 * @param options Scrape options forwarded to the child process.
	 * @param isSkip Forwarded to the child process as `isSkip`.
	 * @param interval Milliseconds to wait before the job is sent.
	 * @throws {Error} When a job is already running.
	 */
	start(url, options, isSkip, interval) {
		if (this.#state === 'running') {
			throw new Error(`Sub Routine (PID: ${this.#subProcess?.pid}) is already running`);
		}
		void this.#scrape(url, options, isSkip, interval);
	}

	/**
	 * Forks the child process and wires its IPC messages, disconnects and
	 * errors to this emitter.
	 */
	#createSubProcess(url) {
		const subProcess = childProcess.fork(SUB_PROCESS_PATH, {
			detached: false,
		});

		subProcess.on('message', (action) => {
			if (isType(action, scraperEvent.changePhase)) {
				void this.emit('changePhase', action.payload);
			}
			// Both outcomes count as one finished scrape.
			if (
				isType(action, scraperEvent.ignoreAndSkip) ||
				isType(action, scraperEvent.scrapeEnd)
			) {
				this.#finished();
			}
			if (isType(action, scraperEvent.destroyed)) {
				this.#destroyed();
			}
			if (
				isType(action, scraperEvent.ignoreAndSkip) ||
				isType(action, scraperEvent.resourceResponse) ||
				isType(action, scraperEvent.scrapeEnd) ||
				isType(action, scraperEvent.destroyed)
			) {
				void this.emit('scrapeEvent', action);
			}
			if (isType(action, scraperEvent.error)) {
				// The child sends the error as a plain object; rebuild an Error from it.
				const { pid, url: failedUrl, shutdown, error: serialized } = action.payload;
				const error = new Error(serialized.message);
				error.name = serialized.name;
				error.stack = serialized.stack;
				void this.emit('scrapeEvent', {
					type: action.type,
					payload: {
						pid,
						url: failedUrl,
						shutdown,
						error,
					},
				});
			}
		});

		subProcess.on('disconnect', () => {
			if (subProcess.killed) {
				scraperLog('child_process(%d) is disconnected and killed', subProcess.pid);
				return;
			}
			scraperLog('child_process(%d) is disconnected but not killed', subProcess.pid);
			scraperLog('Retries to kill(SIGTERM) child_process(%d)', subProcess.pid);
			subProcess.kill('SIGTERM');
			void this.emit('changePhase', {
				pid: subProcess.pid,
				name: 'disconnect',
				url: null,
				isExternal: false,
				message: 'Disconnecting sub-process',
			});
		});

		subProcess.on('error', (e) => {
			void this.emit('error', {
				pid: subProcess.pid,
				url,
				shutdown: true,
				error: e instanceof Error ? e : new Error(`${e}`),
			});
		});

		return subProcess;
	}

	/**
	 * Forgets the current child process, remembering its PID when it has not
	 * been killed, and returns the runner to the waiting state.
	 */
	#destroyed() {
		const subProcess = this.#subProcess;
		if (subProcess && !subProcess.killed && subProcess.pid) {
			scraperLog('Add child_process(%d) to the undead PID list', subProcess.pid);
			this.#undeadPid.add(subProcess.pid);
		}
		this.#subProcess = null;
		this.#scrapedTimes = 0;
		this.#finally();
	}

	/**
	 * Returns the live child process, forking one (and announcing the boot
	 * phase) when none exists.
	 */
	#ensureSubProcess(url, options) {
		if (!this.#subProcess) {
			this.#subProcess = this.#createSubProcess(url);
			void this.emit('changePhase', {
				pid: this.#subProcess.pid,
				name: 'boot',
				url,
				isExternal: options.isExternal,
				message: 'Booting sub-process',
			});
		}
		return this.#subProcess;
	}

	/** Marks the runner as ready for the next job. */
	#finally() {
		this.#state = 'waiting';
	}

	/**
	 * Counts one finished scrape and recycles the child process once the
	 * configured limit is reached.
	 */
	#finished() {
		this.#scrapedTimes += 1;
		if (this.#scrapedTimes >= this.#resetTime) {
			this.destory();
			return;
		}
		this.#finally();
	}

	/**
	 * Boots the child process if needed, waits `interval` milliseconds and
	 * sends the start action.
	 */
	async #scrape(url, options, isSkip, interval) {
		this.#ensureSubProcess(url, options);
		this.#state = 'running';

		const waitMs = Math.max(interval, 0);
		if (waitMs) {
			await delay(waitMs);
		}

		// destory() may have run during the wait and dropped the child; the
		// original dereferenced null here and rejected a voided promise.
		const subProcess = this.#ensureSubProcess(url, options);
		this.#state = 'running';

		subProcess.send(
			subProcessEvent.start({
				url,
				isExternal: options.isExternal,
				isGettingImages: options.isGettingImages,
				excludeKeywords: options.excludeKeywords,
				executablePath: options.executablePath,
				disableQueries: options.disableQueries ?? false,
				isSkip,
				isTitleOnly: options.isTitleOnly,
				screenshot: options.screenshot,
			}),
		);
	}
}
@@ -1 +0,0 @@
1
- export {};
@@ -1,67 +0,0 @@
1
- // Run on child_process
2
- import { isType } from 'typescript-fsa';
3
- import { scraperLog } from './debug.js';
4
- import { scraperEvent, subProcessEvent } from './events.js';
5
- import Scraper from './scraper.js';
6
const log = scraperLog.extend(`${process.pid}`);
process.title = 'beholder';
const scraper = new Scraper();

/**
 * Builds a scraper-event listener that turns its context into an action and
 * sends it to the parent process, but only while the IPC channel is connected.
 *
 * @param toAction Creates the action to send from the event context.
 */
const relayToParent = (toAction) => (context) => {
	if (!process.connected) {
		return;
	}
	process.send(toAction(context));
};

scraper.on(
	'resourceResponse',
	relayToParent((context) => scraperEvent.resourceResponse(context)),
);
scraper.on(
	'ignoreAndSkip',
	relayToParent((context) => scraperEvent.ignoreAndSkip(context)),
);
scraper.on(
	'scrapeEnd',
	relayToParent((context) => scraperEvent.scrapeEnd(context)),
);
scraper.on(
	'error',
	relayToParent((context) => {
		// Flatten the error to a plain object before it is sent over IPC.
		const { name, message, stack } = context.error;
		return scraperEvent.error({
			...context,
			error: { name, message, stack },
		});
	}),
);
scraper.on(
	'changePhase',
	relayToParent((context) => scraperEvent.changePhase(context)),
);

// Actions received from the parent process.
process.on('message', async (action) => {
	if (isType(action, subProcessEvent.start)) {
		const {
			url,
			isExternal,
			isGettingImages,
			excludeKeywords,
			executablePath,
			disableQueries,
			isTitleOnly,
			screenshot,
			isSkip,
		} = action.payload;
		void scraper.scrapeStart(
			url,
			{
				isExternal,
				isGettingImages,
				excludeKeywords,
				executablePath,
				disableQueries,
				isTitleOnly,
				screenshot,
			},
			isSkip,
		);
	}
	if (isType(action, subProcessEvent.destroy)) {
		await scraper.destroy(false);
	}
});

// Report the teardown to the parent, then close the IPC channel.
scraper.on('destroyed', (context) => {
	if (!process.connected) {
		return;
	}
	process.send(scraperEvent.destroyed(context));
	log('disconnects process');
	process.disconnect();
});

process.on('disconnect', () => {
	log('Process is disconnected');
});
package/dist/utils.d.ts DELETED
@@ -1,16 +0,0 @@
1
- import type { CDNType, CompressType } from './types.js';
2
/**
 * Tells whether an HTTP status code is treated as an error.
 *
 * @param status HTTP status code.
 * @returns `true` when `status` is outside the range 200–399.
 */
export declare function isError(status: number): boolean;
7
/**
 * Detects the compression scheme named by the `content-encoding` header.
 *
 * @param headers Response headers keyed by header name.
 * @returns The detected compression type, or `false` when none is recognised.
 */
export declare function detectCompress(headers: Record<string, string | string[] | undefined>): false | CompressType;
12
/**
 * Detects which CDN served a response from its headers.
 *
 * @param headers Response headers keyed by header name.
 * @returns The detected CDN name, or `false` when none is recognised.
 */
export declare function detectCDN(headers: Record<string, string | string[] | undefined>): false | CDNType;
package/dist/utils.js DELETED
@@ -1,69 +0,0 @@
1
/**
 * Tells whether an HTTP status code is treated as an error.
 *
 * @param status HTTP status code.
 * @returns `true` unless `status` lies in the range 200–399.
 */
export function isError(status) {
	const isSuccessOrRedirect = status >= 200 && status < 400;
	return !isSuccessOrRedirect;
}
8
/**
 * Compression scheme names, in the order they are tested. The first name
 * found in the header value wins.
 */
// cspell:disable-next
const COMPRESS_TYPES = ['gzip', 'br', 'compress', 'deflate', 'sdch', 'vcdiff', 'xdelta'];

/**
 * Detects the compression scheme named by the `content-encoding` header.
 *
 * Matching is a case-insensitive substring test, so values such as
 * `x-gzip` or `gzip, br` are recognised. A header given as an array of
 * strings is handled as well (its entries are joined before matching).
 *
 * @param headers Response headers keyed by header name.
 * @returns The first matching scheme name, or `false` when none matches.
 */
export function detectCompress(headers) {
	const raw = headers['content-encoding'];
	let encoding = '';
	if (typeof raw === 'string') {
		encoding = raw;
	} else if (Array.isArray(raw)) {
		// Previously an array-valued header was silently treated as absent.
		encoding = raw.join(', ');
	}
	const normalized = encoding.toLowerCase();
	const detected = COMPRESS_TYPES.find((type) => normalized.includes(type));
	return detected ?? false;
}
45
/**
 * Detects which CDN served a response from its headers.
 *
 * Header names are compared case-insensitively. The original compared them
 * verbatim against mixed-case literals, so header objects with lower-cased
 * names (as Node's `http` module produces) never matched Akamai or IIJ.
 *
 * @param headers Response headers keyed by header name.
 * @returns The CDN name, or `false` when none is recognised.
 */
export function detectCDN(headers) {
	const byLowerName = new Map(
		Object.entries(headers).map(([name, value]) => [name.toLowerCase(), value]),
	);

	if (byLowerName.has('x-akamai-transformed')) {
		return 'Akamai';
	}
	if (byLowerName.has('x-amz-cf-pop')) {
		return 'Amazon CloudFront';
	}
	if (byLowerName.has('x-iij-cache')) {
		return 'IIJ';
	}

	const server = byLowerName.get('server');
	if (typeof server !== 'string') {
		return false;
	}
	if (/cloudflare/i.test(server)) {
		return 'Cloudflare';
	}
	if (/amazons3/i.test(server)) {
		return 'Amazon S3';
	}
	return false;
}
package/src/events.ts DELETED
@@ -1,21 +0,0 @@
1
- import type { ScrapeEventTypes, SubProcessEventTypes } from './types.js';
2
-
3
- import { actionCreatorFactory } from 'typescript-fsa';
4
-
5
/** Creates actions in the `@@sub-process` namespace. */
const createSubProcessAction = actionCreatorFactory('@@sub-process');
/** Creates actions in the `@@scraper` namespace. */
const createScraperAction = actionCreatorFactory('@@scraper');

/**
 * Action creators in the `@@sub-process` namespace.
 */
export const subProcessEvent = {
	start: createSubProcessAction<SubProcessEventTypes['start']>('start'),
	destroy: createSubProcessAction<SubProcessEventTypes['destroy']>('destroy'),
};

/**
 * Action creators in the `@@scraper` namespace.
 */
export const scraperEvent = {
	ignoreAndSkip: createScraperAction<ScrapeEventTypes['ignoreAndSkip']>('ignoreAndSkip'),
	resourceResponse:
		createScraperAction<ScrapeEventTypes['resourceResponse']>('resourceResponse'),
	scrapeEnd: createScraperAction<ScrapeEventTypes['scrapeEnd']>('scrapeEnd'),
	destroyed: createScraperAction<ScrapeEventTypes['destroyed']>('destroyed'),
	error: createScraperAction<ScrapeEventTypes['error']>('error'),
	changePhase: createScraperAction<ScrapeEventTypes['changePhase']>('changePhase'),
};
@@ -1,173 +0,0 @@
1
- import type { PageData, ExURL } from './types.js';
2
- import type { FollowResponse, RedirectableRequest } from 'follow-redirects';
3
- import type { ClientRequest, IncomingMessage, RequestOptions } from 'node:http';
4
-
5
- import { delay } from '@d-zero/shared/delay';
6
- import redirects from 'follow-redirects';
7
-
8
- import NetTimeoutError from './net-timeout-error.js';
9
-
10
- const cacheMap = new Map<string, PageData | Error>();
11
-
12
- /**
13
- *
14
- * @param url
15
- * @param isExternal
16
- * @param method
17
- */
18
- export async function fetchDestination(
19
- url: ExURL,
20
- isExternal: boolean,
21
- method = 'HEAD',
22
- ): Promise<PageData> {
23
- if (cacheMap.has(url.withoutHash)) {
24
- const cache = cacheMap.get(url.withoutHash)!;
25
- if (cache instanceof Error) {
26
- throw cache;
27
- }
28
- return cache;
29
- }
30
-
31
- const result = await Promise.race([
32
- _fetchHead(url, isExternal, method).catch((error) => new Error(error)),
33
- (async () => {
34
- await delay(10 * 1000);
35
- return new NetTimeoutError();
36
- })(),
37
- ]);
38
-
39
- cacheMap.set(url.withoutHash, result);
40
- if (result instanceof Error) {
41
- throw result;
42
- }
43
-
44
- return result;
45
- }
46
-
47
- /**
48
- *
49
- * @param url
50
- * @param isExternal
51
- * @param method
52
- */
53
- async function _fetchHead(url: ExURL, isExternal: boolean, method: string) {
54
- return new Promise<PageData>((resolve, reject) => {
55
- const hostHeader = url.port ? `${url.hostname}:${url.port}` : url.hostname;
56
- const request: RequestOptions = {
57
- protocol: url.protocol,
58
- hostname: url.hostname,
59
- port: url.port || undefined,
60
- path: url.pathname,
61
- method,
62
- headers: {
63
- host: hostHeader,
64
- Connection: 'keep-alive',
65
- Pragma: 'no-cache',
66
- 'Cache-Control': 'no-cache',
67
- 'Upgrade-Insecure-Requests': 1,
68
- // TODO: 'User-Agent': userAgent,
69
- Accept:
70
- 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', // cspell:disable-line
71
- 'Accept-Encoding': 'gzip, deflate',
72
- 'Accept-Language':
73
- 'ja,en;q=0.9,zh;q=0.8,en-US;q=0.7,pl;q=0.6,de;q=0.5,zh-CN;q=0.4,zh-TW;q=0.3,th;q=0.2,ko;q=0.1,fr;q=0.1',
74
- },
75
- };
76
-
77
- if (url.username && url.password) {
78
- request.auth = `${url.username}:${url.password}`;
79
- }
80
-
81
- let req: RedirectableRequest<ClientRequest, IncomingMessage>;
82
- const response = (res: IncomingMessage & FollowResponse) => {
83
- res.on('data', () => {});
84
- res.on('end', async () => {
85
- const redirectPaths = res.redirects.map((r) => r.url);
86
- const _contentLength = Number.parseInt(res.headers['content-length'] || '');
87
- const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
88
- let rep: PageData = {
89
- url,
90
- isTarget: !isExternal,
91
- isExternal,
92
- redirectPaths,
93
- status: res.statusCode || 0,
94
- statusText: res.statusMessage || '',
95
- contentType: res.headers['content-type']?.split(';')[0] || null,
96
- contentLength,
97
- responseHeaders: res.headers,
98
- meta: {
99
- title: '',
100
- },
101
- imageList: [],
102
- anchorList: [],
103
- html: '',
104
- isSkipped: false,
105
- };
106
-
107
- if (rep.status === 405) {
108
- if (method === 'GET') {
109
- reject(`Method Not Allowed: ${url} ${rep.statusText}`);
110
- return;
111
- }
112
- const rr = await fetchDestination(url, isExternal, 'GET').catch(
113
- (error) => error,
114
- );
115
- if (rr) {
116
- rep = rr;
117
- } else {
118
- reject(rr);
119
- }
120
- }
121
-
122
- if (rep.status === 501) {
123
- if (method === 'GET') {
124
- reject(`Method Not Implemented: ${url} ${rep.statusText}`);
125
- return;
126
- }
127
- await delay(5 * 1000);
128
- const rr = await fetchDestination(url, isExternal, 'GET').catch(
129
- (error) => error,
130
- );
131
- if (rr) {
132
- rep = rr;
133
- } else {
134
- reject(rr);
135
- }
136
- }
137
-
138
- if (rep.status === 503) {
139
- if (method === 'GET') {
140
- reject(`Retrying failed: ${url} ${rep.statusText}`);
141
- return;
142
- }
143
- await delay(5 * 1000);
144
- const rr = await fetchDestination(url, isExternal, 'GET').catch(
145
- (error) => error,
146
- );
147
- if (rr) {
148
- rep = rr;
149
- } else {
150
- reject(rr);
151
- }
152
- }
153
-
154
- resolve(rep);
155
- });
156
- };
157
- if (url.protocol === 'https:') {
158
- req = redirects.https.request(
159
- {
160
- ...request,
161
- rejectUnauthorized: false,
162
- },
163
- response,
164
- );
165
- } else {
166
- req = redirects.http.request(request, response);
167
- }
168
- req.on('error', (error) => {
169
- reject(error);
170
- });
171
- req.end();
172
- });
173
- }
@@ -1,3 +0,0 @@
1
/**
 * Error whose `name` is `'NetTimeoutError'`.
 */
export default class NetTimeoutError extends Error {
	constructor(...args: ConstructorParameters<typeof Error>) {
		super(...args);
		this.name = 'NetTimeoutError';
	}
}