@d-zero/beholder 0.1.29 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/README.md +172 -477
- package/dist/debug.d.ts +4 -1
- package/dist/debug.js +5 -2
- package/dist/dom-evaluation.d.ts +72 -14
- package/dist/dom-evaluation.js +169 -43
- package/dist/index.d.ts +20 -3
- package/dist/index.js +15 -3
- package/dist/is-error.d.ts +8 -0
- package/dist/is-error.js +10 -0
- package/dist/keyword-check.d.ts +5 -3
- package/dist/keyword-check.js +5 -3
- package/dist/parse-url.d.ts +14 -0
- package/dist/parse-url.js +23 -0
- package/dist/scraper.d.ts +39 -13
- package/dist/scraper.js +300 -263
- package/dist/types.d.ts +286 -214
- package/dist/types.js +6 -0
- package/package.json +7 -10
- package/src/debug.ts +5 -2
- package/src/dom-evaluation.ts +195 -65
- package/src/index.ts +27 -3
- package/src/is-error.spec.ts +33 -0
- package/src/is-error.ts +10 -0
- package/src/keyword-check.spec.ts +45 -4
- package/src/keyword-check.ts +5 -3
- package/src/parse-url.spec.ts +35 -0
- package/src/parse-url.ts +26 -0
- package/src/scraper.ts +338 -300
- package/src/types.ts +345 -258
- package/tsconfig.tsbuildinfo +1 -1
- package/dist/events.d.ts +0 -32
- package/dist/events.js +0 -15
- package/dist/fetch-destination.d.ts +0 -8
- package/dist/fetch-destination.js +0 -145
- package/dist/net-timeout-error.d.ts +0 -3
- package/dist/net-timeout-error.js +0 -3
- package/dist/sub-process-runner.d.ts +0 -12
- package/dist/sub-process-runner.js +0 -180
- package/dist/sub-process.d.ts +0 -1
- package/dist/sub-process.js +0 -67
- package/dist/utils.d.ts +0 -16
- package/dist/utils.js +0 -69
- package/src/events.ts +0 -21
- package/src/fetch-destination.ts +0 -173
- package/src/net-timeout-error.ts +0 -3
- package/src/sub-process-runner.ts +0 -220
- package/src/sub-process.ts +0 -86
- package/src/utils.ts +0 -89
|
@@ -1,220 +0,0 @@
|
|
|
1
|
-
import type { ScraperOptions } from './scraper.js';
import type { SubProcessRunnerEventTypes, ExURL } from './types.js';
import type { AnyAction } from 'typescript-fsa';

import childProcess from 'node:child_process';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

import { delay } from '@d-zero/shared/delay';
import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
import { isType } from 'typescript-fsa';

import { scraperLog } from './debug.js';
import { scraperEvent, subProcessEvent } from './events.js';
|
|
14
|
-
|
|
15
|
-
// Convert this module's URL with `fileURLToPath` instead of reading
// `URL#pathname`: the raw pathname keeps percent-encoding (e.g. `%20` for a
// space) and, on Windows, a leading slash before the drive letter, so it is
// not a usable file-system path.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Module path handed to `childProcess.fork`; resolved next to this file.
const SUB_PROCESS_PATH = path.resolve(__dirname, 'sub-process');
|
|
18
|
-
|
|
19
|
-
/**
 * Drives a forked child process (`SUB_PROCESS_PATH`) that performs scraping,
 * and re-emits the actions it receives from that child as typed events.
 *
 * A single child is reused across scrapes. Once the number of finished scrapes
 * reaches the `resetTime` given to the constructor, the child is asked to
 * destroy itself and a new one is forked on the next `start()`.
 */
export default class SubProcessRunner extends TypedAwaitEventEmitter<SubProcessRunnerEventTypes> {
	/** Number of finished scrapes after which the child process is reset. */
	readonly #resetTime: number;
	/** Finished scrapes counted since the current child was adopted. */
	#scrapedTimes = 0;
	#state: 'waiting' | 'running' = 'waiting';
	/** The child currently in use, or `null` when none is attached. */
	#subProcess: childProcess.ChildProcess | null = null;
	/** PIDs of children that were detached while not yet marked as killed. */
	#undeadPid = new Set<number>();

	/** `'running'` from `start()` until the scrape finishes or the child is detached. */
	get state(): 'waiting' | 'running' {
		return this.#state;
	}

	/**
	 * @param resetTime Number of finished scrapes after which the child process is destroyed.
	 */
	constructor(resetTime: number) {
		super();
		this.#resetTime = resetTime;
	}

	/**
	 * Asks the current child (if any) to destroy itself, emits `reset` and a
	 * `changePhase` event for it, then detaches it from this runner.
	 *
	 * NOTE: the misspelled name is kept as-is because it is part of the public
	 * interface.
	 */
	destory() {
		const subProcess = this.#subProcess;

		if (subProcess) {
			const pid = subProcess.pid;
			scraperLog('Destroys child_process (%d)', pid);
			subProcess.send(subProcessEvent.destroy());

			if (pid) {
				void this.emit('reset', {
					pid,
				});
				void this.emit('changePhase', {
					pid,
					name: 'reset',
					url: null,
					isExternal: false,
					message: 'Reseting sub-process',
				});
			}
		}

		this.#destroyed();
	}

	/**
	 * @returns A copy of the PIDs recorded as detached without being killed.
	 */
	getUndeadPid() {
		return [...this.#undeadPid];
	}

	/**
	 * Sends `SIGKILL` to the current child unless it is already marked as killed.
	 * Does nothing when no child is attached.
	 */
	kill() {
		const subProcess = this.#subProcess;
		if (!subProcess) {
			return;
		}

		if (subProcess.killed) {
			scraperLog('child_process(%d) is already killed', subProcess.pid);
			return;
		}

		scraperLog('Kills(SIGKILL) child_process (%d) ', subProcess.pid);
		subProcess.kill('SIGKILL');
	}

	/**
	 * Starts scraping `url` in the child process without waiting for the result;
	 * progress and results are delivered through events.
	 *
	 * @param url Target URL.
	 * @param options Scraper options forwarded to the child.
	 * @param isSkip Forwarded to the child as-is.
	 * @param interval Milliseconds to wait before sending the start request; non-positive values mean no wait.
	 * @throws {Error} When a scrape is already running on this runner.
	 */
	start(url: ExURL, options: ScraperOptions, isSkip: boolean, interval: number) {
		if (this.#state === 'running') {
			throw new Error(`Sub Routine (PID: ${this.#subProcess?.pid}) is already running`);
		}
		void this.#scrape(url, options, isSkip, interval);
	}

	/**
	 * Forks a child process and wires its `message`, `disconnect` and `error`
	 * events to this runner's events.
	 *
	 * @param url URL reported in `error` events raised by the child process object.
	 */
	#createSubProcess(url: ExURL) {
		const subProcess = childProcess.fork(SUB_PROCESS_PATH, {
			detached: false,
		});

		subProcess.on('message', (action: AnyAction) => {
			if (isType(action, scraperEvent.changePhase)) {
				void this.emit('changePhase', action.payload);
			}

			if (isType(action, scraperEvent.ignoreAndSkip)) {
				this.#finished();
			}

			if (isType(action, scraperEvent.scrapeEnd)) {
				this.#finished();
			}

			// `destory()` detaches the child immediately, so its `destroyed`
			// message can arrive after a replacement child has been forked.
			// Only reset the runner when the message belongs to the current
			// child (or none is attached); otherwise the replacement would be
			// dropped while it is still in use.
			if (
				isType(action, scraperEvent.destroyed) &&
				(this.#subProcess === null || this.#subProcess === subProcess)
			) {
				this.#destroyed();
			}

			if (
				isType(action, scraperEvent.ignoreAndSkip) ||
				isType(action, scraperEvent.resourceResponse) ||
				isType(action, scraperEvent.scrapeEnd) ||
				isType(action, scraperEvent.destroyed)
			) {
				void this.emit('scrapeEvent', action);
			}

			if (isType(action, scraperEvent.error)) {
				// The child sends the error as plain `name`/`message`/`stack`
				// fields; rebuild an `Error` instance for listeners.
				const error = new Error(action.payload.error.message);
				error.name = action.payload.error.name;
				error.stack = action.payload.error.stack;
				const _action = {
					type: action.type,
					payload: {
						pid: action.payload.pid,
						url: action.payload.url,
						shutdown: action.payload.shutdown,
						error,
					},
				};
				void this.emit('scrapeEvent', _action);
			}
		});

		subProcess.on('disconnect', () => {
			if (subProcess.killed) {
				scraperLog('child_process(%d) is disconnected and killed', subProcess.pid);
				return;
			}

			scraperLog('child_process(%d) is disconnected but not killed', subProcess.pid);
			scraperLog('Retries to kill(SIGTERM) child_process(%d)', subProcess.pid);
			subProcess.kill('SIGTERM');

			void this.emit('changePhase', {
				pid: subProcess.pid,
				name: 'disconnect',
				url: null,
				isExternal: false,
				message: 'Disconnecting sub-process',
			});
		});

		subProcess.on('error', (e) => {
			void this.emit('error', {
				pid: subProcess.pid,
				url,
				shutdown: true,
				error: e instanceof Error ? e : new Error(`${e}`),
			});
		});

		return subProcess;
	}

	/**
	 * Detaches the current child, remembering its PID when it is not marked as
	 * killed, and resets the scrape counter and state.
	 */
	#destroyed() {
		if (this.#subProcess && !this.#subProcess.killed && this.#subProcess.pid) {
			scraperLog('Add child_process(%d) to the undead PID list', this.#subProcess.pid);
			this.#undeadPid.add(this.#subProcess.pid);
		}
		this.#subProcess = null;
		this.#scrapedTimes = 0;
		this.#finally();
	}

	/** Marks the runner as ready to accept the next `start()`. */
	#finally() {
		this.#state = 'waiting';
	}

	/**
	 * Counts a finished scrape and resets the child once `resetTime` is reached.
	 */
	#finished() {
		this.#scrapedTimes += 1;

		if (this.#scrapedTimes >= this.#resetTime) {
			this.destory();
		} else {
			this.#finally();
		}
	}

	/**
	 * Forks a child if none is attached, waits for `interval`, then sends the
	 * start request to the child.
	 */
	async #scrape(url: ExURL, options: ScraperOptions, isSkip: boolean, interval: number) {
		if (!this.#subProcess) {
			this.#subProcess = this.#createSubProcess(url);

			void this.emit('changePhase', {
				pid: this.#subProcess.pid,
				name: 'boot',
				url,
				isExternal: options.isExternal,
				message: 'Booting sub-process',
			});
		}

		const subProcess = this.#subProcess;
		this.#state = 'running';

		const waitTime = Math.max(interval, 0);
		if (waitTime) {
			await delay(waitTime);
		}

		// The child may have been detached (or replaced) while waiting. Report
		// it as an error event instead of dereferencing a stale/null handle,
		// which previously surfaced as an unhandled promise rejection.
		// NOTE(review): `shutdown: true` mirrors the child `error` handler — confirm
		// that is the desired severity for this case.
		if (this.#subProcess !== subProcess) {
			void this.emit('error', {
				pid: subProcess.pid,
				url,
				shutdown: true,
				error: new Error(
					`Sub Routine (PID: ${subProcess.pid}) was destroyed before scraping started`,
				),
			});
			return;
		}

		subProcess.send(
			subProcessEvent.start({
				url,
				isExternal: options.isExternal,
				isGettingImages: options.isGettingImages,
				excludeKeywords: options.excludeKeywords,
				executablePath: options.executablePath,
				disableQueries: options.disableQueries ?? false,
				isSkip,
				isTitleOnly: options.isTitleOnly,
				screenshot: options.screenshot,
			}),
		);
	}
}
|
package/src/sub-process.ts
DELETED
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
// Run on child_process
|
|
2
|
-
|
|
3
|
-
import type { AnyAction } from 'typescript-fsa';
|
|
4
|
-
|
|
5
|
-
import { isType } from 'typescript-fsa';
|
|
6
|
-
|
|
7
|
-
import { scraperLog } from './debug.js';
|
|
8
|
-
import { scraperEvent, subProcessEvent } from './events.js';
|
|
9
|
-
import Scraper from './scraper.js';
|
|
10
|
-
|
|
11
|
-
const log = scraperLog.extend(`${process.pid}`);

process.title = 'beholder';
const scraper = new Scraper();

/**
 * Sends an action to the parent process, but only while the IPC channel is
 * connected. The action is built lazily so nothing is created when the
 * channel is already closed.
 *
 * @param createAction Factory for the action to send.
 * @returns `true` when the action was handed to `process.send`, `false` when the channel was not connected.
 */
function sendToParent(createAction: () => AnyAction): boolean {
	if (!process.connected) {
		return false;
	}
	process.send!(createAction());
	return true;
}

// Relay scraper events to the parent process as actions.
scraper.on('resourceResponse', (context) => {
	sendToParent(() => scraperEvent.resourceResponse(context));
});

scraper.on('ignoreAndSkip', (context) => {
	sendToParent(() => scraperEvent.ignoreAndSkip(context));
});

scraper.on('scrapeEnd', (context) => {
	sendToParent(() => scraperEvent.scrapeEnd(context));
});

scraper.on('error', (context) => {
	sendToParent(() => {
		// Send the error as plain `name`/`message`/`stack` fields.
		const { name, message, stack } = context.error;
		return scraperEvent.error({
			...context,
			error: { name, message, stack },
		});
	});
});

scraper.on('changePhase', (context) => {
	sendToParent(() => scraperEvent.changePhase(context));
});

// Handle requests coming from the parent process.
process.on('message', async (action: AnyAction) => {
	if (isType(action, subProcessEvent.start)) {
		const {
			url,
			isSkip,
			isExternal,
			isGettingImages,
			excludeKeywords,
			executablePath,
			disableQueries,
			isTitleOnly,
			screenshot,
		} = action.payload;

		void scraper.scrapeStart(
			url,
			{
				isExternal,
				isGettingImages,
				excludeKeywords,
				executablePath,
				disableQueries,
				isTitleOnly,
				screenshot,
			},
			isSkip,
		);
	}

	if (isType(action, subProcessEvent.destroy)) {
		await scraper.destroy(false);
	}
});

scraper.on('destroyed', (context) => {
	// Close the IPC channel only after the `destroyed` action was sent.
	if (sendToParent(() => scraperEvent.destroyed(context))) {
		log('disconnects process');
		process.disconnect();
	}
});

process.on('disconnect', () => {
	log('Process is disconnected');
});
|
package/src/utils.ts
DELETED
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
import type { CDNType, CompressType } from './types.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Reports whether an HTTP status code lies outside the 200–399 range.
 *
 * @param status HTTP status code to classify.
 * @returns `false` when `200 <= status < 400`; `true` for any other value, including `NaN`.
 */
export function isError(status: number) {
	const isSuccessOrRedirect = status >= 200 && status < 400;
	return !isSuccessOrRedirect;
}
|
|
10
|
-
|
|
11
|
-
/**
 * Identifies the compression scheme named by the `content-encoding` header.
 *
 * Only a string value stored under the lower-case `content-encoding` key is
 * inspected; a missing or array value is treated as an empty string. Each
 * pattern is matched as a case-insensitive substring, in the order listed
 * below, and the first match wins.
 *
 * @param headers Response headers keyed by header name.
 * @returns The name of the first matching compression scheme, or `false` when none matches.
 */
export function detectCompress(
	headers: Record<string, string | string[] | undefined>,
): false | CompressType {
	const rawValue = headers['content-encoding'];
	const encoding = typeof rawValue === 'string' ? rawValue : '';

	// Order matters: earlier entries take precedence when several match.
	// cspell:disable
	const candidates: ReadonlyArray<readonly [RegExp, CompressType]> = [
		[/gzip/i, 'gzip'],
		[/br/i, 'br'],
		[/compress/i, 'compress'],
		[/deflate/i, 'deflate'],
		[/sdch/i, 'sdch'],
		[/vcdiff/i, 'vcdiff'],
		[/xdelta/i, 'xdelta'],
	];
	// cspell:enable

	for (const [pattern, compressType] of candidates) {
		if (pattern.test(encoding)) {
			return compressType;
		}
	}

	return false;
}
|
|
58
|
-
|
|
59
|
-
/**
 * Guesses which CDN served a response from its headers.
 *
 * Header names are compared case-insensitively, so both `X-IIJ-Cache` and
 * `x-iij-cache` are recognised. Checks run in this order and the first hit
 * wins: `x-akamai-transformed`, `x-amz-cf-pop`, `x-iij-cache` (presence only),
 * then the `server` header value (`cloudflare`, `amazons3`, matched as
 * case-insensitive substrings).
 *
 * @param headers Response headers keyed by header name.
 * @returns The detected CDN name, or `false` when nothing matches.
 */
export function detectCDN(
	headers: Record<string, string | string[] | undefined>,
): false | CDNType {
	// Index the headers by lower-cased name. When the same header appears under
	// several spellings, the exactly-lower-case key takes precedence so that
	// already-normalized input behaves as before.
	const byLowerName = new Map<string, string | string[] | undefined>();
	for (const [name, value] of Object.entries(headers)) {
		const lowerName = name.toLowerCase();
		if (name === lowerName || !byLowerName.has(lowerName)) {
			byLowerName.set(lowerName, value);
		}
	}

	if (byLowerName.has('x-akamai-transformed')) {
		return 'Akamai';
	}

	if (byLowerName.has('x-amz-cf-pop')) {
		return 'Amazon CloudFront';
	}

	if (byLowerName.has('x-iij-cache')) {
		return 'IIJ';
	}

	const server = byLowerName.get('server');
	if (typeof server !== 'string') {
		return false;
	}

	if (/cloudflare/i.test(server)) {
		return 'Cloudflare';
	}

	if (/amazons3/i.test(server)) {
		return 'Amazon S3';
	}

	return false;
}
|