@d-zero/beholder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +10 -0
  2. package/LICENSE +21 -0
  3. package/README.md +5 -0
  4. package/dist/debug.d.ts +6 -0
  5. package/dist/debug.js +6 -0
  6. package/dist/dom-evaluation.d.ts +24 -0
  7. package/dist/dom-evaluation.js +114 -0
  8. package/dist/events.d.ts +32 -0
  9. package/dist/events.js +15 -0
  10. package/dist/fetch-destination.d.ts +2 -0
  11. package/dist/fetch-destination.js +132 -0
  12. package/dist/index.d.ts +4 -0
  13. package/dist/index.js +4 -0
  14. package/dist/keyword-check.d.ts +1 -0
  15. package/dist/keyword-check.js +10 -0
  16. package/dist/net-timeout-error.d.ts +3 -0
  17. package/dist/net-timeout-error.js +3 -0
  18. package/dist/network.d.ts +2 -0
  19. package/dist/network.js +132 -0
  20. package/dist/scraper.d.ts +15 -0
  21. package/dist/scraper.js +678 -0
  22. package/dist/sub-process-runner.d.ts +12 -0
  23. package/dist/sub-process-runner.js +180 -0
  24. package/dist/sub-process.d.ts +1 -0
  25. package/dist/sub-process.js +67 -0
  26. package/dist/types.d.ts +271 -0
  27. package/dist/types.js +1 -0
  28. package/dist/utils.d.ts +5 -0
  29. package/dist/utils.js +142 -0
  30. package/package.json +34 -0
  31. package/src/debug.ts +7 -0
  32. package/src/dom-evaluation.ts +175 -0
  33. package/src/events.ts +21 -0
  34. package/src/fetch-destination.ts +160 -0
  35. package/src/index.ts +4 -0
  36. package/src/keyword-check.spec.ts +8 -0
  37. package/src/keyword-check.ts +12 -0
  38. package/src/net-timeout-error.ts +3 -0
  39. package/src/scraper.ts +733 -0
  40. package/src/sub-process-runner.ts +220 -0
  41. package/src/sub-process.ts +86 -0
  42. package/src/types.ts +341 -0
  43. package/src/utils.ts +171 -0
  44. package/tsconfig.json +15 -0
  45. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,220 @@
1
import type { ScraperOptions } from './scraper.js';
import type { SubProcessRunnerEventTypes, ExURL } from './types.js';
import type { AnyAction } from 'typescript-fsa';

import childProcess from 'node:child_process';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

import { delay } from '@d-zero/shared/delay';
import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
import { isType } from 'typescript-fsa';

import { scraperLog } from './debug.js';
import { scraperEvent, subProcessEvent } from './events.js';
14
+
15
// Location of this module on disk.
// `fileURLToPath` is used rather than `URL#pathname` because `pathname` is
// still URL-encoded (e.g. a space stays `%20`) and, on Windows, carries a
// leading slash before the drive letter (`/C:/...`), so it is not a usable
// filesystem path.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Absolute path (without extension) of the sibling module that is forked as the child process. */
const SUB_PROCESS_PATH = path.resolve(__dirname, 'sub-process');
18
+
19
/**
 * Runs the scraper inside a forked child process (`SUB_PROCESS_PATH`) and
 * relays the child's IPC messages as typed events.
 *
 * - `start()` forks the child on first use and sends it a `start` message.
 * - The runner is `'running'` from `start()` until the child reports
 *   `scrapeEnd` or `ignoreAndSkip`; it is `'waiting'` otherwise.
 * - After `resetTime` finished scrapes the child is asked to destroy itself and
 *   the next `start()` forks a fresh one.
 *
 * Note: the public method name `destory` is a misspelling of "destroy"; it is
 * kept as-is because callers depend on it.
 */
export default class SubProcessRunner extends TypedAwaitEventEmitter<SubProcessRunnerEventTypes> {
	/** Number of finished scrapes after which the child process is destroyed. */
	readonly #resetTime: number;
	/** Scrapes finished by the current child process. */
	#scrapedTimes = 0;
	#state: 'waiting' | 'running' = 'waiting';
	/** The child currently owned by this runner, or `null` when none has been forked. */
	#subProcess: childProcess.ChildProcess | null = null;
	/** PIDs of children that were released without a kill signal having been sent. */
	#undeadPid = new Set<number>();

	/** `'running'` while a scrape is in flight, `'waiting'` otherwise. */
	get state() {
		return this.#state;
	}

	/**
	 * @param resetTime Number of finished scrapes after which the child process is destroyed.
	 */
	constructor(resetTime: number) {
		super();
		this.#resetTime = resetTime;
	}

	/**
	 * Asks the current child process (if any) to destroy itself, emits `reset`
	 * and a `changePhase` event named `'reset'`, and releases the child so that
	 * the next `start()` forks a new one.
	 */
	destory() {
		const subProcess = this.#subProcess;
		if (subProcess) {
			const pid = subProcess.pid;
			scraperLog('Destroys child_process (%d)', pid);

			// Sending on a closed IPC channel makes Node emit an 'error' event on the
			// child, which would surface as a spurious runner 'error' during a normal
			// teardown. A disconnected child cannot receive the message anyway.
			if (subProcess.connected) {
				subProcess.send(subProcessEvent.destroy());
			}

			if (pid) {
				void this.emit('reset', {
					pid,
				});
				void this.emit('changePhase', {
					pid,
					name: 'reset',
					url: null,
					isExternal: false,
					message: 'Reseting sub-process',
				});
			}
		}
		this.#destroyed();
	}

	/**
	 * @returns A copy of the PIDs of released children to which no kill signal had been sent.
	 */
	getUndeadPid() {
		return [...this.#undeadPid];
	}

	/**
	 * Sends `SIGKILL` to the current child process unless a kill signal has
	 * already been sent to it. Does nothing when there is no child.
	 */
	kill() {
		if (!this.#subProcess) {
			return;
		}
		if (!this.#subProcess.killed) {
			scraperLog('Kills(SIGKILL) child_process (%d) ', this.#subProcess.pid);
			this.#subProcess.kill('SIGKILL');
			return;
		}
		scraperLog('child_process(%d) is already killed', this.#subProcess.pid);
	}

	/**
	 * Starts scraping `url` in the child process (forking it if necessary).
	 *
	 * @param url Target URL.
	 * @param options Scraper options forwarded to the child.
	 * @param isSkip Forwarded to the child as `isSkip`.
	 * @param interval Delay in milliseconds before the start message is sent; negative values are treated as 0.
	 * @throws {Error} If a scrape is already running.
	 */
	start(url: ExURL, options: ScraperOptions, isSkip: boolean, interval: number) {
		if (this.#state === 'running') {
			throw new Error(`Sub Routine (PID: ${this.#subProcess?.pid}) is already running`);
		}
		// Never leave the promise floating: a failure to fork or to send would
		// otherwise become an unhandled rejection and leave the runner "running".
		this.#scrape(url, options, isSkip, interval).catch((error: unknown) => {
			this.#finally();
			void this.emit('error', {
				pid: this.#subProcess?.pid,
				url,
				// NOTE(review): mirrors the child 'error' handler below — confirm the intended meaning of `shutdown`.
				shutdown: true,
				error: error instanceof Error ? error : new Error(`${error}`),
			});
		});
	}

	/**
	 * Forks the child process and wires its `message`, `disconnect` and `error`
	 * events to this runner.
	 *
	 * @param url URL reported in `error` events raised by this child.
	 */
	#createSubProcess(url: ExURL) {
		const subProcess = childProcess.fork(SUB_PROCESS_PATH, {
			detached: false,
		});

		subProcess.on('message', (action: AnyAction) => {
			// A child that has been released (see `destory()`) can still deliver
			// messages afterwards. Those must not change the state of — or drop —
			// a newer child that has been forked in the meantime.
			const isCurrent = this.#subProcess === subProcess;

			if (isType(action, scraperEvent.changePhase)) {
				void this.emit('changePhase', action.payload);
			}

			if (
				isCurrent &&
				(isType(action, scraperEvent.ignoreAndSkip) ||
					isType(action, scraperEvent.scrapeEnd))
			) {
				this.#finished();
			}

			if (isCurrent && isType(action, scraperEvent.destroyed)) {
				this.#destroyed();
			}

			if (
				isType(action, scraperEvent.ignoreAndSkip) ||
				isType(action, scraperEvent.resourceResponse) ||
				isType(action, scraperEvent.scrapeEnd) ||
				isType(action, scraperEvent.destroyed)
			) {
				void this.emit('scrapeEvent', action);
			}

			if (isType(action, scraperEvent.error)) {
				// The payload carries only name/message/stack; rebuild a real
				// Error instance from them before re-emitting.
				const error = new Error(action.payload.error.message);
				error.name = action.payload.error.name;
				error.stack = action.payload.error.stack;
				const _action = {
					type: action.type,
					payload: {
						pid: action.payload.pid,
						url: action.payload.url,
						shutdown: action.payload.shutdown,
						error,
					},
				};
				void this.emit('scrapeEvent', _action);
			}
		});

		subProcess.on('disconnect', () => {
			if (subProcess.killed) {
				scraperLog('child_process(%d) is disconnected and killed', subProcess.pid);
				return;
			}

			scraperLog('child_process(%d) is disconnected but not killed', subProcess.pid);
			scraperLog('Retries to kill(SIGTERM) child_process(%d)', subProcess.pid);
			subProcess.kill('SIGTERM');

			void this.emit('changePhase', {
				pid: subProcess.pid,
				name: 'disconnect',
				url: null,
				isExternal: false,
				message: 'Disconnecting sub-process',
			});
		});

		subProcess.on('error', (e) => {
			void this.emit('error', {
				pid: subProcess.pid,
				url,
				shutdown: true,
				error: e instanceof Error ? e : new Error(`${e}`),
			});
		});

		return subProcess;
	}

	/**
	 * Releases the current child: records its PID as "undead" when no kill
	 * signal has been sent to it, clears the reference and the scrape counter,
	 * and returns the runner to `'waiting'`.
	 */
	#destroyed() {
		if (this.#subProcess && !this.#subProcess.killed && this.#subProcess.pid) {
			scraperLog('Add child_process(%d) to the undead PID list', this.#subProcess.pid);
			this.#undeadPid.add(this.#subProcess.pid);
		}
		this.#subProcess = null;
		this.#scrapedTimes = 0;
		this.#finally();
	}

	/** Returns the runner to the `'waiting'` state. */
	#finally() {
		this.#state = 'waiting';
	}

	/**
	 * Counts one finished scrape and destroys the child once `resetTime` is
	 * reached; otherwise just returns to `'waiting'`.
	 */
	#finished() {
		this.#scrapedTimes += 1;

		if (this.#scrapedTimes >= this.#resetTime) {
			this.destory();
		} else {
			this.#finally();
		}
	}

	/**
	 * Forks the child if needed, marks the runner as running, waits `interval`
	 * milliseconds and sends the `start` message.
	 */
	async #scrape(url: ExURL, options: ScraperOptions, isSkip: boolean, interval: number) {
		let subProcess = this.#subProcess;
		if (!subProcess) {
			subProcess = this.#createSubProcess(url);
			this.#subProcess = subProcess;

			void this.emit('changePhase', {
				pid: subProcess.pid,
				name: 'boot',
				url,
				isExternal: options.isExternal,
				message: 'Booting sub-process',
			});
		}

		this.#state = 'running';

		interval = Math.max(interval, 0);
		if (interval) {
			await delay(interval);
		}

		// `destory()` may have released this child while we were waiting. The
		// state has then already been reset (or belongs to a newer child), so do
		// not touch it; just report that this URL was never started.
		if (this.#subProcess !== subProcess) {
			scraperLog(
				'child_process(%d) was destroyed before the start message was sent',
				subProcess.pid,
			);
			void this.emit('error', {
				pid: subProcess.pid,
				url,
				// NOTE(review): mirrors the child 'error' handler — confirm the intended meaning of `shutdown`.
				shutdown: true,
				error: new Error(
					`Sub-process (PID: ${subProcess.pid}) was destroyed before scraping started`,
				),
			});
			return;
		}

		subProcess.send(
			subProcessEvent.start({
				url,
				isExternal: options.isExternal,
				isGettingImages: options.isGettingImages,
				excludeKeywords: options.excludeKeywords,
				executablePath: options.executablePath,
				disableQueries: options.disableQueries ?? false,
				isSkip,
				isTitleOnly: options.isTitleOnly,
				screenshot: options.screenshot,
			}),
		);
	}
}
@@ -0,0 +1,86 @@
1
+ // Run on child_process
2
+
3
+ import type { AnyAction } from 'typescript-fsa';
4
+
5
+ import { isType } from 'typescript-fsa';
6
+
7
+ import { scraperLog } from './debug.js';
8
+ import { scraperEvent, subProcessEvent } from './events.js';
9
+ import Scraper from './scraper.js';
10
+
11
const log = scraperLog.extend(`${process.pid}`);

process.title = 'beholder';
const scraper = new Scraper();

/**
 * Sends an action to the parent process, but only while the IPC channel is
 * connected. The action is built lazily so nothing is evaluated when the
 * channel is already closed.
 *
 * @returns `true` when the action was handed to `process.send`, `false` when the channel was closed.
 */
function sendIfConnected(createAction: () => AnyAction): boolean {
	if (!process.connected) {
		return false;
	}
	// `process.send` is only checked through `process.connected` here, as in the rest of this script.
	process.send!(createAction());
	return true;
}

// Forward scraper events to the parent process as actions.
scraper.on('resourceResponse', (context) => {
	sendIfConnected(() => scraperEvent.resourceResponse(context));
});

scraper.on('ignoreAndSkip', (context) => {
	sendIfConnected(() => scraperEvent.ignoreAndSkip(context));
});

scraper.on('scrapeEnd', (context) => {
	sendIfConnected(() => scraperEvent.scrapeEnd(context));
});

scraper.on('error', (context) => {
	sendIfConnected(() => {
		// Reduce the error to plain name/message/stack fields for the parent.
		const { name, message, stack } = context.error;
		return scraperEvent.error({
			...context,
			error: { name, message, stack },
		});
	});
});

scraper.on('changePhase', (context) => {
	sendIfConnected(() => scraperEvent.changePhase(context));
});

// Commands coming from the parent process.
process.on('message', async (action: AnyAction) => {
	if (isType(action, subProcessEvent.start)) {
		const {
			url,
			isSkip,
			isExternal,
			isGettingImages,
			excludeKeywords,
			executablePath,
			disableQueries,
			isTitleOnly,
			screenshot,
		} = action.payload;

		void scraper.scrapeStart(
			url,
			{
				isExternal,
				isGettingImages,
				excludeKeywords,
				executablePath,
				disableQueries,
				isTitleOnly,
				screenshot,
			},
			isSkip,
		);
	}

	if (isType(action, subProcessEvent.destroy)) {
		await scraper.destroy(false);
	}
});

// Once the scraper is destroyed, tell the parent and close the IPC channel.
scraper.on('destroyed', (context) => {
	const wasSent = sendIfConnected(() => scraperEvent.destroyed(context));
	if (wasSent) {
		log('disconnects process');
		process.disconnect();
	}
});

process.on('disconnect', () => {
	log('Process is disconnected');
});
package/src/types.ts ADDED
@@ -0,0 +1,341 @@
1
+ import type { Action } from 'typescript-fsa';
2
+
3
/**
 * Fields shared by scraper event payloads.
 */
export type ScrapeEvent = {
	/** PID of the process the event relates to, when known. */
	pid: number | undefined;
	/** URL the event relates to. */
	url: ExURL;
};

/**
 * Payload of an `error` event.
 *
 * `error` is a plain object holding name/message/stack rather than an `Error` instance.
 */
export type ScrapeErrorEvent = ScrapeEvent & {
	shutdown: boolean;
	error: {
		name: string;
		message: string;
		stack?: string;
	};
};
16
+
17
/**
 * Map from scraper event name to the payload carried by that event.
 */
export type ScrapeEventTypes = {
	/** The page was skipped; `reason` holds the matched text and the exclude keywords in effect. */
	ignoreAndSkip: ScrapeEvent & {
		reason: {
			matchedText: string;
			excludeKeywords: string[];
		};
	};

	/** A resource response, with its network log and resource record (without `uid`). */
	resourceResponse: ScrapeEvent & {
		log: NetworkLog;
		resource: Omit<Resource, 'uid'>;
	};

	/** Scraping of a page ended, with its result. */
	scrapeEnd: ScrapeEvent & {
		timestamp: number;
		result: PageData;
	};

	/** The scraper was destroyed; carries no `url`. */
	destroyed: Omit<ScrapeEvent, 'url'>;

	error: ScrapeErrorEvent;

	changePhase: ChangePhaseEvent;
};
36
+
37
/**
 * Payload of a `changePhase` event.
 */
export type ChangePhaseEvent = {
	pid: number;

	/** Name of the phase. */
	name:
		| 'scrapeStart'
		| 'launchBrowser'
		| 'touchHead'
		| 'touchHeadTimeout'
		| 'newPage'
		| 'openPage'
		| 'loadDOMContent'
		| 'waitNetworkIdleZero'
		| 'getHTML'
		| 'setViewport'
		| 'scrollToBottom'
		| 'getImages'
		| 'getAnchors'
		| 'getMeta'
		| 'ignoreAndSkip'
		| 'scrapeEnd'
		| 'beforeDestroy'
		| 'destroyed';

	/** URL the phase relates to, or `null` when there is none. */
	url: ExURL | null;

	isExternal: boolean;

	/** Human-readable description of the phase. */
	message: string;
};

/**
 * Union of every payload type in `ScrapeEventTypes`.
 */
export type AnyScrapeEvent = ScrapeEventTypes[keyof ScrapeEventTypes];
64
+
65
/**
 * Map from sub-process command name to the payload carried by that command.
 */
export type SubProcessEventTypes = {
	/** Start scraping `url`; every `ParseURLOptions` field is required here. */
	start: {
		url: ExURL;
		isExternal: boolean;
		isGettingImages: boolean;
		excludeKeywords: string[];
		executablePath: string | null;
		isSkip: boolean;
		isTitleOnly: boolean;
		screenshot: string | null;
	} & Required<ParseURLOptions>;

	/** Destroy command; carries no payload. */
	destroy: void;
};

/**
 * Payload identifying a sub-process by PID.
 */
export type SubProcessEvent = {
	pid: number | undefined;
};

/**
 * Payload of a `changePhase` event emitted by the sub-process runner: either a
 * `ChangePhaseEvent` or one of the additional `reset` / `boot` / `disconnect`
 * phases, whose `pid` may be undefined.
 */
export type SubProcessChangeEvent =
	| ChangePhaseEvent
	| {
			pid: number | undefined;
			name: 'reset' | 'boot' | 'disconnect';
			url: ExURL | null;
			isExternal: boolean;
			message: string;
	  };

/**
 * Map from sub-process runner event name to the payload carried by that event.
 */
export type SubProcessRunnerEventTypes = {
	reset: SubProcessEvent;
	/** An action wrapping one of the scraper event payloads. */
	scrapeEvent: Action<AnyScrapeEvent>;
	changePhase: SubProcessChangeEvent;
	error: ScrapeErrorEvent;
};
99
+
100
/**
 * A parsed URL broken down into its components.
 */
export type ExURL = {
	/** Full URL (optimized). */
	href: string;

	/** Full URL string as it was before parsing. */
	_originUrlString: string;

	/** Full URL without the hash. */
	withoutHash: string;

	/** Full URL without the hash and without authentication. */
	withoutHashAndAuth: string;

	/**
	 * Protocol or URI scheme (includes ":").
	 *
	 * - case-insensitive
	 */
	protocol: string;

	/** Whether the protocol is HTTP or HTTPS. */
	isHTTP: boolean;

	/** Whether the protocol is HTTPS. */
	isSecure: boolean;

	/** User name of the authentication. */
	username: string | null;

	/** Password of the authentication. */
	password: string | null;

	/**
	 * Host name.
	 *
	 * - case-insensitive
	 * - non-ASCII characters are encoded
	 * - without the port number
	 */
	hostname: string;

	/** Port number. */
	port: string | null;

	/**
	 * Path part.
	 *
	 * It is only `/` if the pathname is empty.
	 *
	 * - case-sensitive
	 */
	pathname: string | null;

	/** Array of path segments. */
	paths: string[];

	/** Depth of the paths. */
	depth: number;

	/**
	 * Directory name of the paths.
	 *
	 * It is null if the path is `/` only.
	 */
	dirname: string | null;

	/** Base name of the paths (file name without the file extension). */
	basename: string | null;

	/** Whether this is an index page (true if `basename` is null). */
	isIndex: boolean;

	/** File extension name (includes "."). */
	extname: string | null;

	/**
	 * Search query (without `?`).
	 *
	 * - case-sensitive
	 */
	query: string | null;

	/**
	 * Hash (includes `#`).
	 *
	 * - case-sensitive
	 */
	hash: string | null;
};
216
+
217
/**
 * Options for URL parsing.
 */
export type ParseURLOptions = {
	/** NOTE(review): presumably drops the query string when parsing — confirm against the parser. */
	disableQueries?: boolean;
};
220
+
221
/**
 * Data of a scraped page.
 *
 * `isSkipped` is always `false` here; `SkippedPageData` uses `true` for the same field.
 */
export type PageData = {
	url: ExURL;
	redirectPaths: string[];
	isTarget: boolean;
	isExternal: boolean;

	// Response information
	status: number;
	statusText: string;
	contentType: string | null;
	contentLength: number | null;
	responseHeaders: Record<string, string | string[] | undefined> | null;

	// Page content
	meta: Meta;
	anchorList: AnchorData[];
	imageList: ImageElement[];
	html: string;

	isSkipped: false;
};
237
+
238
/**
 * Metadata of a page. Only `title` is required.
 */
export type Meta = {
	lang?: string;
	title: string;
	description?: string;
	keywords?: string;

	// Boolean flags
	noindex?: boolean;
	nofollow?: boolean;
	noarchive?: boolean;

	canonical?: string;
	alternate?: string;

	// `og:*` entries
	'og:type'?: string;
	'og:title'?: string;
	'og:site_name'?: string;
	'og:description'?: string;
	'og:url'?: string;
	'og:image'?: string;

	'twitter:card'?: string;
};
256
+
257
/**
 * Data extracted from an anchor element.
 */
export type AnchorData = {
	/** Value of the `href` attribute of the anchor element (`<a>`, `<area>`). */
	href: ExURL;

	/** Accessible name of the anchor element. */
	textContent: string;
};
268
+
269
/**
 * Data of an image found on a page.
 */
export type ImageElement = {
	src: string;
	currentSrc: string;
	alt: string;

	// Dimensions
	width: number;
	height: number;
	naturalWidth: number;
	naturalHeight: number;

	isLazy: boolean;
	viewportWidth: number;
	sourceCode: string;
};
281
+
282
/**
 * Log of one network request and, when present, its response.
 */
export type NetworkLog = {
	url: ExURL;
	status: number | null;
	contentLength: number;
	contentType: string;
	isError: boolean;

	/** The request. */
	request: {
		ts: number;
		headers: Record<string, string>;
		method: string;
	};

	/** The response; may be absent. */
	response?: {
		ts: number;
		status: number;
		statusText: string;
		fromCache: boolean;
		headers: Record<string, string>;
	};
};
301
+
302
/**
 * A resource record.
 */
export type Resource = {
	url: ExURL;
	isExternal: boolean;
	isError: boolean;

	// Response information (null when not available)
	status: number | null;
	statusText: string | null;
	contentType: string | null;
	contentLength: number | null;

	/** Compression type, or `false`. */
	compress: false | CompressType;

	/** CDN type, or `false`. */
	cdn: false | CDNType;

	headers: Record<string, string | string[] | undefined> | null;
};
314
+
315
/**
 * Compression type names.
 */
export type CompressType =
	| 'gzip'
	| 'compress'
	| 'deflate'
	| 'br'
	| 'sdch' // cspell:disable-line
	| 'vcdiff' // cspell:disable-line
	| 'xdelta'; // cspell:disable-line

/**
 * CDN names.
 */
export type CDNType =
	| 'Amazon S3'
	| 'Amazon CloudFront'
	| 'IIJ'
	| 'Cloudflare'
	| 'Akamai';

/**
 * HTTP request method names.
 */
export type HTTPMethod =
	| 'HEAD'
	| 'GET'
	| 'POST'
	| 'PATCH'
	| 'PUT'
	| 'DELETE'
	| 'OPTIONS';
327
+
328
/**
 * Data of a page that was skipped.
 *
 * `isSkipped` is always `true` here; `PageData` uses `false` for the same field.
 */
export type SkippedPageData = {
	isSkipped: true;
	url: ExURL;

	/** What matched, tagged by `type`. */
	matched:
		| {
				type: 'keyword';
				text: string;
				excludeKeywords: string[];
		  }
		| {
				type: 'path';
				excludes: string[];
		  };
};