@d-zero/beholder 0.1.29 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +172 -477
- package/package.json +7 -11
- package/src/debug.ts +5 -2
- package/src/dom-evaluation.ts +195 -65
- package/src/index.ts +27 -3
- package/src/is-error.spec.ts +33 -0
- package/src/is-error.ts +10 -0
- package/src/keyword-check.spec.ts +45 -4
- package/src/keyword-check.ts +5 -3
- package/src/parse-url.spec.ts +35 -0
- package/src/parse-url.ts +26 -0
- package/src/scraper.ts +338 -300
- package/src/types.ts +345 -258
- package/LICENSE +0 -21
- package/dist/debug.d.ts +0 -6
- package/dist/debug.js +0 -6
- package/dist/dom-evaluation.d.ts +0 -51
- package/dist/dom-evaluation.js +0 -147
- package/dist/events.d.ts +0 -32
- package/dist/events.js +0 -15
- package/dist/fetch-destination.d.ts +0 -8
- package/dist/fetch-destination.js +0 -145
- package/dist/index.d.ts +0 -4
- package/dist/index.js +0 -4
- package/dist/keyword-check.d.ts +0 -6
- package/dist/keyword-check.js +0 -15
- package/dist/net-timeout-error.d.ts +0 -3
- package/dist/net-timeout-error.js +0 -3
- package/dist/scraper.d.ts +0 -15
- package/dist/scraper.js +0 -621
- package/dist/sub-process-runner.d.ts +0 -12
- package/dist/sub-process-runner.js +0 -180
- package/dist/sub-process.d.ts +0 -1
- package/dist/sub-process.js +0 -67
- package/dist/types.d.ts +0 -271
- package/dist/types.js +0 -1
- package/dist/utils.d.ts +0 -16
- package/dist/utils.js +0 -69
- package/src/events.ts +0 -21
- package/src/fetch-destination.ts +0 -173
- package/src/net-timeout-error.ts +0 -3
- package/src/sub-process-runner.ts +0 -220
- package/src/sub-process.ts +0 -86
- package/src/utils.ts +0 -89
- package/tsconfig.tsbuildinfo +0 -1
|
@@ -1,180 +0,0 @@
|
|
|
1
|
-
import childProcess from 'node:child_process';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

import { delay } from '@d-zero/shared/delay';
import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
import { isType } from 'typescript-fsa';

import { scraperLog } from './debug.js';
import { scraperEvent, subProcessEvent } from './events.js';
|
|
8
|
-
// Location of this module on disk. `fileURLToPath` is used instead of
// `URL#pathname` because `pathname` stays percent-encoded (e.g. `%20` for a
// space) and is not a valid path on Windows (`/C:/...`).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Path of the sibling `sub-process` entry script that is handed to `child_process.fork()`.
const SUB_PROCESS_PATH = path.resolve(__dirname, 'sub-process');
|
|
11
|
-
/**
 * Owns one forked child process, sends it scrape jobs over IPC and re-emits
 * the actions the child reports.
 *
 * Emitted events (see the emit calls below): `reset`, `changePhase`,
 * `scrapeEvent` and `error`.
 */
export default class SubProcessRunner extends TypedAwaitEventEmitter {
    /** Number of finished scrapes after which the child process is destroyed. */
    #resetTime;
    /** Scrapes finished by the current child process. */
    #scrapedTimes = 0;
    /** Either `'waiting'` or `'running'`. */
    #state = 'waiting';
    /** The currently managed child process, or `null` when there is none. */
    #subProcess = null;
    /** PIDs of children that were released without having been sent a kill signal. */
    #undeadPid = new Set();

    /** Current state of the runner: `'waiting'` or `'running'`. */
    get state() {
        return this.#state;
    }

    /**
     * @param resetTime How many scrapes one child process may finish before it is destroyed
     */
    constructor(resetTime) {
        super();
        this.#resetTime = resetTime;
    }

    /**
     * Asks the current child process (if any) to destroy itself and releases it.
     *
     * NOTE: the method name is misspelled ("destory"); it is kept as-is because
     * it is part of the public interface.
     */
    destory() {
        const subProcess = this.#subProcess;
        if (subProcess) {
            const pid = subProcess.pid;
            scraperLog('Destroys child_process (%d)', pid);
            subProcess.send(subProcessEvent.destroy());
            if (pid) {
                void this.emit('reset', {
                    pid,
                });
                void this.emit('changePhase', {
                    pid,
                    name: 'reset',
                    url: null,
                    isExternal: false,
                    message: 'Reseting sub-process',
                });
            }
        }
        this.#destroyed();
    }

    /**
     * @returns A copy of the list of PIDs that were released without being killed
     */
    getUndeadPid() {
        return [...this.#undeadPid];
    }

    /**
     * Sends SIGKILL to the current child process unless a signal was already
     * delivered to it. Does nothing when there is no child process.
     */
    kill() {
        const subProcess = this.#subProcess;
        if (!subProcess) {
            return;
        }
        if (subProcess.killed) {
            scraperLog('child_process(%d) is already killed', subProcess.pid);
            return;
        }
        scraperLog('Kills(SIGKILL) child_process (%d) ', subProcess.pid);
        subProcess.kill('SIGKILL');
    }

    /**
     * Starts scraping `url` in the child process (forking one if needed).
     * The work itself is asynchronous; results arrive as events.
     *
     * @param url Target URL
     * @param options Scrape options forwarded to the child process
     * @param isSkip Forwarded to the child process as `isSkip`
     * @param interval Milliseconds to wait before the job is sent; values <= 0 mean no wait
     * @throws {Error} When a job is already running
     */
    start(url, options, isSkip, interval) {
        if (this.#state === 'running') {
            throw new Error(`Sub Routine (PID: ${this.#subProcess?.pid}) is already running`);
        }
        void this.#scrape(url, options, isSkip, interval);
    }

    /**
     * Forks the child process and wires its `message`, `disconnect` and
     * `error` events to this runner.
     *
     * @param url URL reported in `error` events raised by the child process object
     * @returns The forked child process
     */
    #createSubProcess(url) {
        const subProcess = childProcess.fork(SUB_PROCESS_PATH, {
            detached: false,
        });
        subProcess.on('message', (action) => {
            // Only the child that is currently managed may change the runner's
            // state. A late message from a child that was already released
            // must not reset (or null out) the child that replaced it.
            const isCurrent = this.#subProcess === subProcess;
            if (isType(action, scraperEvent.changePhase)) {
                void this.emit('changePhase', action.payload);
            }
            if (isCurrent &&
                (isType(action, scraperEvent.ignoreAndSkip) ||
                    isType(action, scraperEvent.scrapeEnd))) {
                this.#finished();
            }
            if (isCurrent && isType(action, scraperEvent.destroyed)) {
                this.#destroyed();
            }
            if (isType(action, scraperEvent.ignoreAndSkip) ||
                isType(action, scraperEvent.resourceResponse) ||
                isType(action, scraperEvent.scrapeEnd) ||
                isType(action, scraperEvent.destroyed)) {
                void this.emit('scrapeEvent', action);
            }
            if (isType(action, scraperEvent.error)) {
                // The error crossed the IPC boundary as a plain object;
                // rebuild a real Error instance from its fields.
                const error = new Error(action.payload.error.message);
                error.name = action.payload.error.name;
                error.stack = action.payload.error.stack;
                const _action = {
                    type: action.type,
                    payload: {
                        pid: action.payload.pid,
                        url: action.payload.url,
                        shutdown: action.payload.shutdown,
                        error,
                    },
                };
                void this.emit('scrapeEvent', _action);
            }
        });
        subProcess.on('disconnect', () => {
            if (subProcess.killed) {
                scraperLog('child_process(%d) is disconnected and killed', subProcess.pid);
                return;
            }
            scraperLog('child_process(%d) is disconnected but not killed', subProcess.pid);
            scraperLog('Retries to kill(SIGTERM) child_process(%d)', subProcess.pid);
            subProcess.kill('SIGTERM');
            void this.emit('changePhase', {
                pid: subProcess.pid,
                name: 'disconnect',
                url: null,
                isExternal: false,
                message: 'Disconnecting sub-process',
            });
        });
        subProcess.on('error', (e) => {
            void this.emit('error', {
                pid: subProcess.pid,
                url,
                shutdown: true,
                error: e instanceof Error ? e : new Error(`${e}`),
            });
        });
        return subProcess;
    }

    /**
     * Releases the current child process. If it was never sent a kill signal
     * its PID is remembered in the undead list. Resets the scrape counter and
     * the state.
     */
    #destroyed() {
        if (this.#subProcess && !this.#subProcess.killed && this.#subProcess.pid) {
            scraperLog('Add child_process(%d) to the undead PID list', this.#subProcess.pid);
            this.#undeadPid.add(this.#subProcess.pid);
        }
        this.#subProcess = null;
        this.#scrapedTimes = 0;
        this.#finally();
    }

    /** Marks the runner as ready for the next job. */
    #finally() {
        this.#state = 'waiting';
    }

    /**
     * Counts a finished scrape and destroys the child process once it has
     * handled `resetTime` scrapes; otherwise just returns to `'waiting'`.
     */
    #finished() {
        this.#scrapedTimes += 1;
        if (this.#scrapedTimes >= this.#resetTime) {
            this.destory();
        }
        else {
            this.#finally();
        }
    }

    /**
     * Ensures a child process exists, waits for `interval` and sends the start
     * action to the child.
     *
     * @param url Target URL
     * @param options Scrape options
     * @param isSkip Forwarded as `isSkip`
     * @param interval Milliseconds to wait before sending
     */
    async #scrape(url, options, isSkip, interval) {
        if (!this.#subProcess) {
            this.#subProcess = this.#createSubProcess(url);
            void this.emit('changePhase', {
                pid: this.#subProcess.pid,
                name: 'boot',
                url,
                isExternal: options.isExternal,
                message: 'Booting sub-process',
            });
        }
        const subProcess = this.#subProcess;
        this.#state = 'running';
        const waitTime = Math.max(interval, 0);
        if (waitTime) {
            await delay(waitTime);
        }
        // The runner may have been destroyed (or restarted with another child)
        // while waiting. Sending now would either dereference `null` or hand
        // this job to a child that is busy with a different one, so report the
        // job as failed instead.
        if (this.#subProcess !== subProcess) {
            void this.emit('error', {
                pid: subProcess.pid,
                url,
                shutdown: true,
                error: new Error(`Sub-process (PID: ${subProcess.pid}) was destroyed before scraping started`),
            });
            return;
        }
        subProcess.send(subProcessEvent.start({
            url,
            isExternal: options.isExternal,
            isGettingImages: options.isGettingImages,
            excludeKeywords: options.excludeKeywords,
            executablePath: options.executablePath,
            disableQueries: options.disableQueries ?? false,
            isSkip,
            isTitleOnly: options.isTitleOnly,
            screenshot: options.screenshot,
        }));
    }
}
|
package/dist/sub-process.d.ts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
// Empty export list: this declaration file declares no exported names.
export {};
|
package/dist/sub-process.js
DELETED
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
// Run on child_process
|
|
2
|
-
import { isType } from 'typescript-fsa';
|
|
3
|
-
import { scraperLog } from './debug.js';
|
|
4
|
-
import { scraperEvent, subProcessEvent } from './events.js';
|
|
5
|
-
import Scraper from './scraper.js';
|
|
6
|
-
// Logger namespaced with this process's PID.
const log = scraperLog.extend(`${process.pid}`);

process.title = 'beholder';

const scraper = new Scraper();

// Scraper events that are forwarded to the parent unchanged, wrapped in the
// action creator of the same name. Nothing is sent once the IPC channel is closed.
for (const eventName of ['resourceResponse', 'ignoreAndSkip', 'scrapeEnd', 'changePhase']) {
    scraper.on(eventName, (context) => {
        if (!process.connected) {
            return;
        }
        process.send(scraperEvent[eventName](context));
    });
}

// Errors are forwarded as a plain `{ name, message, stack }` object instead of
// the Error instance itself.
scraper.on('error', (context) => {
    if (!process.connected) {
        return;
    }
    const { name, message, stack } = context.error;
    process.send(scraperEvent.error({
        ...context,
        error: { name, message, stack },
    }));
});

// Commands from the parent process.
process.on('message', async (action) => {
    if (isType(action, subProcessEvent.start)) {
        const { payload } = action;
        const scrapeOptions = {
            isExternal: payload.isExternal,
            isGettingImages: payload.isGettingImages,
            excludeKeywords: payload.excludeKeywords,
            executablePath: payload.executablePath,
            disableQueries: payload.disableQueries,
            isTitleOnly: payload.isTitleOnly,
            screenshot: payload.screenshot,
        };
        void scraper.scrapeStart(payload.url, scrapeOptions, payload.isSkip);
    }
    if (isType(action, subProcessEvent.destroy)) {
        await scraper.destroy(false);
    }
});

// After the scraper is destroyed, notify the parent and close the IPC channel.
scraper.on('destroyed', (context) => {
    if (!process.connected) {
        return;
    }
    process.send(scraperEvent.destroyed(context));
    log('disconnects process');
    process.disconnect();
});

process.on('disconnect', () => {
    log('Process is disconnected');
});
|
package/dist/types.d.ts
DELETED
|
@@ -1,271 +0,0 @@
|
|
|
1
|
-
import type { Action } from 'typescript-fsa';
|
|
2
|
-
/**
 * Fields shared by the scraper event payloads.
 */
export type ScrapeEvent = {
    /** Process ID attached to the event; may be `undefined`. */
    pid: number | undefined;
    /** URL the event relates to. */
    url: ExURL;
};
|
|
6
|
-
/**
 * Payload of an error event: the common event fields plus the error details.
 */
export type ScrapeErrorEvent = ScrapeEvent & {
    /** Shutdown flag reported with the error. */
    shutdown: boolean;
    /** Plain-object form of the error. */
    error: {
        name: string;
        message: string;
        stack?: string;
    };
};
|
|
14
|
-
/**
 * Payload type of each scraper event, keyed by event name.
 */
export type ScrapeEventTypes = {
    /** Carries the matched text and the exclude keywords as the reason. */
    ignoreAndSkip: ScrapeEvent & {
        reason: {
            matchedText: string;
            excludeKeywords: string[];
        };
    };
    /** Carries a network log entry and the resource (without its `uid`). */
    resourceResponse: ScrapeEvent & {
        log: NetworkLog;
        resource: Omit<Resource, 'uid'>;
    };
    /** Carries the scraped page data and a timestamp. */
    scrapeEnd: ScrapeEvent & {
        timestamp: number;
        result: PageData;
    };
    /** Same as `ScrapeEvent` but without `url`. */
    destroyed: Omit<ScrapeEvent, 'url'>;
    error: ScrapeErrorEvent;
    changePhase: ChangePhaseEvent;
};
|
|
33
|
-
/**
 * Payload of a `changePhase` event reported by the scraper.
 */
export type ChangePhaseEvent = {
    pid: number;
    /** Name of the phase. */
    name:
        | 'scrapeStart'
        | 'launchBrowser'
        | 'touchHead'
        | 'touchHeadTimeout'
        | 'newPage'
        | 'openPage'
        | 'loadDOMContent'
        | 'waitNetworkIdleZero'
        | 'getHTML'
        | 'setViewport'
        | 'scrollToBottom'
        | 'getImages'
        | 'getAnchors'
        | 'getMeta'
        | 'ignoreAndSkip'
        | 'scrapeEnd'
        | 'beforeDestroy'
        | 'destroyed';
    /** Related URL, or `null` when there is none. */
    url: ExURL | null;
    isExternal: boolean;
    /** Human-readable description of the phase. */
    message: string;
};
|
|
40
|
-
/** Union of every payload type listed in `ScrapeEventTypes`. */
export type AnyScrapeEvent = ScrapeEventTypes[keyof ScrapeEventTypes];
|
|
41
|
-
/**
 * Payload type of each command sent to the sub-process, keyed by command name.
 */
export type SubProcessEventTypes = {
    /** Scrape job description; every `ParseURLOptions` field is required here. */
    start: {
        url: ExURL;
        isExternal: boolean;
        isGettingImages: boolean;
        excludeKeywords: string[];
        executablePath: string | null;
        isSkip: boolean;
        isTitleOnly: boolean;
        screenshot: string | null;
    } & Required<ParseURLOptions>;
    /** The destroy command carries no payload. */
    destroy: void;
};
|
|
54
|
-
/**
 * Payload that identifies a sub-process by its PID.
 */
export type SubProcessEvent = {
    /** Process ID; may be `undefined`. */
    pid: number | undefined;
};
|
|
57
|
-
/**
 * Either a scraper `ChangePhaseEvent`, or a phase change with one of the
 * additional names `reset`, `boot` or `disconnect`.
 */
export type SubProcessChangeEvent =
    | ChangePhaseEvent
    | {
        pid: number | undefined;
        name: 'reset' | 'boot' | 'disconnect';
        url: ExURL | null;
        isExternal: boolean;
        message: string;
    };
|
|
64
|
-
/**
 * Payload type of each event in the sub-process runner's event map, keyed by event name.
 */
export type SubProcessRunnerEventTypes = {
    reset: SubProcessEvent;
    /** A scraper action wrapping any of the scraper payloads. */
    scrapeEvent: Action<AnyScrapeEvent>;
    changePhase: SubProcessChangeEvent;
    error: ScrapeErrorEvent;
};
|
|
70
|
-
/**
 * A URL broken down into its parts.
 */
export type ExURL = {
    /** Full URL (optimized). */
    href: string;
    /** Full URL as it was before parsing. */
    _originUrlString: string;
    /** Full URL without the hash. */
    withoutHash: string;
    /** Full URL without the hash and the authentication part. */
    withoutHashAndAuth: string;
    /**
     * Protocol or URI scheme, including the trailing `:`.
     *
     * - case-insensitive
     */
    protocol: string;
    /** Whether the protocol is HTTP or HTTPS. */
    isHTTP: boolean;
    /** Whether the protocol is HTTPS. */
    isSecure: boolean;
    /** User name of the authentication part. */
    username: string | null;
    /** Password of the authentication part. */
    password: string | null;
    /**
     * Host name.
     *
     * - case-insensitive
     * - non-ASCII characters are encoded
     * - without the port number
     */
    hostname: string;
    /** Port number. */
    port: string | null;
    /**
     * Path part. It is only `/` if the pathname is empty.
     *
     * - case-sensitive
     */
    pathname: string | null;
    /** Array of path segments. */
    paths: string[];
    /** Depth of the paths. */
    depth: number;
    /** Directory name of the paths. It is `null` if the path is `/` only. */
    dirname: string | null;
    /** Base name of the paths (file name without the file extension). */
    basename: string | null;
    /** Whether it is an index page (`true` if `basename` is `null`). */
    isIndex: boolean;
    /** File extension name, including the leading `.`. */
    extname: string | null;
    /**
     * Search query, without the leading `?`.
     *
     * - case-sensitive
     */
    query: string | null;
    /**
     * Hash, including the leading `#`.
     *
     * - case-sensitive
     */
    hash: string | null;
};
|
|
167
|
-
/**
 * Options for URL parsing.
 */
export type ParseURLOptions = {
    /** Optional flag; presumably drops query strings when parsing (confirm against the parser). */
    disableQueries?: boolean;
};
|
|
170
|
-
/**
 * Data collected for a page that was not skipped (`isSkipped` is always `false`).
 */
export type PageData = {
    url: ExURL;
    redirectPaths: string[];
    isTarget: boolean;
    isExternal: boolean;
    /** Status code and status text of the response. */
    status: number;
    statusText: string;
    contentType: string | null;
    contentLength: number | null;
    responseHeaders: Record<string, string | string[] | undefined> | null;
    meta: Meta;
    anchorList: AnchorData[];
    imageList: ImageElement[];
    html: string;
    /** Discriminates this type from `SkippedPageData`. */
    isSkipped: false;
};
|
|
186
|
-
/**
 * Metadata of a page. Only `title` is required.
 */
export type Meta = {
    lang?: string;
    title: string;
    description?: string;
    keywords?: string;
    noindex?: boolean;
    nofollow?: boolean;
    noarchive?: boolean;
    canonical?: string;
    alternate?: string;
    // Open Graph / Twitter card properties, keyed by their property name
    'og:type'?: string;
    'og:title'?: string;
    'og:site_name'?: string;
    'og:description'?: string;
    'og:url'?: string;
    'og:image'?: string;
    'twitter:card'?: string;
};
|
|
204
|
-
/**
 * A link found on a page.
 */
export type AnchorData = {
    /** Value of the `href` attribute of an anchor element (`<a>`, `<area>`). */
    href: ExURL;
    /** Accessible name of the anchor element. */
    textContent: string;
};
|
|
214
|
-
/**
 * Information recorded for an image on a page.
 */
export type ImageElement = {
    src: string;
    currentSrc: string;
    alt: string;
    width: number;
    height: number;
    naturalWidth: number;
    naturalHeight: number;
    isLazy: boolean;
    viewportWidth: number;
    sourceCode: string;
};
|
|
226
|
-
/**
 * Log entry for one network request and, when present, its response.
 */
export type NetworkLog = {
    url: ExURL;
    status: number | null;
    contentLength: number;
    contentType: string;
    isError: boolean;
    /** Request side of the exchange. */
    request: {
        /** Timestamp (unit not visible here). */
        ts: number;
        headers: Record<string, string>;
        method: string;
    };
    /** Response side of the exchange; optional. */
    response?: {
        /** Timestamp (unit not visible here). */
        ts: number;
        status: number;
        statusText: string;
        fromCache: boolean;
        headers: Record<string, string>;
    };
};
|
|
245
|
-
/**
 * Description of a fetched resource.
 */
export type Resource = {
    url: ExURL;
    isExternal: boolean;
    isError: boolean;
    status: number | null;
    statusText: string | null;
    contentType: string | null;
    contentLength: number | null;
    /** Compression type, or `false`. */
    compress: false | CompressType;
    /** CDN type, or `false`. */
    cdn: false | CDNType;
    headers: Record<string, string | string[] | undefined> | null;
};
|
|
257
|
-
/** Names of compression schemes (the non-`false` results of `detectCompress`). */
export type CompressType = 'gzip' | 'compress' | 'deflate' | 'br' | 'sdch' | 'vcdiff' | 'xdelta';
|
|
258
|
-
/** Names of CDN / hosting services (the non-`false` results of `detectCDN`). */
export type CDNType = 'Amazon S3' | 'Amazon CloudFront' | 'IIJ' | 'Cloudflare' | 'Akamai';
|
|
259
|
-
/** HTTP request method names. */
export type HTTPMethod = 'HEAD' | 'GET' | 'POST' | 'PATCH' | 'PUT' | 'DELETE' | 'OPTIONS';
|
|
260
|
-
/**
 * Record of a page that was skipped (`isSkipped` is always `true`).
 */
export type SkippedPageData = {
    /** Discriminates this type from `PageData`. */
    isSkipped: true;
    url: ExURL;
    /** What the skip matched: a keyword match or a path match. */
    matched:
        | {
            type: 'keyword';
            text: string;
            excludeKeywords: string[];
        }
        | {
            type: 'path';
            excludes: string[];
        };
};
|
package/dist/types.js
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
// Empty export list: this module has no runtime exports.
export {};
|
package/dist/utils.d.ts
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import type { CDNType, CompressType } from './types.js';
|
|
2
|
-
/**
 * Tells whether an HTTP status code counts as an error.
 *
 * @param status HTTP status code
 * @returns `false` for codes from 200 up to (but not including) 400, `true` otherwise
 */
export declare function isError(status: number): boolean;
|
|
7
|
-
/**
 * Detects the compression scheme from the `content-encoding` header.
 *
 * @param headers Headers keyed by header name
 * @returns The detected compression type, or `false` when none is recognized
 */
export declare function detectCompress(headers: Record<string, string | string[] | undefined>): CompressType | false;
|
|
12
|
-
/**
 * Detects the CDN / hosting service from the headers.
 *
 * @param headers Headers keyed by header name
 * @returns The detected CDN type, or `false` when none is recognized
 */
export declare function detectCDN(headers: Record<string, string | string[] | undefined>): CDNType | false;
|
package/dist/utils.js
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
/**
 * Tells whether an HTTP status code counts as an error.
 *
 * @param status HTTP status code
 * @returns `false` when `200 <= status < 400`, `true` otherwise
 */
export function isError(status) {
    const isSuccessOrRedirect = status >= 200 && status < 400;
    return !isSuccessOrRedirect;
}
|
|
8
|
-
/**
 * Detects the compression scheme named in the `content-encoding` header.
 *
 * The header is only looked at when its value is a string. Candidates are
 * tried in a fixed order and the first case-insensitive substring match wins.
 *
 * @param headers Headers keyed by header name
 * @returns The matched compression name, or `false` when nothing matches
 */
export function detectCompress(headers) {
    const hasStringEncoding = 'content-encoding' in headers && typeof headers['content-encoding'] === 'string';
    const encoding = hasStringEncoding ? headers['content-encoding'] : '';
    // Order matters: the first matching entry is returned.
    // cspell:disable
    const candidates = [
        [/gzip/i, 'gzip'],
        [/br/i, 'br'],
        [/compress/i, 'compress'],
        [/deflate/i, 'deflate'],
        [/sdch/i, 'sdch'],
        [/vcdiff/i, 'vcdiff'],
        [/xdelta/i, 'xdelta'],
    ];
    // cspell:enable
    const matched = candidates.find(([pattern]) => pattern.test(encoding));
    return matched ? matched[1] : false;
}
|
|
45
|
-
/**
 * Detects the CDN / hosting service that served a response from its headers.
 *
 * Checks, in this order: the presence of `x-akamai-transformed`,
 * `x-amz-cf-pop` and `x-iij-cache`, then the `server` header value
 * (`cloudflare`, `amazons3`, case-insensitive).
 *
 * @param headers Headers keyed by header name (any letter case)
 * @returns The detected CDN name, or `false` when none is recognized
 */
export function detectCDN(headers) {
    // HTTP header names are case-insensitive. The previous mixed-case lookups
    // (`X-Akamai-Transformed`, `X-IIJ-Cache`) could never match a header map
    // with lower-cased names, which is the form the other lookups here
    // (`x-amz-cf-pop`, `server`) expect. Compare on lower-cased names instead.
    const byLowerName = new Map();
    for (const [name, value] of Object.entries(headers)) {
        byLowerName.set(name.toLowerCase(), value);
    }
    if (byLowerName.has('x-akamai-transformed')) {
        return 'Akamai';
    }
    if (byLowerName.has('x-amz-cf-pop')) {
        return 'Amazon CloudFront';
    }
    if (byLowerName.has('x-iij-cache')) {
        return 'IIJ';
    }
    const server = byLowerName.get('server');
    if (typeof server !== 'string') {
        return false;
    }
    if (/cloudflare/i.test(server)) {
        return 'Cloudflare';
    }
    if (/amazons3/i.test(server)) {
        return 'Amazon S3';
    }
    return false;
}
|
package/src/events.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import type { ScrapeEventTypes, SubProcessEventTypes } from './types.js';
|
|
2
|
-
|
|
3
|
-
import { actionCreatorFactory } from 'typescript-fsa';
|
|
4
|
-
|
|
5
|
-
// Action-creator factories, one per direction of the IPC traffic; the string
// passed to each factory is the prefix given to `typescript-fsa`.
const createScraperAction = actionCreatorFactory('@@scraper');
const createSubProcessAction = actionCreatorFactory('@@sub-process');

/**
 * Action creators for the commands defined in `SubProcessEventTypes`.
 */
export const subProcessEvent = {
	start: createSubProcessAction<SubProcessEventTypes['start']>('start'),
	destroy: createSubProcessAction<SubProcessEventTypes['destroy']>('destroy'),
};

/**
 * Action creators for the events defined in `ScrapeEventTypes`.
 */
export const scraperEvent = {
	ignoreAndSkip: createScraperAction<ScrapeEventTypes['ignoreAndSkip']>('ignoreAndSkip'),
	resourceResponse:
		createScraperAction<ScrapeEventTypes['resourceResponse']>('resourceResponse'),
	scrapeEnd: createScraperAction<ScrapeEventTypes['scrapeEnd']>('scrapeEnd'),
	destroyed: createScraperAction<ScrapeEventTypes['destroyed']>('destroyed'),
	error: createScraperAction<ScrapeEventTypes['error']>('error'),
	changePhase: createScraperAction<ScrapeEventTypes['changePhase']>('changePhase'),
};
|