website-scrap-engine 0.8.7 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +245 -39
- package/lib/downloader/main.d.ts.map +1 -1
- package/lib/downloader/main.js +15 -16
- package/lib/downloader/main.js.map +1 -1
- package/lib/downloader/multi.d.ts.map +1 -1
- package/lib/downloader/multi.js +3 -4
- package/lib/downloader/multi.js.map +1 -1
- package/lib/downloader/pipeline-executor-impl.d.ts +6 -3
- package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -1
- package/lib/downloader/pipeline-executor-impl.js +86 -2
- package/lib/downloader/pipeline-executor-impl.js.map +1 -1
- package/lib/downloader/single.d.ts.map +1 -1
- package/lib/downloader/single.js +4 -5
- package/lib/downloader/single.js.map +1 -1
- package/lib/downloader/worker-pool.d.ts.map +1 -1
- package/lib/downloader/worker-pool.js +12 -7
- package/lib/downloader/worker-pool.js.map +1 -1
- package/lib/downloader/worker-type.d.ts +2 -2
- package/lib/downloader/worker-type.d.ts.map +1 -1
- package/lib/downloader/worker.js +2 -4
- package/lib/downloader/worker.js.map +1 -1
- package/lib/index.d.ts +2 -0
- package/lib/index.d.ts.map +1 -1
- package/lib/index.js +1 -0
- package/lib/index.js.map +1 -1
- package/lib/life-cycle/adapters.d.ts +7 -1
- package/lib/life-cycle/adapters.d.ts.map +1 -1
- package/lib/life-cycle/adapters.js +6 -0
- package/lib/life-cycle/adapters.js.map +1 -1
- package/lib/life-cycle/default-life-cycle.d.ts.map +1 -1
- package/lib/life-cycle/default-life-cycle.js +3 -1
- package/lib/life-cycle/default-life-cycle.js.map +1 -1
- package/lib/life-cycle/default-status-listener.d.ts +4 -0
- package/lib/life-cycle/default-status-listener.d.ts.map +1 -0
- package/lib/life-cycle/default-status-listener.js +38 -0
- package/lib/life-cycle/default-status-listener.js.map +1 -0
- package/lib/life-cycle/download-resource.d.ts.map +1 -1
- package/lib/life-cycle/download-resource.js +13 -3
- package/lib/life-cycle/download-resource.js.map +1 -1
- package/lib/life-cycle/index.d.ts +1 -0
- package/lib/life-cycle/index.d.ts.map +1 -1
- package/lib/life-cycle/index.js +1 -0
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/pipeline-executor.d.ts +10 -3
- package/lib/life-cycle/pipeline-executor.d.ts.map +1 -1
- package/lib/life-cycle/types.d.ts +56 -4
- package/lib/life-cycle/types.d.ts.map +1 -1
- package/lib/logger/default-logger.d.ts +3 -0
- package/lib/logger/default-logger.d.ts.map +1 -0
- package/lib/logger/default-logger.js +11 -0
- package/lib/logger/default-logger.js.map +1 -0
- package/lib/logger/log4js-adapter.d.ts +3 -0
- package/lib/logger/log4js-adapter.d.ts.map +1 -0
- package/lib/logger/log4js-adapter.js +143 -0
- package/lib/logger/log4js-adapter.js.map +1 -0
- package/lib/logger/logger-worker.d.ts +3 -4
- package/lib/logger/logger-worker.d.ts.map +1 -1
- package/lib/logger/logger-worker.js +21 -20
- package/lib/logger/logger-worker.js.map +1 -1
- package/lib/logger/logger.d.ts +13 -11
- package/lib/logger/logger.d.ts.map +1 -1
- package/lib/logger/logger.js +32 -14
- package/lib/logger/logger.js.map +1 -1
- package/lib/logger/types.d.ts +23 -0
- package/lib/logger/types.d.ts.map +1 -0
- package/lib/logger/types.js +2 -0
- package/lib/logger/types.js.map +1 -0
- package/lib/options.d.ts +5 -4
- package/lib/options.d.ts.map +1 -1
- package/lib/options.js +6 -4
- package/lib/options.js.map +1 -1
- package/package.json +7 -5
- package/src/downloader/main.ts +15 -14
- package/src/downloader/multi.ts +3 -5
- package/src/downloader/pipeline-executor-impl.ts +98 -2
- package/src/downloader/single.ts +4 -5
- package/src/downloader/worker-pool.ts +12 -6
- package/src/downloader/worker-type.ts +2 -2
- package/src/downloader/worker.ts +2 -7
- package/src/index.ts +2 -0
- package/src/life-cycle/adapters.ts +13 -0
- package/src/life-cycle/default-life-cycle.ts +3 -1
- package/src/life-cycle/default-status-listener.ts +40 -0
- package/src/life-cycle/download-resource.ts +13 -4
- package/src/life-cycle/index.ts +1 -0
- package/src/life-cycle/pipeline-executor.ts +16 -2
- package/src/life-cycle/types.ts +79 -3
- package/src/logger/default-logger.ts +12 -0
- package/src/logger/log4js-adapter.ts +147 -0
- package/src/logger/logger-worker.ts +24 -23
- package/src/logger/logger.ts +36 -16
- package/src/logger/types.ts +35 -0
- package/src/options.ts +11 -7
- package/lib/logger/config-logger.d.ts +0 -3
- package/lib/logger/config-logger.d.ts.map +0 -1
- package/lib/logger/config-logger.js +0 -92
- package/lib/logger/config-logger.js.map +0 -1
- package/src/logger/config-logger.ts +0 -95
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "website-scrap-engine",
|
|
3
|
-
"version": "0.8.7",
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "Configurable website scraper in typescript",
|
|
5
5
|
"main": "lib/index.js",
|
|
6
6
|
"types": "lib/index.d.ts",
|
|
@@ -23,7 +23,6 @@
|
|
|
23
23
|
"cheerio": "^1.2.0",
|
|
24
24
|
"css-url-parser": "^1.1.4",
|
|
25
25
|
"got": "^13.0.0",
|
|
26
|
-
"log4js": "^6.9.1",
|
|
27
26
|
"mkdirp": "^3.0.1",
|
|
28
27
|
"p-queue": "^8.1.1",
|
|
29
28
|
"srcset": "^5.0.3",
|
|
@@ -33,13 +32,16 @@
|
|
|
33
32
|
"@jest/globals": "^30.1.1",
|
|
34
33
|
"@types/node": "^25.5.0",
|
|
35
34
|
"@types/urijs": "^1.19.26",
|
|
36
|
-
"@typescript-eslint/eslint-plugin": "^8.
|
|
37
|
-
"@typescript-eslint/parser": "^8.
|
|
35
|
+
"@typescript-eslint/eslint-plugin": "^8.58.0",
|
|
36
|
+
"@typescript-eslint/parser": "^8.58.0",
|
|
38
37
|
"eslint": "^9.39.2",
|
|
39
38
|
"jest": "^30.3.0",
|
|
40
|
-
"ts-jest": "^29.4.
|
|
39
|
+
"ts-jest": "^29.4.9",
|
|
41
40
|
"typescript": "^5.9.3"
|
|
42
41
|
},
|
|
42
|
+
"optionalDependencies": {
|
|
43
|
+
"log4js": "^6.9.1"
|
|
44
|
+
},
|
|
43
45
|
"overrides": {
|
|
44
46
|
"undici": "npm:@favware/skip-dependency@1.2.2"
|
|
45
47
|
},
|
package/src/downloader/main.ts
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
import PQueue from 'p-queue';
|
|
2
|
-
import type {HTTPError} from 'got';
|
|
3
2
|
import URI from 'urijs';
|
|
4
3
|
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
|
|
5
4
|
import {mergeOverrideOptions} from '../options.js';
|
|
6
5
|
import type {RawResource, Resource} from '../resource.js';
|
|
7
6
|
import {normalizeResource, ResourceType} from '../resource.js';
|
|
8
|
-
import {
|
|
7
|
+
import {skip} from '../logger/logger.js';
|
|
8
|
+
import {setLogger} from '../logger/logger.js';
|
|
9
|
+
import {createDefaultLogger} from '../logger/default-logger.js';
|
|
9
10
|
import {importDefaultFromPath} from '../util.js';
|
|
10
11
|
import type {DownloaderStats, DownloaderWithMeta} from './types.js';
|
|
11
12
|
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
|
|
13
|
+
import type {InitSubmitFunc} from '../life-cycle/types.js';
|
|
12
14
|
|
|
13
15
|
export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
14
16
|
readonly queue: PQueue;
|
|
@@ -41,7 +43,7 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
41
43
|
// https://github.com/website-local/website-scrap-engine/issues/1113
|
|
42
44
|
this.queue.concurrency = options.concurrency;
|
|
43
45
|
this._pipeline = new PipelineExecutorImpl(options, options.req, options);
|
|
44
|
-
|
|
46
|
+
setLogger((options.createLogger ?? createDefaultLogger)(options));
|
|
45
47
|
return this._internalInit(options).then(() => {
|
|
46
48
|
this._isInit = true;
|
|
47
49
|
});
|
|
@@ -89,9 +91,12 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
89
91
|
await this._initOptions;
|
|
90
92
|
}
|
|
91
93
|
const pipeline = this.pipeline;
|
|
92
|
-
|
|
94
|
+
const submit: InitSubmitFunc = (url: string) => {
|
|
95
|
+
urlArr.push(url);
|
|
96
|
+
};
|
|
97
|
+
await pipeline.init(pipeline, this, submit);
|
|
93
98
|
// noinspection DuplicatedCode
|
|
94
|
-
for (let i = 0
|
|
99
|
+
for (let i = 0; i < urlArr.length; i++) {
|
|
95
100
|
let url: string | void = urlArr[i];
|
|
96
101
|
url = await pipeline.linkRedirect(url, null, null);
|
|
97
102
|
if (!url) continue;
|
|
@@ -114,6 +119,7 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
114
119
|
// noinspection DuplicatedCode
|
|
115
120
|
if (res.depth > this.options.maxDepth) {
|
|
116
121
|
skip.info('skipped max depth', res.url, res.refUrl, res.depth);
|
|
122
|
+
this.pipeline.notifyStatusChange(res, 'dispose');
|
|
117
123
|
return false;
|
|
118
124
|
}
|
|
119
125
|
let url: string;
|
|
@@ -146,15 +152,10 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
146
152
|
}
|
|
147
153
|
|
|
148
154
|
handleError(err: Error | unknown | null, cause: string, resource: RawResource): void {
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
} else if (err) {
|
|
154
|
-
error.error(cause, resource.url, resource.downloadLink, resource.refUrl, err);
|
|
155
|
-
} else {
|
|
156
|
-
error.error(cause, resource.url, resource.downloadLink, resource.refUrl);
|
|
157
|
-
}
|
|
155
|
+
resource.meta = resource.meta || {};
|
|
156
|
+
resource.meta['error'] = err;
|
|
157
|
+
resource.meta['errorCause'] = cause;
|
|
158
|
+
this.pipeline.notifyStatusChange(resource, 'error');
|
|
158
159
|
}
|
|
159
160
|
|
|
160
161
|
|
package/src/downloader/multi.ts
CHANGED
|
@@ -5,7 +5,6 @@ import type {RawResource, Resource} from '../resource.js';
|
|
|
5
5
|
import type {DownloadWorkerMessage} from './types.js';
|
|
6
6
|
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
|
|
7
7
|
import type {DownloadResource} from '../life-cycle/types.js';
|
|
8
|
-
import {skip} from '../logger/logger.js';
|
|
9
8
|
import {AbstractDownloader} from './main.js';
|
|
10
9
|
|
|
11
10
|
export interface MultiThreadDownloaderOptions extends StaticDownloadOptions {
|
|
@@ -52,7 +51,7 @@ export class MultiThreadDownloader extends AbstractDownloader {
|
|
|
52
51
|
if (this.options.initialUrl) {
|
|
53
52
|
return this.addInitialResource(this.options.initialUrl);
|
|
54
53
|
} else {
|
|
55
|
-
return this.
|
|
54
|
+
return this.addInitialResource([]);
|
|
56
55
|
}
|
|
57
56
|
}
|
|
58
57
|
|
|
@@ -68,7 +67,7 @@ export class MultiThreadDownloader extends AbstractDownloader {
|
|
|
68
67
|
try {
|
|
69
68
|
r = await this.pipeline!.download(res);
|
|
70
69
|
if (!r) {
|
|
71
|
-
|
|
70
|
+
await this.pipeline.notifyStatusChange(res, 'download');
|
|
72
71
|
return;
|
|
73
72
|
}
|
|
74
73
|
} catch (e) {
|
|
@@ -94,8 +93,7 @@ export class MultiThreadDownloader extends AbstractDownloader {
|
|
|
94
93
|
}
|
|
95
94
|
this.downloadedUrl.add(res.url);
|
|
96
95
|
if (!msg) {
|
|
97
|
-
|
|
98
|
-
res.url, res.rawUrl, res.refUrl);
|
|
96
|
+
await this.pipeline.notifyStatusChange(res, 'processAfterDownload');
|
|
99
97
|
return;
|
|
100
98
|
}
|
|
101
99
|
if (msg.error) {
|
|
@@ -1,14 +1,22 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import {existsSync, statSync} from 'node:fs';
|
|
3
|
+
import type {Stats} from 'node:fs';
|
|
1
4
|
import type {StaticDownloadOptions} from '../options.js';
|
|
2
5
|
import type {
|
|
3
6
|
CreateResourceArgument,
|
|
7
|
+
RawResource,
|
|
4
8
|
Resource,
|
|
5
9
|
ResourceEncoding,
|
|
6
10
|
ResourceType
|
|
7
11
|
} from '../resource.js';
|
|
8
12
|
import type {
|
|
9
13
|
DownloadResource,
|
|
14
|
+
ExistingResourceAction,
|
|
15
|
+
ExistingResourceStage,
|
|
16
|
+
InitSubmitFunc,
|
|
10
17
|
ProcessingLifeCycle,
|
|
11
18
|
RequestOptions,
|
|
19
|
+
ResourceStatus,
|
|
12
20
|
SubmitResourceFunc
|
|
13
21
|
} from '../life-cycle/types.js';
|
|
14
22
|
// noinspection ES6PreferShortImport
|
|
@@ -28,11 +36,12 @@ export class PipelineExecutorImpl implements PipelineExecutor {
|
|
|
28
36
|
|
|
29
37
|
async init(
|
|
30
38
|
pipeline: PipelineExecutor,
|
|
31
|
-
downloader?: DownloaderWithMeta
|
|
39
|
+
downloader?: DownloaderWithMeta,
|
|
40
|
+
submit?: InitSubmitFunc
|
|
32
41
|
): Promise<void> {
|
|
33
42
|
if (!this.lifeCycle.init) return;
|
|
34
43
|
for (const init of this.lifeCycle.init) {
|
|
35
|
-
await init(pipeline, downloader);
|
|
44
|
+
await init(pipeline, downloader, submit);
|
|
36
45
|
}
|
|
37
46
|
}
|
|
38
47
|
|
|
@@ -156,6 +165,22 @@ export class PipelineExecutorImpl implements PipelineExecutor {
|
|
|
156
165
|
if (!options) {
|
|
157
166
|
options = this.options;
|
|
158
167
|
}
|
|
168
|
+
if (this.lifeCycle.existingResource) {
|
|
169
|
+
const action = this._checkExistingResource(res, 'download');
|
|
170
|
+
if (action === 'skip') {
|
|
171
|
+
res.shouldBeDiscardedFromDownload = true;
|
|
172
|
+
return undefined;
|
|
173
|
+
}
|
|
174
|
+
if (action === 'ifModifiedSince') {
|
|
175
|
+
const mtime = this._getExistingFileMtime(res);
|
|
176
|
+
if (mtime) {
|
|
177
|
+
requestOptions = Object.assign({}, requestOptions);
|
|
178
|
+
requestOptions.headers = Object.assign({}, requestOptions.headers, {
|
|
179
|
+
'if-modified-since': mtime
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
}
|
|
159
184
|
let downloadedResource: DownloadResource | Resource | void = res;
|
|
160
185
|
for (const download of this.lifeCycle.download) {
|
|
161
186
|
if ((downloadedResource = await download(
|
|
@@ -204,6 +229,29 @@ export class PipelineExecutorImpl implements PipelineExecutor {
|
|
|
204
229
|
if (!options) {
|
|
205
230
|
options = this.options;
|
|
206
231
|
}
|
|
232
|
+
if (this.lifeCycle.existingResource) {
|
|
233
|
+
const action = this._checkExistingResource(res, 'saveToDisk');
|
|
234
|
+
if (action === 'skip' || action === 'skipSave') {
|
|
235
|
+
return undefined;
|
|
236
|
+
}
|
|
237
|
+
if (action === 'ifModifiedSince') {
|
|
238
|
+
const remoteLastMod = res.meta?.headers?.['last-modified'];
|
|
239
|
+
if (remoteLastMod) {
|
|
240
|
+
const localPath = path.join(
|
|
241
|
+
res.localRoot ?? this.options.localRoot,
|
|
242
|
+
decodeURI(res.savePath)
|
|
243
|
+
);
|
|
244
|
+
try {
|
|
245
|
+
const localMtime = statSync(localPath).mtime;
|
|
246
|
+
if (new Date(remoteLastMod as string) <= localMtime) {
|
|
247
|
+
return undefined;
|
|
248
|
+
}
|
|
249
|
+
} catch {
|
|
250
|
+
// file removed between check and stat, proceed with save
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
}
|
|
207
255
|
let downloadedResource: DownloadResource | void = res;
|
|
208
256
|
for (const saveToDisk of this.lifeCycle.saveToDisk) {
|
|
209
257
|
if ((downloadedResource = await saveToDisk(
|
|
@@ -229,4 +277,52 @@ export class PipelineExecutorImpl implements PipelineExecutor {
|
|
|
229
277
|
}
|
|
230
278
|
}
|
|
231
279
|
|
|
280
|
+
async notifyStatusChange(
|
|
281
|
+
res: Resource | RawResource,
|
|
282
|
+
status: ResourceStatus
|
|
283
|
+
): Promise<void> {
|
|
284
|
+
if (!this.lifeCycle.statusChange?.length) return;
|
|
285
|
+
for (const listener of this.lifeCycle.statusChange) {
|
|
286
|
+
try {
|
|
287
|
+
const r = listener(res, status, this.options, this);
|
|
288
|
+
if (r) await r;
|
|
289
|
+
} catch {
|
|
290
|
+
// swallow
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
private _checkExistingResource(
|
|
296
|
+
res: Resource, stage: ExistingResourceStage
|
|
297
|
+
): ExistingResourceAction | void {
|
|
298
|
+
const localPath = path.join(
|
|
299
|
+
res.localRoot ?? this.options.localRoot,
|
|
300
|
+
decodeURI(res.savePath)
|
|
301
|
+
);
|
|
302
|
+
if (!existsSync(localPath)) return undefined;
|
|
303
|
+
let stat: Stats;
|
|
304
|
+
try {
|
|
305
|
+
stat = statSync(localPath);
|
|
306
|
+
} catch {
|
|
307
|
+
// TOCTOU: file deleted between existsSync and statSync
|
|
308
|
+
return undefined;
|
|
309
|
+
}
|
|
310
|
+
if (!stat.isFile()) return undefined;
|
|
311
|
+
return this.lifeCycle.existingResource!({
|
|
312
|
+
res, stage, localPath, stat, options: this.options
|
|
313
|
+
});
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
private _getExistingFileMtime(res: Resource): string | undefined {
|
|
317
|
+
const localPath = path.join(
|
|
318
|
+
res.localRoot ?? this.options.localRoot,
|
|
319
|
+
decodeURI(res.savePath)
|
|
320
|
+
);
|
|
321
|
+
try {
|
|
322
|
+
return statSync(localPath).mtime.toUTCString();
|
|
323
|
+
} catch {
|
|
324
|
+
return undefined;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
232
328
|
}
|
package/src/downloader/single.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import {AbstractDownloader} from './main.js';
|
|
2
2
|
import type {Resource} from '../resource.js';
|
|
3
3
|
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
|
|
4
|
-
import {skip} from '../logger/logger.js';
|
|
5
4
|
import type {
|
|
6
5
|
DownloadResource,
|
|
7
6
|
SubmitResourceFunc
|
|
@@ -20,7 +19,7 @@ export class SingleThreadDownloader extends AbstractDownloader {
|
|
|
20
19
|
if (options.initialUrl) {
|
|
21
20
|
return this.addInitialResource(options.initialUrl);
|
|
22
21
|
} else {
|
|
23
|
-
return this.
|
|
22
|
+
return this.addInitialResource([]);
|
|
24
23
|
}
|
|
25
24
|
}
|
|
26
25
|
|
|
@@ -29,7 +28,7 @@ export class SingleThreadDownloader extends AbstractDownloader {
|
|
|
29
28
|
try {
|
|
30
29
|
r = await this.pipeline.download(res);
|
|
31
30
|
if (!r) {
|
|
32
|
-
|
|
31
|
+
await this.pipeline.notifyStatusChange(res, 'download');
|
|
33
32
|
return;
|
|
34
33
|
}
|
|
35
34
|
} catch (e) {
|
|
@@ -51,9 +50,9 @@ export class SingleThreadDownloader extends AbstractDownloader {
|
|
|
51
50
|
const processedResource: DownloadResource | void =
|
|
52
51
|
await this.pipeline.processAfterDownload(r, submit);
|
|
53
52
|
if (!processedResource) {
|
|
54
|
-
|
|
53
|
+
await this.pipeline.notifyStatusChange(r, 'processAfterDownload');
|
|
55
54
|
} else if (await this.pipeline.saveToDisk(processedResource)) {
|
|
56
|
-
|
|
55
|
+
await this.pipeline.notifyStatusChange(r, 'saveToDisk');
|
|
57
56
|
}
|
|
58
57
|
if (processedResource && processedResource.redirectedUrl &&
|
|
59
58
|
processedResource.redirectedUrl !== processedResource.url) {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import type {MessagePort, WorkerOptions} from 'node:worker_threads';
|
|
2
2
|
import {Worker} from 'node:worker_threads';
|
|
3
3
|
import type {URL} from 'node:url';
|
|
4
|
-
import
|
|
4
|
+
import {error as errorLogger, getLogger} from '../logger/logger.js';
|
|
5
5
|
import type {LogWorkerMessage} from './worker-type.js';
|
|
6
6
|
import type {
|
|
7
7
|
PendingPromise,
|
|
@@ -63,7 +63,7 @@ export class WorkerPool<T = unknown, R extends WorkerMessage = WorkerMessage> {
|
|
|
63
63
|
}
|
|
64
64
|
|
|
65
65
|
workerOnError(info: WorkerInfo, err: Error): void {
|
|
66
|
-
|
|
66
|
+
errorLogger.error('worker error', info.id, err);
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
onMessage(info: WorkerInfo, message: WorkerMessage): void {
|
|
@@ -76,14 +76,20 @@ export class WorkerPool<T = unknown, R extends WorkerMessage = WorkerMessage> {
|
|
|
76
76
|
|
|
77
77
|
takeLog(info: WorkerInfo, message: LogWorkerMessage): void {
|
|
78
78
|
if (!message?.body) {
|
|
79
|
-
|
|
79
|
+
errorLogger.warn('Invalid formatted log', info.id);
|
|
80
80
|
return;
|
|
81
81
|
}
|
|
82
|
-
const
|
|
82
|
+
const level = message.body.level;
|
|
83
|
+
const logType = message.body.logType;
|
|
84
|
+
if (!level || !logType) {
|
|
85
|
+
return;
|
|
86
|
+
}
|
|
87
|
+
const log = getLogger();
|
|
88
|
+
const content = message.body.content;
|
|
83
89
|
if (content?.length) {
|
|
84
|
-
|
|
90
|
+
log[level](logType, info.id, ...content);
|
|
85
91
|
} else {
|
|
86
|
-
|
|
92
|
+
log[level](logType, info.id);
|
|
87
93
|
}
|
|
88
94
|
}
|
|
89
95
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import type {logLevels} from '../logger/logger-worker.js';
|
|
2
|
-
import type
|
|
2
|
+
import type {LogType} from '../logger/types.js';
|
|
3
3
|
import type {WorkerMessage, WorkerMessageType} from './types.js';
|
|
4
4
|
|
|
5
5
|
export interface WorkerLog<T = unknown> {
|
|
6
|
-
|
|
6
|
+
logType: LogType;
|
|
7
7
|
level: typeof logLevels[number];
|
|
8
8
|
content: T[];
|
|
9
9
|
}
|
package/src/downloader/worker.ts
CHANGED
|
@@ -7,7 +7,6 @@ import type {
|
|
|
7
7
|
} from '../life-cycle/types.js';
|
|
8
8
|
import type {RawResource, Resource} from '../resource.js';
|
|
9
9
|
import {normalizeResource, prepareResourceForClone} from '../resource.js';
|
|
10
|
-
import {skip} from '../logger/logger.js';
|
|
11
10
|
import {importDefaultFromPath} from '../util.js';
|
|
12
11
|
import type {DownloadWorkerMessage} from './types.js';
|
|
13
12
|
import {WorkerMessageType} from './types.js';
|
|
@@ -29,8 +28,6 @@ const asyncPipeline = asyncOptions.then(options => {
|
|
|
29
28
|
const pipeline: PipelineExecutor =
|
|
30
29
|
new PipelineExecutorImpl(options, options.req, options);
|
|
31
30
|
|
|
32
|
-
options.configureLogger(options.localRoot, options.logSubDir || '');
|
|
33
|
-
|
|
34
31
|
const init = pipeline.init(pipeline);
|
|
35
32
|
if (init && (init as Promise<void>).then) {
|
|
36
33
|
return init.then(() => pipeline);
|
|
@@ -58,11 +55,9 @@ parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) =
|
|
|
58
55
|
const processedResource: DownloadResource | void =
|
|
59
56
|
await pipeline.processAfterDownload(downloadResource, submit);
|
|
60
57
|
if (!processedResource) {
|
|
61
|
-
|
|
62
|
-
downloadResource.url, downloadResource.refUrl);
|
|
58
|
+
await pipeline.notifyStatusChange(downloadResource, 'processAfterDownload');
|
|
63
59
|
} else if (await pipeline.saveToDisk(processedResource)) {
|
|
64
|
-
|
|
65
|
-
downloadResource.url, downloadResource.refUrl);
|
|
60
|
+
await pipeline.notifyStatusChange(downloadResource, 'saveToDisk');
|
|
66
61
|
}
|
|
67
62
|
|
|
68
63
|
if (processedResource && processedResource.redirectedUrl &&
|
package/src/index.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
export * as logger from './logger/logger.js';
|
|
2
|
+
export type {Logger, LogType, CategoryLogger} from './logger/types.js';
|
|
3
|
+
export {createDefaultLogger} from './logger/default-logger.js';
|
|
2
4
|
export * as downloader from './downloader/index.js';
|
|
3
5
|
export * as lifeCycle from './life-cycle/index.js';
|
|
4
6
|
export * as io from './io.js';
|
|
@@ -4,6 +4,7 @@ import {ResourceType} from '../resource.js';
|
|
|
4
4
|
import type {
|
|
5
5
|
AsyncResult,
|
|
6
6
|
DownloadResource,
|
|
7
|
+
ExistingResourceFunc,
|
|
7
8
|
LinkRedirectFunc,
|
|
8
9
|
ProcessResourceAfterDownloadFunc,
|
|
9
10
|
ProcessResourceBeforeDownloadFunc,
|
|
@@ -132,3 +133,15 @@ export const processHtmlAsync = (fn: AsyncHtmlProcessFunc): ProcessResourceAfter
|
|
|
132
133
|
return res;
|
|
133
134
|
};
|
|
134
135
|
|
|
136
|
+
/** Skip download if local file already exists */
|
|
137
|
+
export const skipExisting = (): ExistingResourceFunc =>
|
|
138
|
+
({stage}) => stage === 'download' ? 'skip' : 'overwrite';
|
|
139
|
+
|
|
140
|
+
/** Re-download only if remote is newer (If-Modified-Since) */
|
|
141
|
+
export const preferNewerRemote = (): ExistingResourceFunc =>
|
|
142
|
+
() => 'ifModifiedSince';
|
|
143
|
+
|
|
144
|
+
/** Always overwrite (current default behavior, explicit) */
|
|
145
|
+
export const alwaysOverwrite = (): ExistingResourceFunc =>
|
|
146
|
+
() => 'overwrite';
|
|
147
|
+
|
|
@@ -13,6 +13,7 @@ import {saveResourceToDisk} from './save-resource-to-disk.js';
|
|
|
13
13
|
import {processRedirectedUrl} from './adapters.js';
|
|
14
14
|
import {downloadStreamingResource} from './download-streaming-resource.js';
|
|
15
15
|
import {readOrCopyLocalResource} from './read-or-copy-local-resource.js';
|
|
16
|
+
import {defaultStatusListener} from './default-status-listener.js';
|
|
16
17
|
|
|
17
18
|
/**
|
|
18
19
|
* Get a copy of default life cycle
|
|
@@ -37,6 +38,7 @@ export const defaultLifeCycle = (): ProcessingLifeCycle => ({
|
|
|
37
38
|
processSiteMap
|
|
38
39
|
],
|
|
39
40
|
saveToDisk: [saveHtmlToDisk, saveResourceToDisk],
|
|
40
|
-
dispose: []
|
|
41
|
+
dispose: [],
|
|
42
|
+
statusChange: [defaultStatusListener]
|
|
41
43
|
});
|
|
42
44
|
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import type {RawResource, Resource} from '../resource.js';
|
|
2
|
+
import type {ResourceStatus} from './types.js';
|
|
3
|
+
import {error, notFound, skip} from '../logger/logger.js';
|
|
4
|
+
|
|
5
|
+
export const defaultStatusListener = (
|
|
6
|
+
res: Resource | RawResource,
|
|
7
|
+
status: ResourceStatus
|
|
8
|
+
): void => {
|
|
9
|
+
switch (status) {
|
|
10
|
+
case 'processBeforeDownload':
|
|
11
|
+
break;
|
|
12
|
+
case 'createResource':
|
|
13
|
+
break;
|
|
14
|
+
case 'download':
|
|
15
|
+
skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
|
|
16
|
+
break;
|
|
17
|
+
case 'processAfterDownload':
|
|
18
|
+
skip.warn('skipped downloaded resource', res.url, res.refUrl);
|
|
19
|
+
break;
|
|
20
|
+
case 'saveToDisk':
|
|
21
|
+
skip.warn('downloaded resource not saved', res.url, res.refUrl);
|
|
22
|
+
break;
|
|
23
|
+
case 'error':
|
|
24
|
+
if (res.meta?.['errorCause']) {
|
|
25
|
+
const err = res.meta['error'];
|
|
26
|
+
const cause = res.meta['errorCause'] as string;
|
|
27
|
+
if (err && (err as {name?: string}).name === 'HTTPError' &&
|
|
28
|
+
(err as {response?: {statusCode?: number}})?.response?.statusCode === 404) {
|
|
29
|
+
notFound.error(res.url, res.downloadLink, res.refUrl);
|
|
30
|
+
} else if (err) {
|
|
31
|
+
error.error(cause, res.url, res.downloadLink, res.refUrl, err);
|
|
32
|
+
} else {
|
|
33
|
+
error.error(cause, res.url, res.downloadLink, res.refUrl);
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
break;
|
|
37
|
+
case 'dispose':
|
|
38
|
+
break;
|
|
39
|
+
}
|
|
40
|
+
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type {BeforeRetryHook, OptionsInit, RequestError, Response} from 'got';
|
|
2
|
-
import got, {TimeoutError} from 'got';
|
|
2
|
+
import got, {HTTPError, TimeoutError} from 'got';
|
|
3
3
|
import type {DownloadResource, RequestOptions} from './types.js';
|
|
4
4
|
import type {Resource} from '../resource.js';
|
|
5
5
|
import {generateSavePath, ResourceType} from '../resource.js';
|
|
@@ -117,8 +117,16 @@ export async function requestForResource(
|
|
|
117
117
|
}
|
|
118
118
|
logger.request.info(res.url, downloadLink, res.refUrl,
|
|
119
119
|
res.encoding, res.type);
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
let response: Response<string | Buffer> | void;
|
|
121
|
+
try {
|
|
122
|
+
response = await getRetry(downloadLink, reqOptions);
|
|
123
|
+
} catch (e) {
|
|
124
|
+
if (e instanceof HTTPError &&
|
|
125
|
+
(e as HTTPError).response.statusCode === 304) {
|
|
126
|
+
return undefined;
|
|
127
|
+
}
|
|
128
|
+
throw e;
|
|
129
|
+
}
|
|
122
130
|
if (!response) {
|
|
123
131
|
const resource = res as Resource;
|
|
124
132
|
delete resource.downloadStartTimestamp;
|
|
@@ -195,7 +203,8 @@ export async function downloadResource(
|
|
|
195
203
|
}
|
|
196
204
|
}
|
|
197
205
|
if (nonHtml) {
|
|
198
|
-
logger.error.warn('Detected non-html content type',
|
|
206
|
+
logger.error.warn('Detected non-html content type for resource typed as',
|
|
207
|
+
downloadedResource.type,
|
|
199
208
|
downloadedResource.downloadLink, downloadedResource.rawUrl, contentType);
|
|
200
209
|
}
|
|
201
210
|
}
|
package/src/life-cycle/index.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export * as adapter from './adapters.js';
|
|
2
2
|
export {defaultLifeCycle} from './default-life-cycle.js';
|
|
3
|
+
export {defaultStatusListener} from './default-status-listener.js';
|
|
3
4
|
export {detectResourceType} from './detect-resource-type.js';
|
|
4
5
|
export {
|
|
5
6
|
beforeRetryHook, getRetry, requestForResource, downloadResource
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import type {Resource, ResourceEncoding, ResourceType} from '../resource.js';
|
|
1
|
+
import type {RawResource, Resource, ResourceEncoding, ResourceType} from '../resource.js';
|
|
2
2
|
import type {StaticDownloadOptions} from '../options.js';
|
|
3
3
|
import type {
|
|
4
4
|
AsyncResult,
|
|
5
5
|
DownloadResource,
|
|
6
|
+
InitSubmitFunc,
|
|
6
7
|
RequestOptions,
|
|
8
|
+
ResourceStatus,
|
|
7
9
|
SubmitResourceFunc
|
|
8
10
|
} from './types.js';
|
|
9
11
|
import type {Cheerio} from '../types.js';
|
|
@@ -16,7 +18,8 @@ export interface PipelineExecutor {
|
|
|
16
18
|
*/
|
|
17
19
|
init(
|
|
18
20
|
pipeline: PipelineExecutor,
|
|
19
|
-
downloader?: DownloaderWithMeta
|
|
21
|
+
downloader?: DownloaderWithMeta,
|
|
22
|
+
submit?: InitSubmitFunc
|
|
20
23
|
): AsyncResult<void>;
|
|
21
24
|
|
|
22
25
|
/**
|
|
@@ -99,4 +102,15 @@ export interface PipelineExecutor {
|
|
|
99
102
|
workerExitCode?: number
|
|
100
103
|
): AsyncResult<void>;
|
|
101
104
|
|
|
105
|
+
/**
|
|
106
|
+
* Notify status change listeners.
|
|
107
|
+
*
|
|
108
|
+
* All listeners always run (void return does not short-circuit),
|
|
109
|
+
* and thrown errors are swallowed.
|
|
110
|
+
*/
|
|
111
|
+
notifyStatusChange(
|
|
112
|
+
res: Resource | RawResource,
|
|
113
|
+
status: ResourceStatus
|
|
114
|
+
): AsyncResult<void>;
|
|
115
|
+
|
|
102
116
|
}
|