website-scrap-engine 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/downloader/adjust-concurrency.d.ts +2 -1
- package/lib/downloader/adjust-concurrency.d.ts.map +1 -0
- package/lib/downloader/adjust-concurrency.js +4 -8
- package/lib/downloader/adjust-concurrency.js.map +1 -1
- package/lib/downloader/index.d.ts +9 -8
- package/lib/downloader/index.d.ts.map +1 -0
- package/lib/downloader/index.js +8 -40
- package/lib/downloader/index.js.map +1 -1
- package/lib/downloader/main.d.ts +15 -6
- package/lib/downloader/main.d.ts.map +1 -0
- package/lib/downloader/main.js +49 -32
- package/lib/downloader/main.js.map +1 -1
- package/lib/downloader/multi.d.ts +7 -5
- package/lib/downloader/multi.d.ts.map +1 -0
- package/lib/downloader/multi.js +10 -17
- package/lib/downloader/multi.js.map +1 -1
- package/lib/downloader/pipeline-executor-impl.d.ts +8 -7
- package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -0
- package/lib/downloader/pipeline-executor-impl.js +1 -5
- package/lib/downloader/pipeline-executor-impl.js.map +1 -1
- package/lib/downloader/single.d.ts +4 -3
- package/lib/downloader/single.d.ts.map +1 -0
- package/lib/downloader/single.js +7 -11
- package/lib/downloader/single.js.map +1 -1
- package/lib/downloader/types.d.ts +4 -4
- package/lib/downloader/types.d.ts.map +1 -0
- package/lib/downloader/types.js +2 -5
- package/lib/downloader/types.js.map +1 -1
- package/lib/downloader/worker-pool.d.ts +6 -7
- package/lib/downloader/worker-pool.d.ts.map +1 -0
- package/lib/downloader/worker-pool.js +7 -35
- package/lib/downloader/worker-pool.js.map +1 -1
- package/lib/downloader/worker-type.d.ts +4 -3
- package/lib/downloader/worker-type.d.ts.map +1 -0
- package/lib/downloader/worker-type.js +1 -2
- package/lib/downloader/worker.d.ts +1 -0
- package/lib/downloader/worker.d.ts.map +1 -0
- package/lib/downloader/worker.js +52 -27
- package/lib/downloader/worker.js.map +1 -1
- package/lib/index.d.ts +9 -8
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +7 -33
- package/lib/index.js.map +1 -1
- package/lib/io.d.ts +2 -1
- package/lib/io.d.ts.map +1 -0
- package/lib/io.js +17 -25
- package/lib/io.js.map +1 -1
- package/lib/life-cycle/adapters.d.ts +7 -5
- package/lib/life-cycle/adapters.d.ts.map +1 -0
- package/lib/life-cycle/adapters.js +18 -30
- package/lib/life-cycle/adapters.js.map +1 -1
- package/lib/life-cycle/default-life-cycle.d.ts +2 -1
- package/lib/life-cycle/default-life-cycle.d.ts.map +1 -0
- package/lib/life-cycle/default-life-cycle.js +28 -32
- package/lib/life-cycle/default-life-cycle.js.map +1 -1
- package/lib/life-cycle/detect-resource-type.d.ts +2 -1
- package/lib/life-cycle/detect-resource-type.d.ts.map +1 -0
- package/lib/life-cycle/detect-resource-type.js +12 -17
- package/lib/life-cycle/detect-resource-type.js.map +1 -1
- package/lib/life-cycle/download-resource.d.ts +6 -7
- package/lib/life-cycle/download-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-resource.js +23 -52
- package/lib/life-cycle/download-resource.js.map +1 -1
- package/lib/life-cycle/download-streaming-resource.d.ts +6 -5
- package/lib/life-cycle/download-streaming-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-streaming-resource.js +39 -74
- package/lib/life-cycle/download-streaming-resource.js.map +1 -1
- package/lib/life-cycle/index.d.ts +16 -15
- package/lib/life-cycle/index.d.ts.map +1 -0
- package/lib/life-cycle/index.js +14 -59
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/pipeline-executor.d.ts +7 -6
- package/lib/life-cycle/pipeline-executor.d.ts.map +1 -0
- package/lib/life-cycle/pipeline-executor.js +1 -2
- package/lib/life-cycle/process-css.d.ts +5 -4
- package/lib/life-cycle/process-css.d.ts.map +1 -0
- package/lib/life-cycle/process-css.js +10 -18
- package/lib/life-cycle/process-css.js.map +1 -1
- package/lib/life-cycle/process-html-meta.d.ts +4 -3
- package/lib/life-cycle/process-html-meta.d.ts.map +1 -0
- package/lib/life-cycle/process-html-meta.js +11 -15
- package/lib/life-cycle/process-html-meta.js.map +1 -1
- package/lib/life-cycle/process-html.d.ts +4 -3
- package/lib/life-cycle/process-html.d.ts.map +1 -0
- package/lib/life-cycle/process-html.js +27 -31
- package/lib/life-cycle/process-html.js.map +1 -1
- package/lib/life-cycle/process-site-map.d.ts +4 -3
- package/lib/life-cycle/process-site-map.d.ts.map +1 -0
- package/lib/life-cycle/process-site-map.js +7 -11
- package/lib/life-cycle/process-site-map.js.map +1 -1
- package/lib/life-cycle/process-source-map.d.ts +4 -4
- package/lib/life-cycle/process-source-map.d.ts.map +1 -0
- package/lib/life-cycle/process-source-map.js +16 -21
- package/lib/life-cycle/process-source-map.js.map +1 -1
- package/lib/life-cycle/process-svg.d.ts +4 -3
- package/lib/life-cycle/process-svg.d.ts.map +1 -0
- package/lib/life-cycle/process-svg.js +17 -21
- package/lib/life-cycle/process-svg.js.map +1 -1
- package/lib/life-cycle/read-or-copy-local-resource.d.ts +4 -3
- package/lib/life-cycle/read-or-copy-local-resource.d.ts.map +1 -0
- package/lib/life-cycle/read-or-copy-local-resource.js +15 -42
- package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
- package/lib/life-cycle/save-html-to-disk.d.ts +6 -4
- package/lib/life-cycle/save-html-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-html-to-disk.js +24 -33
- package/lib/life-cycle/save-html-to-disk.js.map +1 -1
- package/lib/life-cycle/save-resource-to-disk.d.ts +4 -3
- package/lib/life-cycle/save-resource-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-resource-to-disk.js +10 -17
- package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
- package/lib/life-cycle/skip-links.d.ts +1 -0
- package/lib/life-cycle/skip-links.d.ts.map +1 -0
- package/lib/life-cycle/skip-links.js +6 -10
- package/lib/life-cycle/skip-links.js.map +1 -1
- package/lib/life-cycle/types.d.ts +8 -7
- package/lib/life-cycle/types.d.ts.map +1 -0
- package/lib/life-cycle/types.js +1 -2
- package/lib/logger/config-logger.d.ts +2 -1
- package/lib/logger/config-logger.d.ts.map +1 -0
- package/lib/logger/config-logger.js +4 -30
- package/lib/logger/config-logger.js.map +1 -1
- package/lib/logger/logger-worker.d.ts +3 -2
- package/lib/logger/logger-worker.d.ts.map +1 -0
- package/lib/logger/logger-worker.js +11 -13
- package/lib/logger/logger-worker.js.map +1 -1
- package/lib/logger/logger.d.ts +2 -1
- package/lib/logger/logger.d.ts.map +1 -0
- package/lib/logger/logger.js +15 -17
- package/lib/logger/logger.js.map +1 -1
- package/lib/options.d.ts +8 -8
- package/lib/options.d.ts.map +1 -0
- package/lib/options.js +22 -32
- package/lib/options.js.map +1 -1
- package/lib/resource.d.ts +3 -4
- package/lib/resource.d.ts.map +1 -0
- package/lib/resource.js +34 -70
- package/lib/resource.js.map +1 -1
- package/lib/sources.d.ts +2 -1
- package/lib/sources.d.ts.map +1 -0
- package/lib/sources.js +9 -12
- package/lib/sources.js.map +1 -1
- package/lib/types.d.ts +1 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +1 -2
- package/lib/util.d.ts +4 -3
- package/lib/util.d.ts.map +1 -0
- package/lib/util.js +17 -34
- package/lib/util.js.map +1 -1
- package/package.json +18 -20
- package/src/downloader/adjust-concurrency.ts +2 -2
- package/src/downloader/index.ts +8 -8
- package/src/downloader/main.ts +50 -28
- package/src/downloader/multi.ts +11 -10
- package/src/downloader/pipeline-executor-impl.ts +7 -7
- package/src/downloader/single.ts +9 -6
- package/src/downloader/types.ts +3 -3
- package/src/downloader/worker-pool.ts +9 -9
- package/src/downloader/worker-type.ts +3 -3
- package/src/downloader/worker.ts +51 -29
- package/src/index.ts +8 -8
- package/src/io.ts +6 -6
- package/src/life-cycle/adapters.ts +7 -6
- package/src/life-cycle/css-url-parser.d.ts +1 -1
- package/src/life-cycle/default-life-cycle.ts +15 -15
- package/src/life-cycle/detect-resource-type.ts +2 -2
- package/src/life-cycle/download-resource.ts +18 -20
- package/src/life-cycle/download-streaming-resource.ts +20 -18
- package/src/life-cycle/index.ts +15 -15
- package/src/life-cycle/pipeline-executor.ts +6 -6
- package/src/life-cycle/process-css.ts +6 -5
- package/src/life-cycle/process-html-meta.ts +7 -6
- package/src/life-cycle/process-html.ts +21 -13
- package/src/life-cycle/process-site-map.ts +7 -6
- package/src/life-cycle/process-source-map.ts +5 -4
- package/src/life-cycle/process-svg.ts +10 -9
- package/src/life-cycle/read-or-copy-local-resource.ts +9 -7
- package/src/life-cycle/save-html-to-disk.ts +9 -13
- package/src/life-cycle/save-resource-to-disk.ts +6 -6
- package/src/life-cycle/types.ts +7 -7
- package/src/logger/config-logger.ts +5 -3
- package/src/logger/logger-worker.ts +8 -4
- package/src/logger/logger.ts +6 -4
- package/src/options.ts +15 -19
- package/src/resource.ts +10 -5
- package/src/sources.ts +1 -1
- package/src/util.ts +6 -10
- package/tsconfig.json +6 -2
package/src/downloader/worker.ts
CHANGED
|
@@ -1,46 +1,49 @@
|
|
|
1
|
-
import {parentPort, workerData} from 'worker_threads';
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
import {
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
} from '
|
|
14
|
-
import {
|
|
15
|
-
import {importDefaultFromPath} from '../util';
|
|
16
|
-
import {DownloadWorkerMessage, WorkerMessageType} from './types';
|
|
17
|
-
import {PipelineExecutorImpl} from './pipeline-executor-impl';
|
|
1
|
+
import {parentPort, workerData} from 'node:worker_threads';
|
|
2
|
+
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
|
|
3
|
+
import {mergeOverrideOptions} from '../options.js';
|
|
4
|
+
import type {
|
|
5
|
+
DownloadResource,
|
|
6
|
+
SubmitResourceFunc
|
|
7
|
+
} from '../life-cycle/types.js';
|
|
8
|
+
import type {RawResource, Resource} from '../resource.js';
|
|
9
|
+
import {normalizeResource, prepareResourceForClone} from '../resource.js';
|
|
10
|
+
import {skip} from '../logger/logger.js';
|
|
11
|
+
import {importDefaultFromPath} from '../util.js';
|
|
12
|
+
import type {DownloadWorkerMessage} from './types.js';
|
|
13
|
+
import {WorkerMessageType} from './types.js';
|
|
14
|
+
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
|
|
18
15
|
// noinspection ES6PreferShortImport
|
|
19
|
-
import type {PipelineExecutor} from '../life-cycle/pipeline-executor';
|
|
20
|
-
import type {WorkerTaskMessage} from './worker-type';
|
|
16
|
+
import type {PipelineExecutor} from '../life-cycle/pipeline-executor.js';
|
|
17
|
+
import type {WorkerTaskMessage} from './worker-type.js';
|
|
21
18
|
|
|
22
19
|
const {pathToOptions, overrideOptions}: {
|
|
23
20
|
pathToOptions: string,
|
|
24
21
|
overrideOptions?: Partial<StaticDownloadOptions>
|
|
25
22
|
} = workerData;
|
|
26
23
|
|
|
27
|
-
const
|
|
28
|
-
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
|
29
|
-
mergeOverrideOptions(importDefaultFromPath(pathToOptions), overrideOptions);
|
|
24
|
+
const asyncOptions: Promise<DownloadOptions> = importDefaultFromPath(pathToOptions);
|
|
30
25
|
|
|
31
|
-
const
|
|
32
|
-
|
|
26
|
+
const asyncPipeline = asyncOptions.then(options => {
|
|
27
|
+
options = mergeOverrideOptions(options, overrideOptions);
|
|
33
28
|
|
|
34
|
-
|
|
29
|
+
const pipeline: PipelineExecutor =
|
|
30
|
+
new PipelineExecutorImpl(options, options.req, options);
|
|
35
31
|
|
|
36
|
-
|
|
32
|
+
options.configureLogger(options.localRoot, options.logSubDir || '');
|
|
33
|
+
|
|
34
|
+
const init = pipeline.init(pipeline);
|
|
35
|
+
if (init && (init as Promise<void>).then) {
|
|
36
|
+
return init.then(() => pipeline);
|
|
37
|
+
}
|
|
38
|
+
return pipeline;
|
|
39
|
+
});
|
|
37
40
|
|
|
38
41
|
parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) => {
|
|
39
42
|
const collectedResource: RawResource[] = [];
|
|
40
43
|
let error: Error | unknown | void;
|
|
41
44
|
let redirectedUrl: string | undefined;
|
|
42
45
|
try {
|
|
43
|
-
await
|
|
46
|
+
const pipeline = await asyncPipeline;
|
|
44
47
|
const res = msg.body;
|
|
45
48
|
const downloadResource: DownloadResource = normalizeResource(res) as DownloadResource;
|
|
46
49
|
const submit: SubmitResourceFunc = (resources: Resource | Resource[]) => {
|
|
@@ -67,8 +70,27 @@ parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) =
|
|
|
67
70
|
redirectedUrl = processedResource.redirectedUrl;
|
|
68
71
|
}
|
|
69
72
|
} catch (e) {
|
|
70
|
-
//
|
|
71
|
-
|
|
73
|
+
// handle if object could not be cloned here
|
|
74
|
+
// https://github.com/website-local/website-scrap-engine/issues/340
|
|
75
|
+
try {
|
|
76
|
+
// should always be
|
|
77
|
+
if (typeof structuredClone === 'function') {
|
|
78
|
+
error = structuredClone(e);
|
|
79
|
+
} else {
|
|
80
|
+
// this is the old behavior before this
|
|
81
|
+
error = e;
|
|
82
|
+
}
|
|
83
|
+
} catch {
|
|
84
|
+
// can not clone, so no need to get the full error here
|
|
85
|
+
if (e && typeof e === 'object') {
|
|
86
|
+
const clone: Record<string, string> = {};
|
|
87
|
+
for (const k in e) {
|
|
88
|
+
clone[k] = String((e as Record<string, unknown>)[k]);
|
|
89
|
+
}
|
|
90
|
+
} else {
|
|
91
|
+
error = String(e);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
72
94
|
} finally {
|
|
73
95
|
const message: DownloadWorkerMessage = {
|
|
74
96
|
taskId: msg.taskId,
|
package/src/index.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
export * as logger from './logger/logger';
|
|
2
|
-
export * as downloader from './downloader/index';
|
|
3
|
-
export * as lifeCycle from './life-cycle/index';
|
|
4
|
-
export * as io from './io';
|
|
5
|
-
export * as options from './options';
|
|
6
|
-
export * as resource from './resource';
|
|
7
|
-
export {SourceDefinition} from './sources';
|
|
8
|
-
export * as util from './util';
|
|
1
|
+
export * as logger from './logger/logger.js';
|
|
2
|
+
export * as downloader from './downloader/index.js';
|
|
3
|
+
export * as lifeCycle from './life-cycle/index.js';
|
|
4
|
+
export * as io from './io.js';
|
|
5
|
+
export * as options from './options.js';
|
|
6
|
+
export * as resource from './resource.js';
|
|
7
|
+
export type {SourceDefinition} from './sources.js';
|
|
8
|
+
export * as util from './util.js';
|
package/src/io.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import type {ObjectEncodingOptions} from 'fs';
|
|
2
|
-
import fs from 'fs';
|
|
3
|
-
import {dirname} from 'path';
|
|
1
|
+
import type {ObjectEncodingOptions} from 'node:fs';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import {dirname} from 'node:path';
|
|
4
4
|
import {mkdirp} from 'mkdirp';
|
|
5
|
-
import type {ResourceBody, ResourceEncoding} from './resource';
|
|
6
|
-
import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger';
|
|
5
|
+
import type {ResourceBody, ResourceEncoding} from './resource.js';
|
|
6
|
+
import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger.js';
|
|
7
7
|
|
|
8
8
|
export const mkdirRetry = async (dir: string, retry = 3): Promise<void> => {
|
|
9
9
|
let error: unknown | void;
|
|
@@ -40,7 +40,7 @@ export const writeFile = async (
|
|
|
40
40
|
await mkdirRetry(dir);
|
|
41
41
|
}
|
|
42
42
|
let fileData: Uint8Array | string;
|
|
43
|
-
let options: ObjectEncodingOptions | void;
|
|
43
|
+
let options: ObjectEncodingOptions | void = void 0;
|
|
44
44
|
if (typeof data === 'string') {
|
|
45
45
|
fileData = data;
|
|
46
46
|
options = {encoding};
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {load} from 'cheerio';
|
|
2
|
-
import {Resource, ResourceEncoding
|
|
2
|
+
import type {Resource, ResourceEncoding} from '../resource.js';
|
|
3
|
+
import {ResourceType} from '../resource.js';
|
|
3
4
|
import type {
|
|
4
5
|
AsyncResult,
|
|
5
6
|
DownloadResource,
|
|
@@ -7,11 +8,11 @@ import type {
|
|
|
7
8
|
ProcessResourceAfterDownloadFunc,
|
|
8
9
|
ProcessResourceBeforeDownloadFunc,
|
|
9
10
|
SubmitResourceFunc
|
|
10
|
-
} from './types';
|
|
11
|
-
import {toString} from '../util';
|
|
12
|
-
import type {StaticDownloadOptions} from '../options';
|
|
13
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
14
|
-
import type {Cheerio, CheerioStatic} from '../types';
|
|
11
|
+
} from './types.js';
|
|
12
|
+
import {toString} from '../util.js';
|
|
13
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
14
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
15
|
+
import type {Cheerio, CheerioStatic} from '../types.js';
|
|
15
16
|
|
|
16
17
|
export interface SkipProcessFunc {
|
|
17
18
|
(url: string, element: Cheerio | null, parent: Resource | null): boolean;
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import type {ProcessingLifeCycle} from './types';
|
|
2
|
-
import {skipLinks} from './skip-links';
|
|
3
|
-
import {detectResourceType} from './detect-resource-type';
|
|
4
|
-
import {createResource} from '../resource';
|
|
5
|
-
import {downloadResource} from './download-resource';
|
|
6
|
-
import {processHtml} from './process-html';
|
|
7
|
-
import {processHtmlMetaRefresh} from './process-html-meta';
|
|
8
|
-
import {processCss} from './process-css';
|
|
9
|
-
import {processSiteMap} from './process-site-map';
|
|
10
|
-
import {processSvg} from './process-svg';
|
|
11
|
-
import {saveHtmlToDisk} from './save-html-to-disk';
|
|
12
|
-
import {saveResourceToDisk} from './save-resource-to-disk';
|
|
13
|
-
import {processRedirectedUrl} from './adapters';
|
|
14
|
-
import {downloadStreamingResource} from './download-streaming-resource';
|
|
15
|
-
import {readOrCopyLocalResource} from './read-or-copy-local-resource';
|
|
1
|
+
import type {ProcessingLifeCycle} from './types.js';
|
|
2
|
+
import {skipLinks} from './skip-links.js';
|
|
3
|
+
import {detectResourceType} from './detect-resource-type.js';
|
|
4
|
+
import {createResource} from '../resource.js';
|
|
5
|
+
import {downloadResource} from './download-resource.js';
|
|
6
|
+
import {processHtml} from './process-html.js';
|
|
7
|
+
import {processHtmlMetaRefresh} from './process-html-meta.js';
|
|
8
|
+
import {processCss} from './process-css.js';
|
|
9
|
+
import {processSiteMap} from './process-site-map.js';
|
|
10
|
+
import {processSvg} from './process-svg.js';
|
|
11
|
+
import {saveHtmlToDisk} from './save-html-to-disk.js';
|
|
12
|
+
import {saveResourceToDisk} from './save-resource-to-disk.js';
|
|
13
|
+
import {processRedirectedUrl} from './adapters.js';
|
|
14
|
+
import {downloadStreamingResource} from './download-streaming-resource.js';
|
|
15
|
+
import {readOrCopyLocalResource} from './read-or-copy-local-resource.js';
|
|
16
16
|
|
|
17
17
|
/**
|
|
18
18
|
* Get a copy of default life cycle
|
|
@@ -1,24 +1,22 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
import type {DownloadResource, RequestOptions} from './types';
|
|
10
|
-
import {generateSavePath, Resource, ResourceType} from '../resource';
|
|
11
|
-
import type {StaticDownloadOptions} from '../options';
|
|
12
|
-
import * as logger from '../logger/logger';
|
|
13
|
-
import {isUrlHttp, sleep} from '../util';
|
|
1
|
+
import type {BeforeRetryHook, OptionsInit, RequestError, Response} from 'got';
|
|
2
|
+
import got, {TimeoutError} from 'got';
|
|
3
|
+
import type {DownloadResource, RequestOptions} from './types.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {generateSavePath, ResourceType} from '../resource.js';
|
|
6
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
7
|
+
import * as logger from '../logger/logger.js';
|
|
8
|
+
import {isUrlHttp, sleep} from '../util.js';
|
|
14
9
|
import URI from 'urijs';
|
|
15
10
|
|
|
16
11
|
/** Take logs before retry */
|
|
17
12
|
export const beforeRetryHook: BeforeRetryHook = (
|
|
18
|
-
|
|
19
|
-
error: RequestError | undefined,
|
|
13
|
+
error: RequestError,
|
|
20
14
|
retryCount: number | undefined
|
|
21
15
|
) => {
|
|
16
|
+
const options = error.options;
|
|
17
|
+
if (!options) {
|
|
18
|
+
return;
|
|
19
|
+
}
|
|
22
20
|
if (!error) {
|
|
23
21
|
logger.retry.warn(retryCount, String(options.url));
|
|
24
22
|
return;
|
|
@@ -49,15 +47,15 @@ export interface DownloadError extends Partial<Error> {
|
|
|
49
47
|
*/
|
|
50
48
|
export async function getRetry(
|
|
51
49
|
url: string,
|
|
52
|
-
options:
|
|
50
|
+
options: OptionsInit
|
|
53
51
|
): Promise<Response<Buffer | string> | void> {
|
|
54
|
-
let res: Response<Buffer | string> | void;
|
|
55
|
-
let err: DownloadError | void, optionsClone:
|
|
52
|
+
let res: Response<Buffer | string> | void = void 0;
|
|
53
|
+
let err: DownloadError | void = void 0, optionsClone: OptionsInit;
|
|
56
54
|
for (let i = 0; i < 25; i++) {
|
|
57
55
|
err = void 0;
|
|
58
56
|
try {
|
|
59
57
|
optionsClone = Object.assign({}, options);
|
|
60
|
-
res = (await got(url, optionsClone)) as
|
|
58
|
+
res = (await got(url, optionsClone)) as Response<Buffer | string>;
|
|
61
59
|
if (!res || !res.body || !res.body.length) {
|
|
62
60
|
logger.retry.warn(i, url, 'manually retry on empty response or body',
|
|
63
61
|
res && res.body);
|
|
@@ -102,7 +100,7 @@ export async function requestForResource(
|
|
|
102
100
|
options?: StaticDownloadOptions
|
|
103
101
|
): Promise<DownloadResource | Resource | void> {
|
|
104
102
|
const downloadLink: string = encodeURI(decodeURI(res.downloadLink));
|
|
105
|
-
const reqOptions:
|
|
103
|
+
const reqOptions: OptionsInit = Object.assign({}, requestOptions);
|
|
106
104
|
reqOptions.responseType = 'buffer';
|
|
107
105
|
if (res.refUrl && res.refUrl !== downloadLink) {
|
|
108
106
|
const headers = Object.assign({}, reqOptions.headers);
|
|
@@ -1,21 +1,23 @@
|
|
|
1
|
-
import
|
|
2
|
-
import type {
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import type {WriteStream} from 'node:fs';
|
|
3
|
+
import {constants, createWriteStream, promises as fs} from 'node:fs';
|
|
4
|
+
import {pipeline} from 'node:stream';
|
|
5
|
+
import {promisify} from 'node:util';
|
|
6
|
+
import type {RequestError, Response} from 'got';
|
|
7
|
+
import got, {HTTPError} from 'got';
|
|
8
|
+
import type {Resource} from '../resource.js';
|
|
9
|
+
import {ResourceType} from '../resource.js';
|
|
6
10
|
import type {
|
|
7
11
|
AsyncResult,
|
|
8
12
|
DownloadResource,
|
|
9
13
|
DownloadResourceFunc,
|
|
10
14
|
RequestOptions
|
|
11
|
-
} from './types';
|
|
12
|
-
import {mkdirRetry} from '../io';
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import
|
|
17
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
18
|
-
import {isUrlHttp} from '../util';
|
|
15
|
+
} from './types.js';
|
|
16
|
+
import {mkdirRetry} from '../io.js';
|
|
17
|
+
import {error as errorLogger, retry as retryLogger} from '../logger/logger.js';
|
|
18
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
19
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
20
|
+
import {isUrlHttp} from '../util.js';
|
|
19
21
|
|
|
20
22
|
const promisifyPipeline = promisify(pipeline);
|
|
21
23
|
|
|
@@ -108,7 +110,7 @@ export async function streamingDownloadToFile(
|
|
|
108
110
|
rangeStart = undefined;
|
|
109
111
|
}
|
|
110
112
|
|
|
111
|
-
if (response.request.
|
|
113
|
+
if (response.request.isAborted) {
|
|
112
114
|
// Canceled while downloading
|
|
113
115
|
//- will throw a `CancelError` or `TimeoutError` error
|
|
114
116
|
return;
|
|
@@ -129,9 +131,9 @@ export async function streamingDownloadToFile(
|
|
|
129
131
|
return;
|
|
130
132
|
}
|
|
131
133
|
|
|
132
|
-
if (request._isAboutToError) {
|
|
133
|
-
|
|
134
|
-
}
|
|
134
|
+
// if (request._isAboutToError) {
|
|
135
|
+
// return;
|
|
136
|
+
// }
|
|
135
137
|
|
|
136
138
|
resolve(response);
|
|
137
139
|
});
|
|
@@ -222,7 +224,7 @@ export async function optionallySetLastModifiedTime(
|
|
|
222
224
|
res: Resource, options: StaticDownloadOptions
|
|
223
225
|
): Promise<void> {
|
|
224
226
|
// https://github.com/website-local/website-scrap-engine/issues/174
|
|
225
|
-
let mtime: number | void;
|
|
227
|
+
let mtime: number | void = void 0;
|
|
226
228
|
if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
|
|
227
229
|
mtime = Date.parse(res.meta.headers?.['last-modified']);
|
|
228
230
|
}
|
package/src/life-cycle/index.ts
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
export * as adapter from './adapters';
|
|
2
|
-
export {defaultLifeCycle} from './default-life-cycle';
|
|
3
|
-
export {detectResourceType} from './detect-resource-type';
|
|
1
|
+
export * as adapter from './adapters.js';
|
|
2
|
+
export {defaultLifeCycle} from './default-life-cycle.js';
|
|
3
|
+
export {detectResourceType} from './detect-resource-type.js';
|
|
4
4
|
export {
|
|
5
5
|
beforeRetryHook, getRetry, requestForResource, downloadResource
|
|
6
|
-
} from './download-resource';
|
|
6
|
+
} from './download-resource.js';
|
|
7
7
|
export {
|
|
8
8
|
streamingDownloadToFile,
|
|
9
9
|
downloadStreamingResource,
|
|
10
10
|
downloadStreamingResourceWithHook
|
|
11
|
-
} from './download-streaming-resource';
|
|
12
|
-
export {PipelineExecutor} from './pipeline-executor';
|
|
13
|
-
export {processCssText, processCss} from './process-css';
|
|
14
|
-
export {processHtml} from './process-html';
|
|
15
|
-
export {processHtmlMetaRefresh} from './process-html-meta';
|
|
16
|
-
export {processSiteMap} from './process-site-map';
|
|
17
|
-
export {processSvg} from './process-svg';
|
|
18
|
-
export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk';
|
|
19
|
-
export {saveResourceToDisk} from './save-resource-to-disk';
|
|
20
|
-
export {skipLinks} from './skip-links';
|
|
21
|
-
export * as types from './types';
|
|
11
|
+
} from './download-streaming-resource.js';
|
|
12
|
+
export type {PipelineExecutor} from './pipeline-executor.js';
|
|
13
|
+
export {processCssText, processCss} from './process-css.js';
|
|
14
|
+
export {processHtml} from './process-html.js';
|
|
15
|
+
export {processHtmlMetaRefresh} from './process-html-meta.js';
|
|
16
|
+
export {processSiteMap} from './process-site-map.js';
|
|
17
|
+
export {processSvg} from './process-svg.js';
|
|
18
|
+
export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk.js';
|
|
19
|
+
export {saveResourceToDisk} from './save-resource-to-disk.js';
|
|
20
|
+
export {skipLinks} from './skip-links.js';
|
|
21
|
+
export * as types from './types.js';
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import type {Resource, ResourceEncoding, ResourceType} from '../resource';
|
|
2
|
-
import type {StaticDownloadOptions} from '../options';
|
|
1
|
+
import type {Resource, ResourceEncoding, ResourceType} from '../resource.js';
|
|
2
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
3
3
|
import type {
|
|
4
4
|
AsyncResult,
|
|
5
5
|
DownloadResource,
|
|
6
6
|
RequestOptions,
|
|
7
7
|
SubmitResourceFunc
|
|
8
|
-
} from './types';
|
|
9
|
-
import type {Cheerio} from '../types';
|
|
10
|
-
import type {DownloaderWithMeta} from '../downloader/types';
|
|
11
|
-
import type {WorkerInfo} from '../downloader/worker-pool';
|
|
8
|
+
} from './types.js';
|
|
9
|
+
import type {Cheerio} from '../types.js';
|
|
10
|
+
import type {DownloaderWithMeta} from '../downloader/types.js';
|
|
11
|
+
import type {WorkerInfo} from '../downloader/worker-pool.js';
|
|
12
12
|
|
|
13
13
|
export interface PipelineExecutor {
|
|
14
14
|
/**
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import parseCssUrls from 'css-url-parser';
|
|
2
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
2
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {toString} from '../util.js';
|
|
7
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
7
8
|
|
|
8
9
|
export async function processCssText(
|
|
9
10
|
cssText: string,
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
2
|
-
import type {StaticDownloadOptions} from '../options';
|
|
3
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
1
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
2
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
3
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {parseHtml} from './adapters.js';
|
|
7
|
+
import {skip} from '../logger/logger.js';
|
|
7
8
|
|
|
8
9
|
/**
|
|
9
10
|
* Originally create by https://github.com/stevenvachon at
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
4
|
-
import type {
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
import {
|
|
8
|
-
import
|
|
9
|
-
import {
|
|
10
|
-
import type {
|
|
1
|
+
import type {SrcSetDefinition} from 'srcset';
|
|
2
|
+
import {parseSrcset, stringifySrcset} from 'srcset';
|
|
3
|
+
import {sources as defaultSources} from '../sources.js';
|
|
4
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
5
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
6
|
+
import type {Resource} from '../resource.js';
|
|
7
|
+
import {ResourceType} from '../resource.js';
|
|
8
|
+
import {processCssText} from './process-css.js';
|
|
9
|
+
import {error, skip} from '../logger/logger.js';
|
|
10
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
11
|
+
import {parseHtml} from './adapters.js';
|
|
12
|
+
import type {Cheerio, CheerioStatic} from '../types.js';
|
|
13
|
+
|
|
14
|
+
type Writeable<T> = { -readonly [P in keyof T]: T[P] };
|
|
15
|
+
type WriteableSrcSet = Writeable<SrcSetDefinition>;
|
|
11
16
|
|
|
12
17
|
export async function processHtml(
|
|
13
18
|
res: DownloadResource,
|
|
@@ -54,7 +59,7 @@ export async function processHtml(
|
|
|
54
59
|
let links: string[], replaceValue: string | SrcSetDefinition[];
|
|
55
60
|
if (attr === 'srcset') {
|
|
56
61
|
try {
|
|
57
|
-
replaceValue =
|
|
62
|
+
replaceValue = parseSrcset(attrValue);
|
|
58
63
|
} catch (e) {
|
|
59
64
|
error.info('skipping invalid srcset', attrValue, e);
|
|
60
65
|
// should invalid srcset being removed?
|
|
@@ -104,7 +109,10 @@ export async function processHtml(
|
|
|
104
109
|
submit(resource);
|
|
105
110
|
}
|
|
106
111
|
if (attr === 'srcset') {
|
|
107
|
-
|
|
112
|
+
// 20241005: It's ok to do this
|
|
113
|
+
// I've looked into the source code of srcset 5.0.1
|
|
114
|
+
// and there is nothing preventing the return value to change
|
|
115
|
+
(replaceValue as WriteableSrcSet[])[linkIndex].url = resource.replacePath;
|
|
108
116
|
} else {
|
|
109
117
|
replaceValue = resource.replacePath;
|
|
110
118
|
// historical workaround here
|
|
@@ -114,7 +122,7 @@ export async function processHtml(
|
|
|
114
122
|
}
|
|
115
123
|
}
|
|
116
124
|
if (attr === 'srcset') {
|
|
117
|
-
elem.attr(attr,
|
|
125
|
+
elem.attr(attr, stringifySrcset(replaceValue as SrcSetDefinition[]));
|
|
118
126
|
} else if (attr) {
|
|
119
127
|
elem.attr(attr, replaceValue as string);
|
|
120
128
|
} else {
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import {load} from 'cheerio';
|
|
2
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
7
|
-
import type {
|
|
2
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {toString} from '../util.js';
|
|
7
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
8
|
+
import type {CheerioStatic} from '../types.js';
|
|
8
9
|
|
|
9
10
|
export async function processSiteMap(
|
|
10
11
|
res: DownloadResource,
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
2
|
-
import type {StaticDownloadOptions} from '../options';
|
|
3
|
-
import {Resource, ResourceEncoding
|
|
4
|
-
import
|
|
1
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
2
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
3
|
+
import type {Resource, ResourceEncoding} from '../resource.js';
|
|
4
|
+
import {ResourceType} from '../resource.js';
|
|
5
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
5
6
|
|
|
6
7
|
// https://developer.mozilla.org/docs/Web/HTTP/Headers/SourceMap
|
|
7
8
|
export const SOURCE_MAP_HEADER = 'SourceMap'.toLowerCase();
|
|
@@ -1,12 +1,13 @@
|
|
|
1
|
-
import type {SourceDefinition} from '../sources';
|
|
2
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
import
|
|
1
|
+
import type {SourceDefinition} from '../sources.js';
|
|
2
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {error, skip} from '../logger/logger.js';
|
|
7
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
8
|
+
import {parseHtml} from './adapters.js';
|
|
9
|
+
import {getResourceBodyFromHtml} from './save-html-to-disk.js';
|
|
10
|
+
import type {Cheerio, CheerioStatic} from '../types.js';
|
|
10
11
|
|
|
11
12
|
const svgSelectors: SourceDefinition[] = [
|
|
12
13
|
{selector: '*[xlink\\:href]', attr: 'xlink:href', type: ResourceType.Binary},
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import * as path from 'path';
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import type {
|
|
5
|
-
import
|
|
6
|
-
import {
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import type {Stats} from 'node:fs';
|
|
3
|
+
import {promises} from 'node:fs';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import type {DownloadResource, RequestOptions} from './types.js';
|
|
7
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
8
|
+
import {error as errorLogger} from '../logger/logger.js';
|
|
7
9
|
|
|
8
10
|
const FILE_PREFIX = 'file://';
|
|
9
11
|
|
|
@@ -27,7 +29,7 @@ export async function readOrCopyLocalResource(
|
|
|
27
29
|
return;
|
|
28
30
|
}
|
|
29
31
|
// index.html handling
|
|
30
|
-
let stats: Stats | void;
|
|
32
|
+
let stats: Stats | void = void 0;
|
|
31
33
|
if (res.type === ResourceType.Html) {
|
|
32
34
|
stats = await promises.stat(fileSrcPath);
|
|
33
35
|
if (stats.isDirectory()) {
|
|
@@ -1,16 +1,12 @@
|
|
|
1
|
-
import path from 'path';
|
|
1
|
+
import path from 'node:path';
|
|
2
2
|
import URI from 'urijs';
|
|
3
|
-
import type {DownloadResource} from './types';
|
|
4
|
-
import type {StaticDownloadOptions} from '../options';
|
|
5
|
-
import {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
} from '../resource';
|
|
11
|
-
import {escapePath} from '../util';
|
|
12
|
-
import {writeFile} from '../io';
|
|
13
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
3
|
+
import type {DownloadResource} from './types.js';
|
|
4
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
5
|
+
import type {ResourceBody, ResourceEncoding} from '../resource.js';
|
|
6
|
+
import {ResourceType, urlOfSavePath} from '../resource.js';
|
|
7
|
+
import {escapePath} from '../util.js';
|
|
8
|
+
import {writeFile} from '../io.js';
|
|
9
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
14
10
|
|
|
15
11
|
export function getResourceBodyFromHtml(
|
|
16
12
|
res: DownloadResource & { type: ResourceType.Html },
|
|
@@ -46,7 +42,7 @@ export async function saveHtmlToDisk(
|
|
|
46
42
|
}
|
|
47
43
|
const localRoot: string = res.localRoot ?? options.localRoot;
|
|
48
44
|
// https://github.com/website-local/website-scrap-engine/issues/174
|
|
49
|
-
let mtime: number | void;
|
|
45
|
+
let mtime: number | void = void 0;
|
|
50
46
|
if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
|
|
51
47
|
mtime = Date.parse(res.meta.headers?.['last-modified']);
|
|
52
48
|
}
|