website-scrap-engine 0.7.2 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/downloader/adjust-concurrency.d.ts +2 -1
- package/lib/downloader/adjust-concurrency.d.ts.map +1 -0
- package/lib/downloader/adjust-concurrency.js +4 -8
- package/lib/downloader/adjust-concurrency.js.map +1 -1
- package/lib/downloader/index.d.ts +9 -8
- package/lib/downloader/index.d.ts.map +1 -0
- package/lib/downloader/index.js +8 -40
- package/lib/downloader/index.js.map +1 -1
- package/lib/downloader/main.d.ts +15 -6
- package/lib/downloader/main.d.ts.map +1 -0
- package/lib/downloader/main.js +49 -32
- package/lib/downloader/main.js.map +1 -1
- package/lib/downloader/multi.d.ts +7 -5
- package/lib/downloader/multi.d.ts.map +1 -0
- package/lib/downloader/multi.js +10 -17
- package/lib/downloader/multi.js.map +1 -1
- package/lib/downloader/pipeline-executor-impl.d.ts +8 -7
- package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -0
- package/lib/downloader/pipeline-executor-impl.js +1 -5
- package/lib/downloader/pipeline-executor-impl.js.map +1 -1
- package/lib/downloader/single.d.ts +4 -3
- package/lib/downloader/single.d.ts.map +1 -0
- package/lib/downloader/single.js +7 -11
- package/lib/downloader/single.js.map +1 -1
- package/lib/downloader/types.d.ts +4 -4
- package/lib/downloader/types.d.ts.map +1 -0
- package/lib/downloader/types.js +2 -5
- package/lib/downloader/types.js.map +1 -1
- package/lib/downloader/worker-pool.d.ts +6 -7
- package/lib/downloader/worker-pool.d.ts.map +1 -0
- package/lib/downloader/worker-pool.js +7 -35
- package/lib/downloader/worker-pool.js.map +1 -1
- package/lib/downloader/worker-type.d.ts +4 -3
- package/lib/downloader/worker-type.d.ts.map +1 -0
- package/lib/downloader/worker-type.js +1 -2
- package/lib/downloader/worker.d.ts +1 -0
- package/lib/downloader/worker.d.ts.map +1 -0
- package/lib/downloader/worker.js +52 -27
- package/lib/downloader/worker.js.map +1 -1
- package/lib/index.d.ts +9 -8
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +7 -33
- package/lib/index.js.map +1 -1
- package/lib/io.d.ts +2 -1
- package/lib/io.d.ts.map +1 -0
- package/lib/io.js +17 -25
- package/lib/io.js.map +1 -1
- package/lib/life-cycle/adapters.d.ts +7 -5
- package/lib/life-cycle/adapters.d.ts.map +1 -0
- package/lib/life-cycle/adapters.js +18 -30
- package/lib/life-cycle/adapters.js.map +1 -1
- package/lib/life-cycle/default-life-cycle.d.ts +2 -1
- package/lib/life-cycle/default-life-cycle.d.ts.map +1 -0
- package/lib/life-cycle/default-life-cycle.js +28 -32
- package/lib/life-cycle/default-life-cycle.js.map +1 -1
- package/lib/life-cycle/detect-resource-type.d.ts +2 -1
- package/lib/life-cycle/detect-resource-type.d.ts.map +1 -0
- package/lib/life-cycle/detect-resource-type.js +12 -17
- package/lib/life-cycle/detect-resource-type.js.map +1 -1
- package/lib/life-cycle/download-resource.d.ts +6 -7
- package/lib/life-cycle/download-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-resource.js +49 -52
- package/lib/life-cycle/download-resource.js.map +1 -1
- package/lib/life-cycle/download-streaming-resource.d.ts +6 -5
- package/lib/life-cycle/download-streaming-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-streaming-resource.js +39 -74
- package/lib/life-cycle/download-streaming-resource.js.map +1 -1
- package/lib/life-cycle/index.d.ts +16 -15
- package/lib/life-cycle/index.d.ts.map +1 -0
- package/lib/life-cycle/index.js +14 -59
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/pipeline-executor.d.ts +7 -6
- package/lib/life-cycle/pipeline-executor.d.ts.map +1 -0
- package/lib/life-cycle/pipeline-executor.js +1 -2
- package/lib/life-cycle/process-css.d.ts +5 -4
- package/lib/life-cycle/process-css.d.ts.map +1 -0
- package/lib/life-cycle/process-css.js +10 -18
- package/lib/life-cycle/process-css.js.map +1 -1
- package/lib/life-cycle/process-html-meta.d.ts +4 -3
- package/lib/life-cycle/process-html-meta.d.ts.map +1 -0
- package/lib/life-cycle/process-html-meta.js +11 -15
- package/lib/life-cycle/process-html-meta.js.map +1 -1
- package/lib/life-cycle/process-html.d.ts +4 -3
- package/lib/life-cycle/process-html.d.ts.map +1 -0
- package/lib/life-cycle/process-html.js +61 -43
- package/lib/life-cycle/process-html.js.map +1 -1
- package/lib/life-cycle/process-site-map.d.ts +4 -3
- package/lib/life-cycle/process-site-map.d.ts.map +1 -0
- package/lib/life-cycle/process-site-map.js +7 -11
- package/lib/life-cycle/process-site-map.js.map +1 -1
- package/lib/life-cycle/process-source-map.d.ts +4 -4
- package/lib/life-cycle/process-source-map.d.ts.map +1 -0
- package/lib/life-cycle/process-source-map.js +16 -21
- package/lib/life-cycle/process-source-map.js.map +1 -1
- package/lib/life-cycle/process-svg.d.ts +4 -3
- package/lib/life-cycle/process-svg.d.ts.map +1 -0
- package/lib/life-cycle/process-svg.js +17 -21
- package/lib/life-cycle/process-svg.js.map +1 -1
- package/lib/life-cycle/read-or-copy-local-resource.d.ts +4 -3
- package/lib/life-cycle/read-or-copy-local-resource.d.ts.map +1 -0
- package/lib/life-cycle/read-or-copy-local-resource.js +15 -42
- package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
- package/lib/life-cycle/save-html-to-disk.d.ts +6 -4
- package/lib/life-cycle/save-html-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-html-to-disk.js +24 -33
- package/lib/life-cycle/save-html-to-disk.js.map +1 -1
- package/lib/life-cycle/save-resource-to-disk.d.ts +4 -3
- package/lib/life-cycle/save-resource-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-resource-to-disk.js +10 -17
- package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
- package/lib/life-cycle/skip-links.d.ts +1 -0
- package/lib/life-cycle/skip-links.d.ts.map +1 -0
- package/lib/life-cycle/skip-links.js +6 -10
- package/lib/life-cycle/skip-links.js.map +1 -1
- package/lib/life-cycle/types.d.ts +8 -7
- package/lib/life-cycle/types.d.ts.map +1 -0
- package/lib/life-cycle/types.js +1 -2
- package/lib/logger/config-logger.d.ts +2 -1
- package/lib/logger/config-logger.d.ts.map +1 -0
- package/lib/logger/config-logger.js +4 -30
- package/lib/logger/config-logger.js.map +1 -1
- package/lib/logger/logger-worker.d.ts +3 -2
- package/lib/logger/logger-worker.d.ts.map +1 -0
- package/lib/logger/logger-worker.js +11 -13
- package/lib/logger/logger-worker.js.map +1 -1
- package/lib/logger/logger.d.ts +2 -1
- package/lib/logger/logger.d.ts.map +1 -0
- package/lib/logger/logger.js +15 -17
- package/lib/logger/logger.js.map +1 -1
- package/lib/options.d.ts +9 -8
- package/lib/options.d.ts.map +1 -0
- package/lib/options.js +22 -32
- package/lib/options.js.map +1 -1
- package/lib/resource.d.ts +3 -4
- package/lib/resource.d.ts.map +1 -0
- package/lib/resource.js +34 -70
- package/lib/resource.js.map +1 -1
- package/lib/sources.d.ts +2 -1
- package/lib/sources.d.ts.map +1 -0
- package/lib/sources.js +9 -12
- package/lib/sources.js.map +1 -1
- package/lib/types.d.ts +1 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +1 -2
- package/lib/util.d.ts +4 -3
- package/lib/util.d.ts.map +1 -0
- package/lib/util.js +17 -34
- package/lib/util.js.map +1 -1
- package/package.json +18 -20
- package/src/downloader/adjust-concurrency.ts +2 -2
- package/src/downloader/index.ts +8 -8
- package/src/downloader/main.ts +50 -28
- package/src/downloader/multi.ts +11 -10
- package/src/downloader/pipeline-executor-impl.ts +7 -7
- package/src/downloader/single.ts +9 -6
- package/src/downloader/types.ts +3 -3
- package/src/downloader/worker-pool.ts +9 -9
- package/src/downloader/worker-type.ts +3 -3
- package/src/downloader/worker.ts +51 -29
- package/src/index.ts +8 -8
- package/src/io.ts +6 -6
- package/src/life-cycle/adapters.ts +7 -6
- package/src/life-cycle/css-url-parser.d.ts +1 -1
- package/src/life-cycle/default-life-cycle.ts +15 -15
- package/src/life-cycle/detect-resource-type.ts +2 -2
- package/src/life-cycle/download-resource.ts +45 -20
- package/src/life-cycle/download-streaming-resource.ts +20 -18
- package/src/life-cycle/index.ts +15 -15
- package/src/life-cycle/pipeline-executor.ts +6 -6
- package/src/life-cycle/process-css.ts +6 -5
- package/src/life-cycle/process-html-meta.ts +7 -6
- package/src/life-cycle/process-html.ts +74 -32
- package/src/life-cycle/process-site-map.ts +7 -6
- package/src/life-cycle/process-source-map.ts +5 -4
- package/src/life-cycle/process-svg.ts +10 -9
- package/src/life-cycle/read-or-copy-local-resource.ts +9 -7
- package/src/life-cycle/save-html-to-disk.ts +9 -13
- package/src/life-cycle/save-resource-to-disk.ts +6 -6
- package/src/life-cycle/types.ts +7 -7
- package/src/logger/config-logger.ts +5 -3
- package/src/logger/logger-worker.ts +8 -4
- package/src/logger/logger.ts +6 -4
- package/src/options.ts +16 -19
- package/src/resource.ts +10 -5
- package/src/sources.ts +1 -1
- package/src/util.ts +6 -10
- package/tsconfig.json +6 -2
package/src/downloader/worker.ts
CHANGED
|
@@ -1,46 +1,49 @@
|
|
|
1
|
-
import {parentPort, workerData} from 'worker_threads';
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
import {
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
} from '
|
|
14
|
-
import {
|
|
15
|
-
import {importDefaultFromPath} from '../util';
|
|
16
|
-
import {DownloadWorkerMessage, WorkerMessageType} from './types';
|
|
17
|
-
import {PipelineExecutorImpl} from './pipeline-executor-impl';
|
|
1
|
+
import {parentPort, workerData} from 'node:worker_threads';
|
|
2
|
+
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
|
|
3
|
+
import {mergeOverrideOptions} from '../options.js';
|
|
4
|
+
import type {
|
|
5
|
+
DownloadResource,
|
|
6
|
+
SubmitResourceFunc
|
|
7
|
+
} from '../life-cycle/types.js';
|
|
8
|
+
import type {RawResource, Resource} from '../resource.js';
|
|
9
|
+
import {normalizeResource, prepareResourceForClone} from '../resource.js';
|
|
10
|
+
import {skip} from '../logger/logger.js';
|
|
11
|
+
import {importDefaultFromPath} from '../util.js';
|
|
12
|
+
import type {DownloadWorkerMessage} from './types.js';
|
|
13
|
+
import {WorkerMessageType} from './types.js';
|
|
14
|
+
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
|
|
18
15
|
// noinspection ES6PreferShortImport
|
|
19
|
-
import type {PipelineExecutor} from '../life-cycle/pipeline-executor';
|
|
20
|
-
import type {WorkerTaskMessage} from './worker-type';
|
|
16
|
+
import type {PipelineExecutor} from '../life-cycle/pipeline-executor.js';
|
|
17
|
+
import type {WorkerTaskMessage} from './worker-type.js';
|
|
21
18
|
|
|
22
19
|
const {pathToOptions, overrideOptions}: {
|
|
23
20
|
pathToOptions: string,
|
|
24
21
|
overrideOptions?: Partial<StaticDownloadOptions>
|
|
25
22
|
} = workerData;
|
|
26
23
|
|
|
27
|
-
const
|
|
28
|
-
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
|
29
|
-
mergeOverrideOptions(importDefaultFromPath(pathToOptions), overrideOptions);
|
|
24
|
+
const asyncOptions: Promise<DownloadOptions> = importDefaultFromPath(pathToOptions);
|
|
30
25
|
|
|
31
|
-
const
|
|
32
|
-
|
|
26
|
+
const asyncPipeline = asyncOptions.then(options => {
|
|
27
|
+
options = mergeOverrideOptions(options, overrideOptions);
|
|
33
28
|
|
|
34
|
-
|
|
29
|
+
const pipeline: PipelineExecutor =
|
|
30
|
+
new PipelineExecutorImpl(options, options.req, options);
|
|
35
31
|
|
|
36
|
-
|
|
32
|
+
options.configureLogger(options.localRoot, options.logSubDir || '');
|
|
33
|
+
|
|
34
|
+
const init = pipeline.init(pipeline);
|
|
35
|
+
if (init && (init as Promise<void>).then) {
|
|
36
|
+
return init.then(() => pipeline);
|
|
37
|
+
}
|
|
38
|
+
return pipeline;
|
|
39
|
+
});
|
|
37
40
|
|
|
38
41
|
parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) => {
|
|
39
42
|
const collectedResource: RawResource[] = [];
|
|
40
43
|
let error: Error | unknown | void;
|
|
41
44
|
let redirectedUrl: string | undefined;
|
|
42
45
|
try {
|
|
43
|
-
await
|
|
46
|
+
const pipeline = await asyncPipeline;
|
|
44
47
|
const res = msg.body;
|
|
45
48
|
const downloadResource: DownloadResource = normalizeResource(res) as DownloadResource;
|
|
46
49
|
const submit: SubmitResourceFunc = (resources: Resource | Resource[]) => {
|
|
@@ -67,8 +70,27 @@ parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) =
|
|
|
67
70
|
redirectedUrl = processedResource.redirectedUrl;
|
|
68
71
|
}
|
|
69
72
|
} catch (e) {
|
|
70
|
-
//
|
|
71
|
-
|
|
73
|
+
// handle if object could not be cloned here
|
|
74
|
+
// https://github.com/website-local/website-scrap-engine/issues/340
|
|
75
|
+
try {
|
|
76
|
+
// should always be
|
|
77
|
+
if (typeof structuredClone === 'function') {
|
|
78
|
+
error = structuredClone(e);
|
|
79
|
+
} else {
|
|
80
|
+
// this is the old behavior before this
|
|
81
|
+
error = e;
|
|
82
|
+
}
|
|
83
|
+
} catch {
|
|
84
|
+
// can not clone, so no need to get the full error here
|
|
85
|
+
if (e && typeof e === 'object') {
|
|
86
|
+
const clone: Record<string, string> = {};
|
|
87
|
+
for (const k in e) {
|
|
88
|
+
clone[k] = String((e as Record<string, unknown>)[k]);
|
|
89
|
+
}
|
|
90
|
+
} else {
|
|
91
|
+
error = String(e);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
72
94
|
} finally {
|
|
73
95
|
const message: DownloadWorkerMessage = {
|
|
74
96
|
taskId: msg.taskId,
|
package/src/index.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
export * as logger from './logger/logger';
|
|
2
|
-
export * as downloader from './downloader/index';
|
|
3
|
-
export * as lifeCycle from './life-cycle/index';
|
|
4
|
-
export * as io from './io';
|
|
5
|
-
export * as options from './options';
|
|
6
|
-
export * as resource from './resource';
|
|
7
|
-
export {SourceDefinition} from './sources';
|
|
8
|
-
export * as util from './util';
|
|
1
|
+
export * as logger from './logger/logger.js';
|
|
2
|
+
export * as downloader from './downloader/index.js';
|
|
3
|
+
export * as lifeCycle from './life-cycle/index.js';
|
|
4
|
+
export * as io from './io.js';
|
|
5
|
+
export * as options from './options.js';
|
|
6
|
+
export * as resource from './resource.js';
|
|
7
|
+
export type {SourceDefinition} from './sources.js';
|
|
8
|
+
export * as util from './util.js';
|
package/src/io.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import type {ObjectEncodingOptions} from 'fs';
|
|
2
|
-
import fs from 'fs';
|
|
3
|
-
import {dirname} from 'path';
|
|
1
|
+
import type {ObjectEncodingOptions} from 'node:fs';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import {dirname} from 'node:path';
|
|
4
4
|
import {mkdirp} from 'mkdirp';
|
|
5
|
-
import type {ResourceBody, ResourceEncoding} from './resource';
|
|
6
|
-
import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger';
|
|
5
|
+
import type {ResourceBody, ResourceEncoding} from './resource.js';
|
|
6
|
+
import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger.js';
|
|
7
7
|
|
|
8
8
|
export const mkdirRetry = async (dir: string, retry = 3): Promise<void> => {
|
|
9
9
|
let error: unknown | void;
|
|
@@ -40,7 +40,7 @@ export const writeFile = async (
|
|
|
40
40
|
await mkdirRetry(dir);
|
|
41
41
|
}
|
|
42
42
|
let fileData: Uint8Array | string;
|
|
43
|
-
let options: ObjectEncodingOptions | void;
|
|
43
|
+
let options: ObjectEncodingOptions | void = void 0;
|
|
44
44
|
if (typeof data === 'string') {
|
|
45
45
|
fileData = data;
|
|
46
46
|
options = {encoding};
|
package/src/life-cycle/adapters.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {load} from 'cheerio';
|
|
2
|
-
import {Resource, ResourceEncoding
|
|
2
|
+
import type {Resource, ResourceEncoding} from '../resource.js';
|
|
3
|
+
import {ResourceType} from '../resource.js';
|
|
3
4
|
import type {
|
|
4
5
|
AsyncResult,
|
|
5
6
|
DownloadResource,
|
|
@@ -7,11 +8,11 @@ import type {
|
|
|
7
8
|
ProcessResourceAfterDownloadFunc,
|
|
8
9
|
ProcessResourceBeforeDownloadFunc,
|
|
9
10
|
SubmitResourceFunc
|
|
10
|
-
} from './types';
|
|
11
|
-
import {toString} from '../util';
|
|
12
|
-
import type {StaticDownloadOptions} from '../options';
|
|
13
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
14
|
-
import type {Cheerio, CheerioStatic} from '../types';
|
|
11
|
+
} from './types.js';
|
|
12
|
+
import {toString} from '../util.js';
|
|
13
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
14
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
15
|
+
import type {Cheerio, CheerioStatic} from '../types.js';
|
|
15
16
|
|
|
16
17
|
export interface SkipProcessFunc {
|
|
17
18
|
(url: string, element: Cheerio | null, parent: Resource | null): boolean;
|
package/src/life-cycle/default-life-cycle.ts
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import type {ProcessingLifeCycle} from './types';
|
|
2
|
-
import {skipLinks} from './skip-links';
|
|
3
|
-
import {detectResourceType} from './detect-resource-type';
|
|
4
|
-
import {createResource} from '../resource';
|
|
5
|
-
import {downloadResource} from './download-resource';
|
|
6
|
-
import {processHtml} from './process-html';
|
|
7
|
-
import {processHtmlMetaRefresh} from './process-html-meta';
|
|
8
|
-
import {processCss} from './process-css';
|
|
9
|
-
import {processSiteMap} from './process-site-map';
|
|
10
|
-
import {processSvg} from './process-svg';
|
|
11
|
-
import {saveHtmlToDisk} from './save-html-to-disk';
|
|
12
|
-
import {saveResourceToDisk} from './save-resource-to-disk';
|
|
13
|
-
import {processRedirectedUrl} from './adapters';
|
|
14
|
-
import {downloadStreamingResource} from './download-streaming-resource';
|
|
15
|
-
import {readOrCopyLocalResource} from './read-or-copy-local-resource';
|
|
1
|
+
import type {ProcessingLifeCycle} from './types.js';
|
|
2
|
+
import {skipLinks} from './skip-links.js';
|
|
3
|
+
import {detectResourceType} from './detect-resource-type.js';
|
|
4
|
+
import {createResource} from '../resource.js';
|
|
5
|
+
import {downloadResource} from './download-resource.js';
|
|
6
|
+
import {processHtml} from './process-html.js';
|
|
7
|
+
import {processHtmlMetaRefresh} from './process-html-meta.js';
|
|
8
|
+
import {processCss} from './process-css.js';
|
|
9
|
+
import {processSiteMap} from './process-site-map.js';
|
|
10
|
+
import {processSvg} from './process-svg.js';
|
|
11
|
+
import {saveHtmlToDisk} from './save-html-to-disk.js';
|
|
12
|
+
import {saveResourceToDisk} from './save-resource-to-disk.js';
|
|
13
|
+
import {processRedirectedUrl} from './adapters.js';
|
|
14
|
+
import {downloadStreamingResource} from './download-streaming-resource.js';
|
|
15
|
+
import {readOrCopyLocalResource} from './read-or-copy-local-resource.js';
|
|
16
16
|
|
|
17
17
|
/**
|
|
18
18
|
* Get a copy of default life cycle
|
package/src/life-cycle/download-resource.ts
CHANGED
|
@@ -1,24 +1,22 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
import type {DownloadResource, RequestOptions} from './types';
|
|
10
|
-
import {generateSavePath, Resource, ResourceType} from '../resource';
|
|
11
|
-
import type {StaticDownloadOptions} from '../options';
|
|
12
|
-
import * as logger from '../logger/logger';
|
|
13
|
-
import {isUrlHttp, sleep} from '../util';
|
|
1
|
+
import type {BeforeRetryHook, OptionsInit, RequestError, Response} from 'got';
|
|
2
|
+
import got, {TimeoutError} from 'got';
|
|
3
|
+
import type {DownloadResource, RequestOptions} from './types.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {generateSavePath, ResourceType} from '../resource.js';
|
|
6
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
7
|
+
import * as logger from '../logger/logger.js';
|
|
8
|
+
import {isUrlHttp, sleep} from '../util.js';
|
|
14
9
|
import URI from 'urijs';
|
|
15
10
|
|
|
16
11
|
/** Take logs before retry */
|
|
17
12
|
export const beforeRetryHook: BeforeRetryHook = (
|
|
18
|
-
|
|
19
|
-
error: RequestError | undefined,
|
|
13
|
+
error: RequestError,
|
|
20
14
|
retryCount: number | undefined
|
|
21
15
|
) => {
|
|
16
|
+
const options = error.options;
|
|
17
|
+
if (!options) {
|
|
18
|
+
return;
|
|
19
|
+
}
|
|
22
20
|
if (!error) {
|
|
23
21
|
logger.retry.warn(retryCount, String(options.url));
|
|
24
22
|
return;
|
|
@@ -49,15 +47,15 @@ export interface DownloadError extends Partial<Error> {
|
|
|
49
47
|
*/
|
|
50
48
|
export async function getRetry(
|
|
51
49
|
url: string,
|
|
52
|
-
options:
|
|
50
|
+
options: OptionsInit
|
|
53
51
|
): Promise<Response<Buffer | string> | void> {
|
|
54
|
-
let res: Response<Buffer | string> | void;
|
|
55
|
-
let err: DownloadError | void, optionsClone:
|
|
52
|
+
let res: Response<Buffer | string> | void = void 0;
|
|
53
|
+
let err: DownloadError | void = void 0, optionsClone: OptionsInit;
|
|
56
54
|
for (let i = 0; i < 25; i++) {
|
|
57
55
|
err = void 0;
|
|
58
56
|
try {
|
|
59
57
|
optionsClone = Object.assign({}, options);
|
|
60
|
-
res = (await got(url, optionsClone)) as
|
|
58
|
+
res = (await got(url, optionsClone)) as Response<Buffer | string>;
|
|
61
59
|
if (!res || !res.body || !res.body.length) {
|
|
62
60
|
logger.retry.warn(i, url, 'manually retry on empty response or body',
|
|
63
61
|
res && res.body);
|
|
@@ -102,7 +100,7 @@ export async function requestForResource(
|
|
|
102
100
|
options?: StaticDownloadOptions
|
|
103
101
|
): Promise<DownloadResource | Resource | void> {
|
|
104
102
|
const downloadLink: string = encodeURI(decodeURI(res.downloadLink));
|
|
105
|
-
const reqOptions:
|
|
103
|
+
const reqOptions: OptionsInit = Object.assign({}, requestOptions);
|
|
106
104
|
reqOptions.responseType = 'buffer';
|
|
107
105
|
if (res.refUrl && res.refUrl !== downloadLink) {
|
|
108
106
|
const headers = Object.assign({}, reqOptions.headers);
|
|
@@ -167,6 +165,33 @@ export async function downloadResource(
|
|
|
167
165
|
return downloadedResource;
|
|
168
166
|
}
|
|
169
167
|
if (downloadedResource.type === ResourceType.Html) {
|
|
168
|
+
if (options.meta.warnForNonHtml) {
|
|
169
|
+
const headers = downloadedResource.meta.headers;
|
|
170
|
+
if (headers) {
|
|
171
|
+
const contentType =
|
|
172
|
+
headers['content-type'] || headers['Content-Type'];
|
|
173
|
+
let nonHtml = false;
|
|
174
|
+
if (typeof contentType === 'string') {
|
|
175
|
+
nonHtml = !contentType.includes('/html') &&
|
|
176
|
+
!contentType.includes('/xml') &&
|
|
177
|
+
!contentType.includes('application/xhtml+xml');
|
|
178
|
+
} else if (Array.isArray(contentType)) {
|
|
179
|
+
nonHtml = true;
|
|
180
|
+
for (const header of contentType) {
|
|
181
|
+
if (!header.includes('/html') &&
|
|
182
|
+
!header.includes('/xml') &&
|
|
183
|
+
!header.includes('application/xhtml+xml')) {
|
|
184
|
+
nonHtml = false;
|
|
185
|
+
break;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
if (nonHtml) {
|
|
190
|
+
logger.error.warn('Detected non-html content type',
|
|
191
|
+
downloadedResource.downloadLink, downloadedResource.rawUrl, contentType);
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
}
|
|
170
195
|
if (options.meta.detectIncompleteHtml &&
|
|
171
196
|
(typeof downloadedResource.body === 'string' ||
|
|
172
197
|
Buffer.isBuffer(downloadedResource.body))) {
|
package/src/life-cycle/download-streaming-resource.ts
CHANGED
|
@@ -1,21 +1,23 @@
|
|
|
1
|
-
import
|
|
2
|
-
import type {
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import type {WriteStream} from 'node:fs';
|
|
3
|
+
import {constants, createWriteStream, promises as fs} from 'node:fs';
|
|
4
|
+
import {pipeline} from 'node:stream';
|
|
5
|
+
import {promisify} from 'node:util';
|
|
6
|
+
import type {RequestError, Response} from 'got';
|
|
7
|
+
import got, {HTTPError} from 'got';
|
|
8
|
+
import type {Resource} from '../resource.js';
|
|
9
|
+
import {ResourceType} from '../resource.js';
|
|
6
10
|
import type {
|
|
7
11
|
AsyncResult,
|
|
8
12
|
DownloadResource,
|
|
9
13
|
DownloadResourceFunc,
|
|
10
14
|
RequestOptions
|
|
11
|
-
} from './types';
|
|
12
|
-
import {mkdirRetry} from '../io';
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import
|
|
17
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
18
|
-
import {isUrlHttp} from '../util';
|
|
15
|
+
} from './types.js';
|
|
16
|
+
import {mkdirRetry} from '../io.js';
|
|
17
|
+
import {error as errorLogger, retry as retryLogger} from '../logger/logger.js';
|
|
18
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
19
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
20
|
+
import {isUrlHttp} from '../util.js';
|
|
19
21
|
|
|
20
22
|
const promisifyPipeline = promisify(pipeline);
|
|
21
23
|
|
|
@@ -108,7 +110,7 @@ export async function streamingDownloadToFile(
|
|
|
108
110
|
rangeStart = undefined;
|
|
109
111
|
}
|
|
110
112
|
|
|
111
|
-
if (response.request.
|
|
113
|
+
if (response.request.isAborted) {
|
|
112
114
|
// Canceled while downloading
|
|
113
115
|
//- will throw a `CancelError` or `TimeoutError` error
|
|
114
116
|
return;
|
|
@@ -129,9 +131,9 @@ export async function streamingDownloadToFile(
|
|
|
129
131
|
return;
|
|
130
132
|
}
|
|
131
133
|
|
|
132
|
-
if (request._isAboutToError) {
|
|
133
|
-
|
|
134
|
-
}
|
|
134
|
+
// if (request._isAboutToError) {
|
|
135
|
+
// return;
|
|
136
|
+
// }
|
|
135
137
|
|
|
136
138
|
resolve(response);
|
|
137
139
|
});
|
|
@@ -222,7 +224,7 @@ export async function optionallySetLastModifiedTime(
|
|
|
222
224
|
res: Resource, options: StaticDownloadOptions
|
|
223
225
|
): Promise<void> {
|
|
224
226
|
// https://github.com/website-local/website-scrap-engine/issues/174
|
|
225
|
-
let mtime: number | void;
|
|
227
|
+
let mtime: number | void = void 0;
|
|
226
228
|
if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
|
|
227
229
|
mtime = Date.parse(res.meta.headers?.['last-modified']);
|
|
228
230
|
}
|
package/src/life-cycle/index.ts
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
export * as adapter from './adapters';
|
|
2
|
-
export {defaultLifeCycle} from './default-life-cycle';
|
|
3
|
-
export {detectResourceType} from './detect-resource-type';
|
|
1
|
+
export * as adapter from './adapters.js';
|
|
2
|
+
export {defaultLifeCycle} from './default-life-cycle.js';
|
|
3
|
+
export {detectResourceType} from './detect-resource-type.js';
|
|
4
4
|
export {
|
|
5
5
|
beforeRetryHook, getRetry, requestForResource, downloadResource
|
|
6
|
-
} from './download-resource';
|
|
6
|
+
} from './download-resource.js';
|
|
7
7
|
export {
|
|
8
8
|
streamingDownloadToFile,
|
|
9
9
|
downloadStreamingResource,
|
|
10
10
|
downloadStreamingResourceWithHook
|
|
11
|
-
} from './download-streaming-resource';
|
|
12
|
-
export {PipelineExecutor} from './pipeline-executor';
|
|
13
|
-
export {processCssText, processCss} from './process-css';
|
|
14
|
-
export {processHtml} from './process-html';
|
|
15
|
-
export {processHtmlMetaRefresh} from './process-html-meta';
|
|
16
|
-
export {processSiteMap} from './process-site-map';
|
|
17
|
-
export {processSvg} from './process-svg';
|
|
18
|
-
export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk';
|
|
19
|
-
export {saveResourceToDisk} from './save-resource-to-disk';
|
|
20
|
-
export {skipLinks} from './skip-links';
|
|
21
|
-
export * as types from './types';
|
|
11
|
+
} from './download-streaming-resource.js';
|
|
12
|
+
export type {PipelineExecutor} from './pipeline-executor.js';
|
|
13
|
+
export {processCssText, processCss} from './process-css.js';
|
|
14
|
+
export {processHtml} from './process-html.js';
|
|
15
|
+
export {processHtmlMetaRefresh} from './process-html-meta.js';
|
|
16
|
+
export {processSiteMap} from './process-site-map.js';
|
|
17
|
+
export {processSvg} from './process-svg.js';
|
|
18
|
+
export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk.js';
|
|
19
|
+
export {saveResourceToDisk} from './save-resource-to-disk.js';
|
|
20
|
+
export {skipLinks} from './skip-links.js';
|
|
21
|
+
export * as types from './types.js';
|
package/src/life-cycle/pipeline-executor.ts
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import type {Resource, ResourceEncoding, ResourceType} from '../resource';
|
|
2
|
-
import type {StaticDownloadOptions} from '../options';
|
|
1
|
+
import type {Resource, ResourceEncoding, ResourceType} from '../resource.js';
|
|
2
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
3
3
|
import type {
|
|
4
4
|
AsyncResult,
|
|
5
5
|
DownloadResource,
|
|
6
6
|
RequestOptions,
|
|
7
7
|
SubmitResourceFunc
|
|
8
|
-
} from './types';
|
|
9
|
-
import type {Cheerio} from '../types';
|
|
10
|
-
import type {DownloaderWithMeta} from '../downloader/types';
|
|
11
|
-
import type {WorkerInfo} from '../downloader/worker-pool';
|
|
8
|
+
} from './types.js';
|
|
9
|
+
import type {Cheerio} from '../types.js';
|
|
10
|
+
import type {DownloaderWithMeta} from '../downloader/types.js';
|
|
11
|
+
import type {WorkerInfo} from '../downloader/worker-pool.js';
|
|
12
12
|
|
|
13
13
|
export interface PipelineExecutor {
|
|
14
14
|
/**
|
package/src/life-cycle/process-css.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import parseCssUrls from 'css-url-parser';
|
|
2
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
2
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {toString} from '../util.js';
|
|
7
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
7
8
|
|
|
8
9
|
export async function processCssText(
|
|
9
10
|
cssText: string,
|
package/src/life-cycle/process-html-meta.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
2
|
-
import type {StaticDownloadOptions} from '../options';
|
|
3
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
1
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
2
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
3
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {parseHtml} from './adapters.js';
|
|
7
|
+
import {skip} from '../logger/logger.js';
|
|
7
8
|
|
|
8
9
|
/**
|
|
9
10
|
* Originally create by https://github.com/stevenvachon at
|
package/src/life-cycle/process-html.ts
CHANGED
|
@@ -1,34 +1,31 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
import {
|
|
8
|
-
import
|
|
9
|
-
import {
|
|
10
|
-
import
|
|
1
|
+
import type {SrcSetDefinition} from 'srcset';
|
|
2
|
+
import {parseSrcset, stringifySrcset} from 'srcset';
|
|
3
|
+
import {load} from 'cheerio';
|
|
4
|
+
import {sources as defaultSources} from '../sources.js';
|
|
5
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
6
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
7
|
+
import type {Resource} from '../resource.js';
|
|
8
|
+
import {ResourceType} from '../resource.js';
|
|
9
|
+
import {processCssText} from './process-css.js';
|
|
10
|
+
import {error, skip} from '../logger/logger.js';
|
|
11
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
12
|
+
import {parseHtml} from './adapters.js';
|
|
13
|
+
import type {Cheerio, CheerioStatic} from '../types.js';
|
|
11
14
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
submit: SubmitResourceFunc,
|
|
15
|
-
options: StaticDownloadOptions,
|
|
16
|
-
pipeline: PipelineExecutor): Promise<DownloadResource | void> {
|
|
17
|
-
if (res.type !== ResourceType.Html) {
|
|
18
|
-
return res;
|
|
19
|
-
}
|
|
20
|
-
const refUrl: string = res.redirectedUrl || res.url;
|
|
21
|
-
const savePath = refUrl === res.url ? res.savePath : undefined;
|
|
22
|
-
// useless since processRedirectedUrl enabled by default
|
|
23
|
-
// refUrl = await pipeline.linkRedirect(refUrl, null, res) || refUrl;
|
|
15
|
+
type Writeable<T> = { -readonly [P in keyof T]: T[P] };
|
|
16
|
+
type WriteableSrcSet = Writeable<SrcSetDefinition>;
|
|
24
17
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
18
|
+
async function processHtmlDoc(
|
|
19
|
+
options: StaticDownloadOptions,
|
|
20
|
+
doc: CheerioStatic,
|
|
21
|
+
res: DownloadResource,
|
|
22
|
+
pipeline: PipelineExecutor,
|
|
23
|
+
depth: number,
|
|
24
|
+
resources: Resource[],
|
|
25
|
+
refUrl: string,
|
|
26
|
+
savePath: string | undefined,
|
|
27
|
+
submit: SubmitResourceFunc
|
|
28
|
+
) {
|
|
32
29
|
const sources: typeof defaultSources = options.sources || defaultSources;
|
|
33
30
|
for (const {selector, attr, type} of sources) {
|
|
34
31
|
const elements: Cheerio = doc(selector);
|
|
@@ -54,7 +51,7 @@ export async function processHtml(
|
|
|
54
51
|
let links: string[], replaceValue: string | SrcSetDefinition[];
|
|
55
52
|
if (attr === 'srcset') {
|
|
56
53
|
try {
|
|
57
|
-
replaceValue =
|
|
54
|
+
replaceValue = parseSrcset(attrValue);
|
|
58
55
|
} catch (e) {
|
|
59
56
|
error.info('skipping invalid srcset', attrValue, e);
|
|
60
57
|
// should invalid srcset being removed?
|
|
@@ -104,7 +101,10 @@ export async function processHtml(
|
|
|
104
101
|
submit(resource);
|
|
105
102
|
}
|
|
106
103
|
if (attr === 'srcset') {
|
|
107
|
-
|
|
104
|
+
// 20241005: It's ok to do this
|
|
105
|
+
// I've looked into the source code of srcset 5.0.1
|
|
106
|
+
// and there is nothing preventing the return value to change
|
|
107
|
+
(replaceValue as WriteableSrcSet[])[linkIndex].url = resource.replacePath;
|
|
108
108
|
} else {
|
|
109
109
|
replaceValue = resource.replacePath;
|
|
110
110
|
// historical workaround here
|
|
@@ -114,7 +114,7 @@ export async function processHtml(
|
|
|
114
114
|
}
|
|
115
115
|
}
|
|
116
116
|
if (attr === 'srcset') {
|
|
117
|
-
elem.attr(attr,
|
|
117
|
+
elem.attr(attr, stringifySrcset(replaceValue as SrcSetDefinition[]));
|
|
118
118
|
} else if (attr) {
|
|
119
119
|
elem.attr(attr, replaceValue as string);
|
|
120
120
|
} else {
|
|
@@ -122,6 +122,48 @@ export async function processHtml(
|
|
|
122
122
|
}
|
|
123
123
|
}
|
|
124
124
|
}
|
|
125
|
+
const iframeSrcDocs = doc('iframe[srcdoc]');
|
|
126
|
+
|
|
127
|
+
for (let index = 0; index < iframeSrcDocs.length; index++) {
|
|
128
|
+
const elem = iframeSrcDocs.eq(index);
|
|
129
|
+
const attrValue: string | void = elem.attr('srcdoc');
|
|
130
|
+
if (!attrValue) {
|
|
131
|
+
continue;
|
|
132
|
+
}
|
|
133
|
+
try {
|
|
134
|
+
const iframeDoc = load(attrValue);
|
|
135
|
+
await processHtmlDoc(options, iframeDoc, res, pipeline, depth, resources, refUrl, savePath, submit);
|
|
136
|
+
const html = options.cheerioSerialize ?
|
|
137
|
+
iframeDoc.html(options.cheerioSerialize) : iframeDoc.html();
|
|
138
|
+
elem.attr('srcdoc', html);
|
|
139
|
+
} catch (e) {
|
|
140
|
+
error.info('can not parse iframe srcdoc', res.url, res.rawUrl, e);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
export async function processHtml(
|
|
146
|
+
res: DownloadResource,
|
|
147
|
+
submit: SubmitResourceFunc,
|
|
148
|
+
options: StaticDownloadOptions,
|
|
149
|
+
pipeline: PipelineExecutor
|
|
150
|
+
): Promise<DownloadResource | void> {
|
|
151
|
+
if (res.type !== ResourceType.Html) {
|
|
152
|
+
return res;
|
|
153
|
+
}
|
|
154
|
+
const refUrl: string = res.redirectedUrl || res.url;
|
|
155
|
+
const savePath = refUrl === res.url ? res.savePath : undefined;
|
|
156
|
+
// useless since processRedirectedUrl enabled by default
|
|
157
|
+
// refUrl = await pipeline.linkRedirect(refUrl, null, res) || refUrl;
|
|
158
|
+
|
|
159
|
+
const depth: number = res.depth + 1;
|
|
160
|
+
let doc: CheerioStatic | void = res.meta.doc;
|
|
161
|
+
if (!doc) {
|
|
162
|
+
res.meta.doc = doc = parseHtml(res, options);
|
|
163
|
+
}
|
|
164
|
+
// resources from inline css
|
|
165
|
+
const resources: Resource[] = [];
|
|
166
|
+
await processHtmlDoc(options, doc, res, pipeline, depth, resources, refUrl, savePath, submit);
|
|
125
167
|
if (resources.length) {
|
|
126
168
|
submit(resources);
|
|
127
169
|
}
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import {load} from 'cheerio';
|
|
2
|
-
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {Resource
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
7
|
-
import type {
|
|
2
|
+
import type {DownloadResource, SubmitResourceFunc} from './types.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import type {Resource} from '../resource.js';
|
|
5
|
+
import {ResourceType} from '../resource.js';
|
|
6
|
+
import {toString} from '../util.js';
|
|
7
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
8
|
+
import type {CheerioStatic} from '../types.js';
|
|
8
9
|
|
|
9
10
|
export async function processSiteMap(
|
|
10
11
|
res: DownloadResource,
|