website-scrap-engine 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/downloader/adjust-concurrency.d.ts +2 -1
- package/lib/downloader/adjust-concurrency.d.ts.map +1 -0
- package/lib/downloader/adjust-concurrency.js +4 -8
- package/lib/downloader/adjust-concurrency.js.map +1 -1
- package/lib/downloader/index.d.ts +9 -8
- package/lib/downloader/index.d.ts.map +1 -0
- package/lib/downloader/index.js +8 -40
- package/lib/downloader/index.js.map +1 -1
- package/lib/downloader/main.d.ts +15 -6
- package/lib/downloader/main.d.ts.map +1 -0
- package/lib/downloader/main.js +49 -32
- package/lib/downloader/main.js.map +1 -1
- package/lib/downloader/multi.d.ts +7 -5
- package/lib/downloader/multi.d.ts.map +1 -0
- package/lib/downloader/multi.js +10 -17
- package/lib/downloader/multi.js.map +1 -1
- package/lib/downloader/pipeline-executor-impl.d.ts +8 -7
- package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -0
- package/lib/downloader/pipeline-executor-impl.js +1 -5
- package/lib/downloader/pipeline-executor-impl.js.map +1 -1
- package/lib/downloader/single.d.ts +4 -3
- package/lib/downloader/single.d.ts.map +1 -0
- package/lib/downloader/single.js +7 -11
- package/lib/downloader/single.js.map +1 -1
- package/lib/downloader/types.d.ts +4 -4
- package/lib/downloader/types.d.ts.map +1 -0
- package/lib/downloader/types.js +2 -5
- package/lib/downloader/types.js.map +1 -1
- package/lib/downloader/worker-pool.d.ts +6 -7
- package/lib/downloader/worker-pool.d.ts.map +1 -0
- package/lib/downloader/worker-pool.js +7 -35
- package/lib/downloader/worker-pool.js.map +1 -1
- package/lib/downloader/worker-type.d.ts +4 -3
- package/lib/downloader/worker-type.d.ts.map +1 -0
- package/lib/downloader/worker-type.js +1 -2
- package/lib/downloader/worker.d.ts +1 -0
- package/lib/downloader/worker.d.ts.map +1 -0
- package/lib/downloader/worker.js +52 -27
- package/lib/downloader/worker.js.map +1 -1
- package/lib/index.d.ts +9 -8
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +7 -33
- package/lib/index.js.map +1 -1
- package/lib/io.d.ts +2 -1
- package/lib/io.d.ts.map +1 -0
- package/lib/io.js +17 -25
- package/lib/io.js.map +1 -1
- package/lib/life-cycle/adapters.d.ts +7 -5
- package/lib/life-cycle/adapters.d.ts.map +1 -0
- package/lib/life-cycle/adapters.js +18 -30
- package/lib/life-cycle/adapters.js.map +1 -1
- package/lib/life-cycle/default-life-cycle.d.ts +2 -1
- package/lib/life-cycle/default-life-cycle.d.ts.map +1 -0
- package/lib/life-cycle/default-life-cycle.js +28 -32
- package/lib/life-cycle/default-life-cycle.js.map +1 -1
- package/lib/life-cycle/detect-resource-type.d.ts +2 -1
- package/lib/life-cycle/detect-resource-type.d.ts.map +1 -0
- package/lib/life-cycle/detect-resource-type.js +12 -17
- package/lib/life-cycle/detect-resource-type.js.map +1 -1
- package/lib/life-cycle/download-resource.d.ts +6 -7
- package/lib/life-cycle/download-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-resource.js +23 -52
- package/lib/life-cycle/download-resource.js.map +1 -1
- package/lib/life-cycle/download-streaming-resource.d.ts +6 -5
- package/lib/life-cycle/download-streaming-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-streaming-resource.js +39 -74
- package/lib/life-cycle/download-streaming-resource.js.map +1 -1
- package/lib/life-cycle/index.d.ts +16 -15
- package/lib/life-cycle/index.d.ts.map +1 -0
- package/lib/life-cycle/index.js +14 -59
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/pipeline-executor.d.ts +7 -6
- package/lib/life-cycle/pipeline-executor.d.ts.map +1 -0
- package/lib/life-cycle/pipeline-executor.js +1 -2
- package/lib/life-cycle/process-css.d.ts +5 -4
- package/lib/life-cycle/process-css.d.ts.map +1 -0
- package/lib/life-cycle/process-css.js +10 -18
- package/lib/life-cycle/process-css.js.map +1 -1
- package/lib/life-cycle/process-html-meta.d.ts +4 -3
- package/lib/life-cycle/process-html-meta.d.ts.map +1 -0
- package/lib/life-cycle/process-html-meta.js +11 -15
- package/lib/life-cycle/process-html-meta.js.map +1 -1
- package/lib/life-cycle/process-html.d.ts +4 -3
- package/lib/life-cycle/process-html.d.ts.map +1 -0
- package/lib/life-cycle/process-html.js +27 -31
- package/lib/life-cycle/process-html.js.map +1 -1
- package/lib/life-cycle/process-site-map.d.ts +4 -3
- package/lib/life-cycle/process-site-map.d.ts.map +1 -0
- package/lib/life-cycle/process-site-map.js +7 -11
- package/lib/life-cycle/process-site-map.js.map +1 -1
- package/lib/life-cycle/process-source-map.d.ts +4 -4
- package/lib/life-cycle/process-source-map.d.ts.map +1 -0
- package/lib/life-cycle/process-source-map.js +16 -21
- package/lib/life-cycle/process-source-map.js.map +1 -1
- package/lib/life-cycle/process-svg.d.ts +4 -3
- package/lib/life-cycle/process-svg.d.ts.map +1 -0
- package/lib/life-cycle/process-svg.js +17 -21
- package/lib/life-cycle/process-svg.js.map +1 -1
- package/lib/life-cycle/read-or-copy-local-resource.d.ts +4 -3
- package/lib/life-cycle/read-or-copy-local-resource.d.ts.map +1 -0
- package/lib/life-cycle/read-or-copy-local-resource.js +15 -42
- package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
- package/lib/life-cycle/save-html-to-disk.d.ts +6 -4
- package/lib/life-cycle/save-html-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-html-to-disk.js +24 -33
- package/lib/life-cycle/save-html-to-disk.js.map +1 -1
- package/lib/life-cycle/save-resource-to-disk.d.ts +4 -3
- package/lib/life-cycle/save-resource-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-resource-to-disk.js +10 -17
- package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
- package/lib/life-cycle/skip-links.d.ts +1 -0
- package/lib/life-cycle/skip-links.d.ts.map +1 -0
- package/lib/life-cycle/skip-links.js +6 -10
- package/lib/life-cycle/skip-links.js.map +1 -1
- package/lib/life-cycle/types.d.ts +8 -7
- package/lib/life-cycle/types.d.ts.map +1 -0
- package/lib/life-cycle/types.js +1 -2
- package/lib/logger/config-logger.d.ts +2 -1
- package/lib/logger/config-logger.d.ts.map +1 -0
- package/lib/logger/config-logger.js +4 -30
- package/lib/logger/config-logger.js.map +1 -1
- package/lib/logger/logger-worker.d.ts +3 -2
- package/lib/logger/logger-worker.d.ts.map +1 -0
- package/lib/logger/logger-worker.js +11 -13
- package/lib/logger/logger-worker.js.map +1 -1
- package/lib/logger/logger.d.ts +2 -1
- package/lib/logger/logger.d.ts.map +1 -0
- package/lib/logger/logger.js +15 -17
- package/lib/logger/logger.js.map +1 -1
- package/lib/options.d.ts +8 -8
- package/lib/options.d.ts.map +1 -0
- package/lib/options.js +22 -32
- package/lib/options.js.map +1 -1
- package/lib/resource.d.ts +3 -4
- package/lib/resource.d.ts.map +1 -0
- package/lib/resource.js +34 -70
- package/lib/resource.js.map +1 -1
- package/lib/sources.d.ts +2 -1
- package/lib/sources.d.ts.map +1 -0
- package/lib/sources.js +9 -12
- package/lib/sources.js.map +1 -1
- package/lib/types.d.ts +1 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +1 -2
- package/lib/util.d.ts +4 -3
- package/lib/util.d.ts.map +1 -0
- package/lib/util.js +17 -34
- package/lib/util.js.map +1 -1
- package/package.json +19 -21
- package/src/downloader/adjust-concurrency.ts +2 -2
- package/src/downloader/index.ts +8 -8
- package/src/downloader/main.ts +50 -28
- package/src/downloader/multi.ts +11 -10
- package/src/downloader/pipeline-executor-impl.ts +7 -7
- package/src/downloader/single.ts +9 -6
- package/src/downloader/types.ts +3 -3
- package/src/downloader/worker-pool.ts +9 -9
- package/src/downloader/worker-type.ts +3 -3
- package/src/downloader/worker.ts +51 -29
- package/src/index.ts +8 -8
- package/src/io.ts +6 -6
- package/src/life-cycle/adapters.ts +7 -6
- package/src/life-cycle/css-url-parser.d.ts +1 -1
- package/src/life-cycle/default-life-cycle.ts +15 -15
- package/src/life-cycle/detect-resource-type.ts +2 -2
- package/src/life-cycle/download-resource.ts +18 -20
- package/src/life-cycle/download-streaming-resource.ts +20 -18
- package/src/life-cycle/index.ts +15 -15
- package/src/life-cycle/pipeline-executor.ts +6 -6
- package/src/life-cycle/process-css.ts +6 -5
- package/src/life-cycle/process-html-meta.ts +7 -6
- package/src/life-cycle/process-html.ts +21 -13
- package/src/life-cycle/process-site-map.ts +7 -6
- package/src/life-cycle/process-source-map.ts +5 -4
- package/src/life-cycle/process-svg.ts +10 -9
- package/src/life-cycle/read-or-copy-local-resource.ts +9 -7
- package/src/life-cycle/save-html-to-disk.ts +9 -13
- package/src/life-cycle/save-resource-to-disk.ts +6 -6
- package/src/life-cycle/types.ts +7 -7
- package/src/logger/config-logger.ts +5 -3
- package/src/logger/logger-worker.ts +8 -4
- package/src/logger/logger.ts +6 -4
- package/src/options.ts +15 -19
- package/src/resource.ts +10 -5
- package/src/sources.ts +1 -1
- package/src/util.ts +6 -10
- package/tsconfig.json +6 -2
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import path from 'path';
|
|
2
|
-
import type {DownloadResource} from './types';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {writeFile} from '../io';
|
|
5
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import type {DownloadResource} from './types.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import {writeFile} from '../io.js';
|
|
5
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
6
6
|
|
|
7
7
|
export async function saveResourceToDisk(
|
|
8
8
|
res: DownloadResource,
|
|
@@ -10,7 +10,7 @@ export async function saveResourceToDisk(
|
|
|
10
10
|
pipeline: PipelineExecutor): Promise<DownloadResource | void> {
|
|
11
11
|
const localRoot: string = res.localRoot ?? options.localRoot;
|
|
12
12
|
// https://github.com/website-local/website-scrap-engine/issues/174
|
|
13
|
-
let mtime: number | void;
|
|
13
|
+
let mtime: number | void = void 0;
|
|
14
14
|
if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
|
|
15
15
|
mtime = Date.parse(res.meta.headers?.['last-modified']);
|
|
16
16
|
}
|
package/src/life-cycle/types.ts
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type {OptionsInit as GotOptions} from 'got';
|
|
2
2
|
import type {
|
|
3
3
|
createResource,
|
|
4
4
|
GenerateSavePathFn,
|
|
5
5
|
Resource,
|
|
6
6
|
ResourceBody,
|
|
7
7
|
ResourceType
|
|
8
|
-
} from '../resource';
|
|
9
|
-
import type {StaticDownloadOptions} from '../options';
|
|
10
|
-
import type {PipelineExecutor} from './pipeline-executor';
|
|
11
|
-
import type {Cheerio} from '../types';
|
|
12
|
-
import type {DownloaderWithMeta} from '../downloader/types';
|
|
13
|
-
import type {WorkerInfo} from '../downloader/worker-pool';
|
|
8
|
+
} from '../resource.js';
|
|
9
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
10
|
+
import type {PipelineExecutor} from './pipeline-executor.js';
|
|
11
|
+
import type {Cheerio} from '../types.js';
|
|
12
|
+
import type {DownloaderWithMeta} from '../downloader/types.js';
|
|
13
|
+
import type {WorkerInfo} from '../downloader/worker-pool.js';
|
|
14
14
|
|
|
15
15
|
export type AsyncResult<T> = T | Promise<T>;
|
|
16
16
|
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
1
|
+
import type {Log4js} from 'log4js';
|
|
2
|
+
// https://github.com/jestjs/jest/issues/11563
|
|
3
|
+
import log4js from 'log4js';
|
|
4
|
+
import * as path from 'node:path';
|
|
3
5
|
|
|
4
6
|
export const configureLogger = (localRoot: string, subDir: string): Log4js =>
|
|
5
|
-
configure({
|
|
7
|
+
log4js.configure({
|
|
6
8
|
appenders: {
|
|
7
9
|
'retry': {
|
|
8
10
|
type: 'file',
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
1
|
+
import type {Logger} from 'log4js';
|
|
2
|
+
// https://github.com/jestjs/jest/issues/11563
|
|
3
|
+
import log4js from 'log4js';
|
|
4
|
+
import {parentPort} from 'node:worker_threads';
|
|
5
|
+
import type {LogWorkerMessage, WorkerLog} from '../downloader/worker-type.js';
|
|
6
|
+
import {WorkerMessageType} from '../downloader/types.js';
|
|
7
|
+
|
|
8
|
+
const getLogger = log4js.getLogger;
|
|
5
9
|
|
|
6
10
|
export const logLevels = [
|
|
7
11
|
'trace', 'debug', 'info', 'warn', 'error', 'fatal', 'mark'
|
package/src/logger/logger.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
import
|
|
1
|
+
import type {Logger} from 'log4js';
|
|
2
|
+
// https://github.com/jestjs/jest/issues/11563
|
|
3
|
+
import log4js from 'log4js';
|
|
4
|
+
import {isMainThread} from 'node:worker_threads';
|
|
5
|
+
import {getWorkerLogger} from './logger-worker.js';
|
|
4
6
|
|
|
5
7
|
const getLogger: typeof getWorkerLogger =
|
|
6
|
-
isMainThread ?
|
|
8
|
+
isMainThread ? log4js.getLogger : getWorkerLogger;
|
|
7
9
|
|
|
8
10
|
export const notFound: Logger = getLogger('notFound');
|
|
9
11
|
export const retry: Logger = getLogger('retry');
|
package/src/options.ts
CHANGED
|
@@ -1,22 +1,18 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
} from 'got/dist/source/as-promise/types';
|
|
7
|
-
import type {RequestError} from 'got/dist/source/core';
|
|
8
|
-
import {createResource, ResourceEncoding, ResourceType} from './resource';
|
|
9
|
-
import type {ProcessingLifeCycle, RequestOptions} from './life-cycle/types';
|
|
1
|
+
import type {RequestError, RetryFunction, RetryObject, TimeoutError} from 'got';
|
|
2
|
+
import got, {Options} from 'got';
|
|
3
|
+
import type {ResourceEncoding, ResourceType} from './resource.js';
|
|
4
|
+
import {createResource} from './resource.js';
|
|
5
|
+
import type {ProcessingLifeCycle, RequestOptions} from './life-cycle/types.js';
|
|
10
6
|
// noinspection ES6PreferShortImport
|
|
11
|
-
import {beforeRetryHook} from './life-cycle/download-resource';
|
|
12
|
-
import {error} from './logger/logger';
|
|
7
|
+
import {beforeRetryHook} from './life-cycle/download-resource.js';
|
|
8
|
+
import {error} from './logger/logger.js';
|
|
13
9
|
// noinspection ES6PreferShortImport
|
|
14
|
-
import {adjust} from './downloader/adjust-concurrency';
|
|
15
|
-
import {configureLogger} from './logger/config-logger';
|
|
16
|
-
import type {DownloaderWithMeta} from './downloader/types';
|
|
17
|
-
import {weakAssign} from './util';
|
|
18
|
-
import type {SourceDefinition} from './sources';
|
|
19
|
-
import type {CheerioOptionsInterface} from './types';
|
|
10
|
+
import {adjust} from './downloader/adjust-concurrency.js';
|
|
11
|
+
import {configureLogger} from './logger/config-logger.js';
|
|
12
|
+
import type {DownloaderWithMeta} from './downloader/types.js';
|
|
13
|
+
import {weakAssign} from './util.js';
|
|
14
|
+
import type {SourceDefinition} from './sources.js';
|
|
15
|
+
import type {CheerioOptionsInterface} from './types.js';
|
|
20
16
|
|
|
21
17
|
/**
|
|
22
18
|
* Extra options for custom life cycle
|
|
@@ -379,8 +375,8 @@ export function mergeOverrideOptions(
|
|
|
379
375
|
overrideOptions.meta = Object.assign(opt.meta, overrideOptions.meta);
|
|
380
376
|
}
|
|
381
377
|
if (opt.req && overrideOptions.req) {
|
|
382
|
-
|
|
383
|
-
|
|
378
|
+
const options = got.defaults.options;
|
|
379
|
+
overrideOptions.req = new Options(opt.req, overrideOptions.req, options);
|
|
384
380
|
}
|
|
385
381
|
return checkDownloadOptions(Object.assign(opt, overrideOptions));
|
|
386
382
|
}
|
package/src/resource.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import URI from 'urijs';
|
|
2
|
-
import type {IncomingHttpHeaders} from 'http';
|
|
3
|
-
import * as path from 'path';
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
2
|
+
import type {IncomingHttpHeaders} from 'node:http';
|
|
3
|
+
import * as path from 'node:path';
|
|
4
|
+
import {
|
|
5
|
+
escapePath,
|
|
6
|
+
isUrlHttp,
|
|
7
|
+
orderUrlSearch,
|
|
8
|
+
simpleHashString
|
|
9
|
+
} from './util.js';
|
|
10
|
+
import type {CheerioStatic} from './types.js';
|
|
11
|
+
import {error as log} from './logger/logger.js';
|
|
7
12
|
|
|
8
13
|
export enum ResourceType {
|
|
9
14
|
/**
|
package/src/sources.ts
CHANGED
package/src/util.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {createHash} from 'crypto';
|
|
2
|
-
import type {ResourceBody, ResourceEncoding} from './resource';
|
|
1
|
+
import {createHash} from 'node:crypto';
|
|
2
|
+
import type {ResourceBody, ResourceEncoding} from './resource.js';
|
|
3
3
|
|
|
4
4
|
const forbiddenChar = /[:*?"<>|&]|%3A|%2A|%3F|%22|%3C|%3E|%7C|%26/ig;
|
|
5
5
|
|
|
@@ -39,14 +39,10 @@ export const toString = (body: ResourceBody, encoding: ResourceEncoding): string
|
|
|
39
39
|
return stringValue;
|
|
40
40
|
};
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
if (mod && mod.__esModule && mod.default) {
|
|
47
|
-
return mod.default;
|
|
48
|
-
}
|
|
49
|
-
return mod;
|
|
42
|
+
export const importDefaultFromPath = <T>(path: string): Promise<T> => {
|
|
43
|
+
return import(path).then(mod => {
|
|
44
|
+
return mod.default || mod;
|
|
45
|
+
});
|
|
50
46
|
};
|
|
51
47
|
|
|
52
48
|
export const orderUrlSearch = (search: string): string => {
|
package/tsconfig.json
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"compilerOptions": {
|
|
3
|
-
"module": "
|
|
3
|
+
"module": "node16",
|
|
4
|
+
"moduleResolution": "node16",
|
|
4
5
|
"target": "es2018",
|
|
5
6
|
"sourceMap": true,
|
|
6
7
|
"newLine": "lf",
|
|
@@ -8,7 +9,10 @@
|
|
|
8
9
|
"declaration": true,
|
|
9
10
|
"esModuleInterop": true,
|
|
10
11
|
"removeComments": false,
|
|
11
|
-
"strict": true
|
|
12
|
+
"strict": true,
|
|
13
|
+
"declarationMap": true,
|
|
14
|
+
"allowJs": true,
|
|
15
|
+
"verbatimModuleSyntax": true
|
|
12
16
|
},
|
|
13
17
|
"include": [
|
|
14
18
|
"src"
|