website-scrap-engine 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/downloader/adjust-concurrency.d.ts +2 -1
- package/lib/downloader/adjust-concurrency.d.ts.map +1 -0
- package/lib/downloader/adjust-concurrency.js +4 -8
- package/lib/downloader/adjust-concurrency.js.map +1 -1
- package/lib/downloader/index.d.ts +9 -8
- package/lib/downloader/index.d.ts.map +1 -0
- package/lib/downloader/index.js +8 -40
- package/lib/downloader/index.js.map +1 -1
- package/lib/downloader/main.d.ts +15 -6
- package/lib/downloader/main.d.ts.map +1 -0
- package/lib/downloader/main.js +49 -32
- package/lib/downloader/main.js.map +1 -1
- package/lib/downloader/multi.d.ts +7 -5
- package/lib/downloader/multi.d.ts.map +1 -0
- package/lib/downloader/multi.js +10 -17
- package/lib/downloader/multi.js.map +1 -1
- package/lib/downloader/pipeline-executor-impl.d.ts +8 -7
- package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -0
- package/lib/downloader/pipeline-executor-impl.js +1 -5
- package/lib/downloader/pipeline-executor-impl.js.map +1 -1
- package/lib/downloader/single.d.ts +4 -3
- package/lib/downloader/single.d.ts.map +1 -0
- package/lib/downloader/single.js +7 -11
- package/lib/downloader/single.js.map +1 -1
- package/lib/downloader/types.d.ts +4 -4
- package/lib/downloader/types.d.ts.map +1 -0
- package/lib/downloader/types.js +2 -5
- package/lib/downloader/types.js.map +1 -1
- package/lib/downloader/worker-pool.d.ts +6 -7
- package/lib/downloader/worker-pool.d.ts.map +1 -0
- package/lib/downloader/worker-pool.js +7 -35
- package/lib/downloader/worker-pool.js.map +1 -1
- package/lib/downloader/worker-type.d.ts +4 -3
- package/lib/downloader/worker-type.d.ts.map +1 -0
- package/lib/downloader/worker-type.js +1 -2
- package/lib/downloader/worker.d.ts +1 -0
- package/lib/downloader/worker.d.ts.map +1 -0
- package/lib/downloader/worker.js +52 -27
- package/lib/downloader/worker.js.map +1 -1
- package/lib/index.d.ts +9 -8
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +7 -33
- package/lib/index.js.map +1 -1
- package/lib/io.d.ts +2 -1
- package/lib/io.d.ts.map +1 -0
- package/lib/io.js +17 -25
- package/lib/io.js.map +1 -1
- package/lib/life-cycle/adapters.d.ts +7 -5
- package/lib/life-cycle/adapters.d.ts.map +1 -0
- package/lib/life-cycle/adapters.js +18 -30
- package/lib/life-cycle/adapters.js.map +1 -1
- package/lib/life-cycle/default-life-cycle.d.ts +2 -1
- package/lib/life-cycle/default-life-cycle.d.ts.map +1 -0
- package/lib/life-cycle/default-life-cycle.js +28 -32
- package/lib/life-cycle/default-life-cycle.js.map +1 -1
- package/lib/life-cycle/detect-resource-type.d.ts +2 -1
- package/lib/life-cycle/detect-resource-type.d.ts.map +1 -0
- package/lib/life-cycle/detect-resource-type.js +12 -17
- package/lib/life-cycle/detect-resource-type.js.map +1 -1
- package/lib/life-cycle/download-resource.d.ts +6 -7
- package/lib/life-cycle/download-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-resource.js +23 -52
- package/lib/life-cycle/download-resource.js.map +1 -1
- package/lib/life-cycle/download-streaming-resource.d.ts +6 -5
- package/lib/life-cycle/download-streaming-resource.d.ts.map +1 -0
- package/lib/life-cycle/download-streaming-resource.js +39 -74
- package/lib/life-cycle/download-streaming-resource.js.map +1 -1
- package/lib/life-cycle/index.d.ts +16 -15
- package/lib/life-cycle/index.d.ts.map +1 -0
- package/lib/life-cycle/index.js +14 -59
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/pipeline-executor.d.ts +7 -6
- package/lib/life-cycle/pipeline-executor.d.ts.map +1 -0
- package/lib/life-cycle/pipeline-executor.js +1 -2
- package/lib/life-cycle/process-css.d.ts +5 -4
- package/lib/life-cycle/process-css.d.ts.map +1 -0
- package/lib/life-cycle/process-css.js +10 -18
- package/lib/life-cycle/process-css.js.map +1 -1
- package/lib/life-cycle/process-html-meta.d.ts +4 -3
- package/lib/life-cycle/process-html-meta.d.ts.map +1 -0
- package/lib/life-cycle/process-html-meta.js +11 -15
- package/lib/life-cycle/process-html-meta.js.map +1 -1
- package/lib/life-cycle/process-html.d.ts +4 -3
- package/lib/life-cycle/process-html.d.ts.map +1 -0
- package/lib/life-cycle/process-html.js +27 -31
- package/lib/life-cycle/process-html.js.map +1 -1
- package/lib/life-cycle/process-site-map.d.ts +4 -3
- package/lib/life-cycle/process-site-map.d.ts.map +1 -0
- package/lib/life-cycle/process-site-map.js +7 -11
- package/lib/life-cycle/process-site-map.js.map +1 -1
- package/lib/life-cycle/process-source-map.d.ts +4 -4
- package/lib/life-cycle/process-source-map.d.ts.map +1 -0
- package/lib/life-cycle/process-source-map.js +16 -21
- package/lib/life-cycle/process-source-map.js.map +1 -1
- package/lib/life-cycle/process-svg.d.ts +4 -3
- package/lib/life-cycle/process-svg.d.ts.map +1 -0
- package/lib/life-cycle/process-svg.js +17 -21
- package/lib/life-cycle/process-svg.js.map +1 -1
- package/lib/life-cycle/read-or-copy-local-resource.d.ts +4 -3
- package/lib/life-cycle/read-or-copy-local-resource.d.ts.map +1 -0
- package/lib/life-cycle/read-or-copy-local-resource.js +15 -42
- package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
- package/lib/life-cycle/save-html-to-disk.d.ts +6 -4
- package/lib/life-cycle/save-html-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-html-to-disk.js +24 -33
- package/lib/life-cycle/save-html-to-disk.js.map +1 -1
- package/lib/life-cycle/save-resource-to-disk.d.ts +4 -3
- package/lib/life-cycle/save-resource-to-disk.d.ts.map +1 -0
- package/lib/life-cycle/save-resource-to-disk.js +10 -17
- package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
- package/lib/life-cycle/skip-links.d.ts +1 -0
- package/lib/life-cycle/skip-links.d.ts.map +1 -0
- package/lib/life-cycle/skip-links.js +6 -10
- package/lib/life-cycle/skip-links.js.map +1 -1
- package/lib/life-cycle/types.d.ts +8 -7
- package/lib/life-cycle/types.d.ts.map +1 -0
- package/lib/life-cycle/types.js +1 -2
- package/lib/logger/config-logger.d.ts +2 -1
- package/lib/logger/config-logger.d.ts.map +1 -0
- package/lib/logger/config-logger.js +4 -30
- package/lib/logger/config-logger.js.map +1 -1
- package/lib/logger/logger-worker.d.ts +3 -2
- package/lib/logger/logger-worker.d.ts.map +1 -0
- package/lib/logger/logger-worker.js +11 -13
- package/lib/logger/logger-worker.js.map +1 -1
- package/lib/logger/logger.d.ts +2 -1
- package/lib/logger/logger.d.ts.map +1 -0
- package/lib/logger/logger.js +15 -17
- package/lib/logger/logger.js.map +1 -1
- package/lib/options.d.ts +8 -8
- package/lib/options.d.ts.map +1 -0
- package/lib/options.js +22 -32
- package/lib/options.js.map +1 -1
- package/lib/resource.d.ts +3 -4
- package/lib/resource.d.ts.map +1 -0
- package/lib/resource.js +34 -70
- package/lib/resource.js.map +1 -1
- package/lib/sources.d.ts +2 -1
- package/lib/sources.d.ts.map +1 -0
- package/lib/sources.js +9 -12
- package/lib/sources.js.map +1 -1
- package/lib/types.d.ts +1 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +1 -2
- package/lib/util.d.ts +4 -3
- package/lib/util.d.ts.map +1 -0
- package/lib/util.js +17 -34
- package/lib/util.js.map +1 -1
- package/package.json +18 -20
- package/src/downloader/adjust-concurrency.ts +2 -2
- package/src/downloader/index.ts +8 -8
- package/src/downloader/main.ts +50 -28
- package/src/downloader/multi.ts +11 -10
- package/src/downloader/pipeline-executor-impl.ts +7 -7
- package/src/downloader/single.ts +9 -6
- package/src/downloader/types.ts +3 -3
- package/src/downloader/worker-pool.ts +9 -9
- package/src/downloader/worker-type.ts +3 -3
- package/src/downloader/worker.ts +51 -29
- package/src/index.ts +8 -8
- package/src/io.ts +6 -6
- package/src/life-cycle/adapters.ts +7 -6
- package/src/life-cycle/css-url-parser.d.ts +1 -1
- package/src/life-cycle/default-life-cycle.ts +15 -15
- package/src/life-cycle/detect-resource-type.ts +2 -2
- package/src/life-cycle/download-resource.ts +18 -20
- package/src/life-cycle/download-streaming-resource.ts +20 -18
- package/src/life-cycle/index.ts +15 -15
- package/src/life-cycle/pipeline-executor.ts +6 -6
- package/src/life-cycle/process-css.ts +6 -5
- package/src/life-cycle/process-html-meta.ts +7 -6
- package/src/life-cycle/process-html.ts +21 -13
- package/src/life-cycle/process-site-map.ts +7 -6
- package/src/life-cycle/process-source-map.ts +5 -4
- package/src/life-cycle/process-svg.ts +10 -9
- package/src/life-cycle/read-or-copy-local-resource.ts +9 -7
- package/src/life-cycle/save-html-to-disk.ts +9 -13
- package/src/life-cycle/save-resource-to-disk.ts +6 -6
- package/src/life-cycle/types.ts +7 -7
- package/src/logger/config-logger.ts +5 -3
- package/src/logger/logger-worker.ts +8 -4
- package/src/logger/logger.ts +6 -4
- package/src/options.ts +15 -19
- package/src/resource.ts +10 -5
- package/src/sources.ts +1 -1
- package/src/util.ts +6 -10
- package/tsconfig.json +6 -2
package/lib/sources.js
CHANGED
|
@@ -1,12 +1,9 @@
|
|
|
1
|
-
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.sources = void 0;
|
|
4
|
-
const resource_1 = require("./resource");
|
|
1
|
+
import { ResourceType } from './resource.js';
|
|
5
2
|
// https://github.com/website-scraper/node-website-scraper
|
|
6
3
|
// /blob/66f5113475843ae86f12ea9e5d2ebcfade9f056e/lib/config/defaults.js
|
|
7
|
-
|
|
8
|
-
{ selector: 'style', type:
|
|
9
|
-
{ selector: '[style]', attr: 'style', type:
|
|
4
|
+
export const sources = [
|
|
5
|
+
{ selector: 'style', type: ResourceType.CssInline },
|
|
6
|
+
{ selector: '[style]', attr: 'style', type: ResourceType.CssInline },
|
|
10
7
|
{ selector: 'img', attr: 'src' },
|
|
11
8
|
{ selector: 'img', attr: 'srcset' },
|
|
12
9
|
{ selector: 'input', attr: 'src' },
|
|
@@ -14,7 +11,7 @@ exports.sources = [
|
|
|
14
11
|
{ selector: 'embed', attr: 'src' },
|
|
15
12
|
{ selector: 'param[name="movie"]', attr: 'value' },
|
|
16
13
|
{ selector: 'script', attr: 'src' },
|
|
17
|
-
{ selector: 'link[rel="stylesheet"]', attr: 'href', type:
|
|
14
|
+
{ selector: 'link[rel="stylesheet"]', attr: 'href', type: ResourceType.Css },
|
|
18
15
|
{ selector: 'link[rel*="icon"]', attr: 'href' },
|
|
19
16
|
{ selector: 'link[rel*="preload"]', attr: 'href' },
|
|
20
17
|
// prefetch links not included by default
|
|
@@ -38,9 +35,9 @@ exports.sources = [
|
|
|
38
35
|
{ selector: 'audio', attr: 'src' },
|
|
39
36
|
{ selector: 'audio source', attr: 'src' },
|
|
40
37
|
{ selector: 'audio track', attr: 'src' },
|
|
41
|
-
{ selector: 'frame', attr: 'src', type:
|
|
42
|
-
{ selector: 'iframe', attr: 'src', type:
|
|
43
|
-
{ selector: 'a', attr: 'href', type:
|
|
38
|
+
{ selector: 'frame', attr: 'src', type: ResourceType.Html },
|
|
39
|
+
{ selector: 'iframe', attr: 'src', type: ResourceType.Html },
|
|
40
|
+
{ selector: 'a', attr: 'href', type: ResourceType.Html },
|
|
44
41
|
// https://github.com/website-scraper/node-website-scraper/pull/408
|
|
45
42
|
{ selector: '[background]', attr: 'background' },
|
|
46
43
|
].map((obj) => {
|
|
@@ -48,7 +45,7 @@ exports.sources = [
|
|
|
48
45
|
obj.selector += `[${obj.attr}]`;
|
|
49
46
|
}
|
|
50
47
|
if (!obj.type) {
|
|
51
|
-
obj.type =
|
|
48
|
+
obj.type = ResourceType.Binary;
|
|
52
49
|
}
|
|
53
50
|
return obj;
|
|
54
51
|
});
|
package/lib/sources.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,YAAY,EAAC,MAAM,eAAe,CAAC;AAQ3C,0DAA0D;AAC1D,wEAAwE;AACxE,MAAM,CAAC,MAAM,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAC;IACnC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,YAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,YAAY,CAAC,IAAI,EAAC;IAC1D,EAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC;QAChE,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;IAClC,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QACd,GAAG,CAAC,IAAI,GAAG,YAAY,CAAC,MAAM,CAAC;IACjC,CAAC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
|
package/lib/types.d.ts
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAC,IAAI,EAAC,MAAM,SAAS,CAAC;AAGlC,MAAM,MAAM,aAAa,GAAG,UAAU,CAAC,OAAO,IAAI,CAAC,CAAC;AACpD,MAAM,MAAM,OAAO,GAAG,UAAU,CAAC,aAAa,CAAC,CAAC;AAChD,MAAM,MAAM,uBAAuB,GAAG,WAAW,CAAC,UAAU,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAC9E,MAAM,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC"}
|
package/lib/types.js
CHANGED
package/lib/util.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import type { ResourceBody, ResourceEncoding } from './resource';
|
|
1
|
+
import type { ResourceBody, ResourceEncoding } from './resource.js';
|
|
2
2
|
export declare const sleep: (ms: number) => Promise<void>;
|
|
3
3
|
export declare const escapePath: (str: string) => string;
|
|
4
|
-
export declare const isSiteMap: (url?: string) => boolean |
|
|
4
|
+
export declare const isSiteMap: (url?: string) => boolean | "" | void;
|
|
5
5
|
export declare const arrayToMap: (array: (string | number)[], freeze?: boolean) => Record<string | number, number>;
|
|
6
6
|
export declare const toString: (body: ResourceBody, encoding: ResourceEncoding) => string;
|
|
7
|
-
export declare const importDefaultFromPath: (path: string) =>
|
|
7
|
+
export declare const importDefaultFromPath: <T>(path: string) => Promise<T>;
|
|
8
8
|
export declare const orderUrlSearch: (search: string) => string;
|
|
9
9
|
export declare const simpleHashString: (str: string) => string;
|
|
10
10
|
export declare const hasOwnProperty: (v: PropertyKey) => boolean;
|
|
@@ -19,3 +19,4 @@ export declare const weakAssign: <T, U>(target: T, source: U) => T & U;
|
|
|
19
19
|
* @param url
|
|
20
20
|
*/
|
|
21
21
|
export declare const isUrlHttp: (url: string) => boolean;
|
|
22
|
+
//# sourceMappingURL=util.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"util.d.ts","sourceRoot":"","sources":["../src/util.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAC,YAAY,EAAE,gBAAgB,EAAC,MAAM,eAAe,CAAC;AAIlE,eAAO,MAAM,KAAK,OAAQ,MAAM,KAAG,OAAO,CAAC,IAAI,CACN,CAAC;AAE1C,eAAO,MAAM,UAAU,QAAS,MAAM,KAAG,MACD,CAAC;AAEzC,eAAO,MAAM,SAAS,SAAU,MAAM,KAAG,OAAO,GAAG,EAAE,GAAG,IAEY,CAAC;AAErE,eAAO,MAAM,UAAU,UAAW,CAAC,MAAM,GAAG,MAAM,CAAC,EAAE,WAAW,OAAO,KACrE,MAAM,CAAC,MAAM,GAAG,MAAM,EAAE,MAAM,CAM/B,CAAC;AAEF,eAAO,MAAM,QAAQ,SAAU,YAAY,YAAY,gBAAgB,KAAG,MAezE,CAAC;AAEF,eAAO,MAAM,qBAAqB,GAAI,CAAC,QAAQ,MAAM,KAAG,OAAO,CAAC,CAAC,CAIhE,CAAC;AAEF,eAAO,MAAM,cAAc,WAAY,MAAM,KAAG,MAmB/C,CAAC;AAEF,eAAO,MAAM,gBAAgB,QAAS,MAAM,KAAG,MAQzB,CAAC;AAEvB,eAAO,MAAM,cAAc,6BAAkC,CAAC;AAE9D;;;;GAIG;AACH,eAAO,MAAM,UAAU,GAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,KAAG,CAAC,GAAG,CAW3D,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,SAAS,QAAS,MAAM,KAAG,OACiB,CAAC"}
|
package/lib/util.js
CHANGED
|
@@ -1,25 +1,18 @@
|
|
|
1
|
-
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.isUrlHttp = exports.weakAssign = exports.hasOwnProperty = exports.simpleHashString = exports.orderUrlSearch = exports.importDefaultFromPath = exports.toString = exports.arrayToMap = exports.isSiteMap = exports.escapePath = exports.sleep = void 0;
|
|
4
|
-
const crypto_1 = require("crypto");
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
5
2
|
const forbiddenChar = /[:*?"<>|&]|%3A|%2A|%3F|%22|%3C|%3E|%7C|%26/ig;
|
|
6
|
-
const sleep = (ms) => new Promise(r => setTimeout(r, ms | 0));
|
|
7
|
-
|
|
8
|
-
const
|
|
9
|
-
exports.escapePath = escapePath;
|
|
10
|
-
const isSiteMap = (url) => url &&
|
|
3
|
+
export const sleep = (ms) => new Promise(r => setTimeout(r, ms | 0));
|
|
4
|
+
export const escapePath = (str) => str && str.replace(forbiddenChar, '_');
|
|
5
|
+
export const isSiteMap = (url) => url &&
|
|
11
6
|
url.includes('/sitemaps/') &&
|
|
12
7
|
(url.endsWith('sitemap.xml') || url.endsWith('sitemap_other.xml'));
|
|
13
|
-
|
|
14
|
-
const arrayToMap = (array, freeze) => {
|
|
8
|
+
export const arrayToMap = (array, freeze) => {
|
|
15
9
|
const obj = {};
|
|
16
10
|
for (const item of array) {
|
|
17
11
|
obj[item] = 1;
|
|
18
12
|
}
|
|
19
13
|
return freeze ? Object.freeze(obj) : obj;
|
|
20
14
|
};
|
|
21
|
-
|
|
22
|
-
const toString = (body, encoding) => {
|
|
15
|
+
export const toString = (body, encoding) => {
|
|
23
16
|
let stringValue;
|
|
24
17
|
if (Buffer.isBuffer(body)) {
|
|
25
18
|
stringValue = body.toString(encoding || 'utf8');
|
|
@@ -38,18 +31,12 @@ const toString = (body, encoding) => {
|
|
|
38
31
|
}
|
|
39
32
|
return stringValue;
|
|
40
33
|
};
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
const mod = require(path);
|
|
46
|
-
if (mod && mod.__esModule && mod.default) {
|
|
47
|
-
return mod.default;
|
|
48
|
-
}
|
|
49
|
-
return mod;
|
|
34
|
+
export const importDefaultFromPath = (path) => {
|
|
35
|
+
return import(path).then(mod => {
|
|
36
|
+
return mod.default || mod;
|
|
37
|
+
});
|
|
50
38
|
};
|
|
51
|
-
|
|
52
|
-
const orderUrlSearch = (search) => {
|
|
39
|
+
export const orderUrlSearch = (search) => {
|
|
53
40
|
const parts = (search[0] === '?' ? search.slice(1) : search)
|
|
54
41
|
.split('&');
|
|
55
42
|
const searchKeys = [], searchMap = {};
|
|
@@ -69,8 +56,7 @@ const orderUrlSearch = (search) => {
|
|
|
69
56
|
.map(k => { var _a; return (_a = searchMap[k]) === null || _a === void 0 ? void 0 : _a.map(v => k + '=' + v).join('&'); })
|
|
70
57
|
.join('&');
|
|
71
58
|
};
|
|
72
|
-
|
|
73
|
-
const simpleHashString = (str) => (0, crypto_1.createHash)('sha256')
|
|
59
|
+
export const simpleHashString = (str) => createHash('sha256')
|
|
74
60
|
.update(str)
|
|
75
61
|
.digest()
|
|
76
62
|
.toString('base64')
|
|
@@ -78,32 +64,29 @@ const simpleHashString = (str) => (0, crypto_1.createHash)('sha256')
|
|
|
78
64
|
.replace(/\+/g, '-')
|
|
79
65
|
.replace(/\//g, '_')
|
|
80
66
|
.replace(/=/g, '');
|
|
81
|
-
|
|
82
|
-
exports.hasOwnProperty = Object.prototype.hasOwnProperty;
|
|
67
|
+
export const hasOwnProperty = Object.prototype.hasOwnProperty;
|
|
83
68
|
/**
|
|
84
69
|
* Merge values from source to target only if key not exists in target
|
|
85
70
|
* Note that using this function against incompatible type or null | undefined
|
|
86
71
|
* may lead to typescript parser errors.
|
|
87
72
|
*/
|
|
88
|
-
const weakAssign = (target, source) => {
|
|
73
|
+
export const weakAssign = (target, source) => {
|
|
89
74
|
if (!target)
|
|
90
75
|
return Object.assign({}, source);
|
|
91
76
|
if (!source)
|
|
92
77
|
return target;
|
|
93
78
|
for (const key in source) {
|
|
94
|
-
if (
|
|
95
|
-
!
|
|
79
|
+
if (hasOwnProperty.call(source, key) &&
|
|
80
|
+
!hasOwnProperty.call(target, key)) {
|
|
96
81
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
97
82
|
Reflect.set(target, key, source[key]);
|
|
98
83
|
}
|
|
99
84
|
}
|
|
100
85
|
return target;
|
|
101
86
|
};
|
|
102
|
-
exports.weakAssign = weakAssign;
|
|
103
87
|
/**
|
|
104
88
|
* Test if the given url is http url
|
|
105
89
|
* @param url
|
|
106
90
|
*/
|
|
107
|
-
const isUrlHttp = (url) => url.startsWith('http://') || url.startsWith('https://');
|
|
108
|
-
exports.isUrlHttp = isUrlHttp;
|
|
91
|
+
export const isUrlHttp = (url) => url.startsWith('http://') || url.startsWith('https://');
|
|
109
92
|
//# sourceMappingURL=util.js.map
|
package/lib/util.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"util.js","sourceRoot":"","sources":["../src/util.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"util.js","sourceRoot":"","sources":["../src/util.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,UAAU,EAAC,MAAM,aAAa,CAAC;AAGvC,MAAM,aAAa,GAAG,8CAA8C,CAAC;AAErE,MAAM,CAAC,MAAM,KAAK,GAAG,CAAC,EAAU,EAAiB,EAAE,CACjD,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC;AAE1C,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,GAAW,EAAU,EAAE,CAChD,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;AAEzC,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC,GAAY,EAAuB,EAAE,CAAC,GAAG;IACjE,GAAG,CAAC,QAAQ,CAAC,YAAY,CAAC;IAC1B,CAAC,GAAG,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,mBAAmB,CAAC,CAAC,CAAC;AAErE,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,KAA0B,EAAE,MAAgB,EACrC,EAAE;IAClC,MAAM,GAAG,GAAoC,EAAE,CAAC;IAChD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChB,CAAC;IACD,OAAO,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AAC3C,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,QAAQ,GAAG,CAAC,IAAkB,EAAE,QAA0B,EAAU,EAAE;IACjF,IAAI,WAAmB,CAAC;IACxB,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC1B,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,IAAI,MAAM,CAAC,CAAC;IAClD,CAAC;SAAM,IAAI,WAAW,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC;QACpC,uCAAuC;QACvC,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC;aACrE,QAAQ,CAAC,QAAQ,IAAI,MAAM,CAAC,CAAC;IAClC,CAAC;SAAM,IAAI,IAAI,YAAY,WAAW,EAAE,CAAC;QACvC,uCAAuC;QACvC,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,QAAQ,IAAI,MAAM,CAAC,CAAC;IAC/D,CAAC;SAAM,CAAC;QACN,WAAW,GAAG,IAAI,CAAC;IACrB,CAAC;IACD,OAAO,WAAW,CAAC;AACrB,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAI,IAAY,EAAc,EAAE;IACnE,OAAO,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE;QAC7B,OAAO,GAAG,CAAC,OAAO,IAAI,GAAG,CAAC;IAC5B,CAAC,CAAC,CAAC;AACL,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,MAAc,EAAU,EAAE;IACvD,MAAM,KAAK,GAAa,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;SACnE,KAAK,CAAC,GAAG,CAAC,CAAC;IACd,MAAM,UAAU,GAAa,EAAE,EAC7B,SAAS,GAA6B,EAAE,CAAC;IAC3C,IAAI,WAAqB,EAAE,SAAiB,CAAC;IAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,WAAW,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAClC,IAAI,SAAS,CAAC,SAAS,GAAG,WAAW,CAAC,KAAK,EAAE,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3D,SAAS,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QACnD,CAAC;aAAM,CAAC;YACN,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC3B,SAAS,CAAC,SAAS,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IACD,OAAO,GAAG,GAAG,UAAU;SACpB,IAAI,EAAE;SACN,GAAG,CAAC,CAAC,CAAC,EAAE,WAAC,OAAA,MAAA,SAAS,CAAC,CAAC,CAAC,0CAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAA,EAAA,CAAC;SACvD,IAAI,CAAC,GAAG,CAAC,CAAC;AACf,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,GAAW,EAAU,EAAE,CACtD,UAAU,CAAC,QAAQ,CAAC;KACjB,MAAM,CAAC,GAAG,CAAC;KACX,MAAM,EAAE;KACR,QAAQ,CAAC,QAAQ,CAAC;IACnB,qBAAqB;KACpB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;KACnB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;KACnB,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;AAEvB,MAAM,CAAC,MAAM,cAAc,GAAG,MAAM,CAAC,SAAS,CAAC,cAAc,CAAC;AAE9D;;;;GAIG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAO,MAAS,EAAE,MAAS,EAAS,EAAE;IAC9D,IAAI,CAAC,MAAM;QAAE,OAAO,MAAM,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAU,CAAC;IACvD,IAAI,CAAC,MAAM;QAAE,OAAO,MAAe,CAAC;IACpC,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,IAAI,cAAc,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC;YAClC,CAAC,cAAc,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,CAAC;YACpC,8DAA8D;YAC9D,OAAO,CAAC,GAAG,CAAC,MAAa,EAAE,GAAG,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IACD,OAAO,MAAe,CAAC;AACzB,CAAC,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC,GAAW,EAAW,EAAE,CAChD,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,45 +1,43 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "website-scrap-engine",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.8.0",
|
|
4
4
|
"description": "Configurable website scraper in typescript",
|
|
5
5
|
"main": "lib",
|
|
6
6
|
"types": "lib",
|
|
7
|
+
"type": "module",
|
|
7
8
|
"engines": {
|
|
8
|
-
"node": ">=
|
|
9
|
+
"node": ">=18.17.0"
|
|
9
10
|
},
|
|
10
11
|
"scripts": {
|
|
11
|
-
"clean": "
|
|
12
|
+
"clean": "node -e \"require('fs').rmSync('lib',{force:true,recursive:true})\"",
|
|
12
13
|
"tsc": "tsc",
|
|
13
14
|
"lint": "eslint --fix src test",
|
|
14
|
-
"test": "npm run lint && jest",
|
|
15
|
+
"test": "npm run lint && node --experimental-vm-modules node_modules/jest/bin/jest.js",
|
|
15
16
|
"build": "npm run lint && npm run tsc && npm run copy",
|
|
16
17
|
"copy": "node copy-src.js",
|
|
17
18
|
"prepack": "npm run clean && npm run build",
|
|
18
19
|
"postshrinkwrap": "node package-lock-resolved.js"
|
|
19
20
|
},
|
|
20
|
-
"jest": {
|
|
21
|
-
"preset": "ts-jest"
|
|
22
|
-
},
|
|
23
21
|
"dependencies": {
|
|
24
|
-
"cheerio": "^1.0.0
|
|
25
|
-
"css-url-parser": "^1.1.
|
|
26
|
-
"got": "^
|
|
22
|
+
"cheerio": "^1.0.0",
|
|
23
|
+
"css-url-parser": "^1.1.4",
|
|
24
|
+
"got": "^13.0.0",
|
|
27
25
|
"log4js": "^6.9.1",
|
|
28
26
|
"mkdirp": "^3.0.1",
|
|
29
|
-
"p-queue": "^
|
|
30
|
-
"srcset": "^
|
|
27
|
+
"p-queue": "^8.1.0",
|
|
28
|
+
"srcset": "^5.0.1",
|
|
31
29
|
"urijs": "^1.19.11"
|
|
32
30
|
},
|
|
33
31
|
"devDependencies": {
|
|
34
|
-
"@
|
|
35
|
-
"@types/node": "^22.
|
|
32
|
+
"@jest/globals": "^29.7.0",
|
|
33
|
+
"@types/node": "^22.12.0",
|
|
36
34
|
"@types/urijs": "^1.19.25",
|
|
37
|
-
"@typescript-eslint/eslint-plugin": "^
|
|
38
|
-
"@typescript-eslint/parser": "^
|
|
39
|
-
"eslint": "^
|
|
40
|
-
"jest": "^
|
|
41
|
-
"ts-jest": "^
|
|
42
|
-
"typescript": "^5.
|
|
35
|
+
"@typescript-eslint/eslint-plugin": "^8.22.0",
|
|
36
|
+
"@typescript-eslint/parser": "^8.22.0",
|
|
37
|
+
"eslint": "^9.19.0",
|
|
38
|
+
"jest": "^29.7.0",
|
|
39
|
+
"ts-jest": "^29.2.5",
|
|
40
|
+
"typescript": "^5.7.3"
|
|
43
41
|
},
|
|
44
42
|
"files": [
|
|
45
43
|
".editorconfig",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {adjustConcurrency as logger} from '../logger/logger';
|
|
2
|
-
import type {DownloaderWithMeta} from './types';
|
|
1
|
+
import {adjustConcurrency as logger} from '../logger/logger.js';
|
|
2
|
+
import type {DownloaderWithMeta} from './types.js';
|
|
3
3
|
|
|
4
4
|
export function adjust(downloader: DownloaderWithMeta): void {
|
|
5
5
|
const {meta} = downloader;
|
package/src/downloader/index.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
export {adjust} from './adjust-concurrency';
|
|
2
|
-
export {AbstractDownloader} from './main';
|
|
3
|
-
export {MultiThreadDownloader} from './multi';
|
|
4
|
-
export {PipelineExecutorImpl} from './pipeline-executor-impl';
|
|
5
|
-
export {SingleThreadDownloader} from './single';
|
|
6
|
-
export * as types from './types';
|
|
7
|
-
export {WorkerPool} from './worker-pool';
|
|
8
|
-
export * as workerType from './worker-type';
|
|
1
|
+
export {adjust} from './adjust-concurrency.js';
|
|
2
|
+
export {AbstractDownloader} from './main.js';
|
|
3
|
+
export {MultiThreadDownloader} from './multi.js';
|
|
4
|
+
export {PipelineExecutorImpl} from './pipeline-executor-impl.js';
|
|
5
|
+
export {SingleThreadDownloader} from './single.js';
|
|
6
|
+
export * as types from './types.js';
|
|
7
|
+
export {WorkerPool} from './worker-pool.js';
|
|
8
|
+
export * as workerType from './worker-type.js';
|
package/src/downloader/main.ts
CHANGED
|
@@ -1,26 +1,23 @@
|
|
|
1
1
|
import PQueue from 'p-queue';
|
|
2
2
|
import type {HTTPError} from 'got';
|
|
3
3
|
import URI from 'urijs';
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
} from '../
|
|
9
|
-
import {
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
Resource,
|
|
13
|
-
ResourceType
|
|
14
|
-
} from '../resource';
|
|
15
|
-
import {error, notFound, skip} from '../logger/logger';
|
|
16
|
-
import {importDefaultFromPath} from '../util';
|
|
17
|
-
import type {DownloaderStats, DownloaderWithMeta} from './types';
|
|
18
|
-
import {PipelineExecutorImpl} from './pipeline-executor-impl';
|
|
4
|
+
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
|
|
5
|
+
import {mergeOverrideOptions} from '../options.js';
|
|
6
|
+
import type {RawResource, Resource} from '../resource.js';
|
|
7
|
+
import {normalizeResource, ResourceType} from '../resource.js';
|
|
8
|
+
import {error, notFound, skip} from '../logger/logger.js';
|
|
9
|
+
import {importDefaultFromPath} from '../util.js';
|
|
10
|
+
import type {DownloaderStats, DownloaderWithMeta} from './types.js';
|
|
11
|
+
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
|
|
19
12
|
|
|
20
13
|
export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
21
14
|
readonly queue: PQueue;
|
|
22
|
-
readonly
|
|
23
|
-
readonly
|
|
15
|
+
readonly _asyncOptions: Promise<DownloadOptions>;
|
|
16
|
+
readonly _overrideOptions?: Partial<StaticDownloadOptions> & { pathToWorker?: string };
|
|
17
|
+
_options?: DownloadOptions;
|
|
18
|
+
_isInit: boolean;
|
|
19
|
+
_pipeline?: PipelineExecutorImpl;
|
|
20
|
+
_initOptions: Promise<void>;
|
|
24
21
|
readonly downloadedUrl: Set<string> = new Set<string>();
|
|
25
22
|
readonly queuedUrl: Set<string> = new Set<string>();
|
|
26
23
|
readonly meta: DownloaderStats = {
|
|
@@ -33,10 +30,31 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
33
30
|
|
|
34
31
|
protected constructor(public pathToOptions: string,
|
|
35
32
|
overrideOptions?: Partial<StaticDownloadOptions> & { pathToWorker?: string }) {
|
|
36
|
-
this.
|
|
37
|
-
this.
|
|
38
|
-
this.
|
|
39
|
-
this.
|
|
33
|
+
this._asyncOptions = importDefaultFromPath(pathToOptions);
|
|
34
|
+
this._overrideOptions = overrideOptions;
|
|
35
|
+
this.queue = new PQueue();
|
|
36
|
+
this._isInit = false;
|
|
37
|
+
this._initOptions = this._asyncOptions.then(options => {
|
|
38
|
+
options = mergeOverrideOptions(options, this._overrideOptions);
|
|
39
|
+
this._options = options;
|
|
40
|
+
this._pipeline = new PipelineExecutorImpl(options, options.req, options);
|
|
41
|
+
options.configureLogger(options.localRoot, options.logSubDir || '');
|
|
42
|
+
this._isInit = true;
|
|
43
|
+
});
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
get options(): DownloadOptions {
|
|
47
|
+
if (this._options) {
|
|
48
|
+
return this._options;
|
|
49
|
+
}
|
|
50
|
+
throw new TypeError('AbstractDownloader: not initialized');
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
get pipeline(): PipelineExecutorImpl {
|
|
54
|
+
if (this._pipeline) {
|
|
55
|
+
return this._pipeline;
|
|
56
|
+
}
|
|
57
|
+
throw new TypeError('AbstractDownloader: not initialized');
|
|
40
58
|
}
|
|
41
59
|
|
|
42
60
|
get concurrency(): number {
|
|
@@ -56,20 +74,22 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
56
74
|
}
|
|
57
75
|
|
|
58
76
|
async addInitialResource(urlArr: string[]): Promise<void> {
|
|
59
|
-
await this.
|
|
77
|
+
await this._initOptions;
|
|
78
|
+
const pipeline = this.pipeline;
|
|
79
|
+
await pipeline.init(pipeline, this);
|
|
60
80
|
// noinspection DuplicatedCode
|
|
61
81
|
for (let i = 0, l = urlArr.length; i < l; i++) {
|
|
62
82
|
let url: string | void = urlArr[i];
|
|
63
|
-
url = await
|
|
83
|
+
url = await pipeline.linkRedirect(url, null, null);
|
|
64
84
|
if (!url) continue;
|
|
65
|
-
const type: ResourceType | void = await
|
|
85
|
+
const type: ResourceType | void = await pipeline.detectResourceType(
|
|
66
86
|
url, ResourceType.Html, null, null);
|
|
67
87
|
if (!type) continue;
|
|
68
|
-
let r: Resource | void = await
|
|
88
|
+
let r: Resource | void = await pipeline.createResource(
|
|
69
89
|
type, 0, url, url,
|
|
70
90
|
undefined, undefined, undefined, type);
|
|
71
91
|
if (!r) continue;
|
|
72
|
-
r = await
|
|
92
|
+
r = await pipeline.processBeforeDownload(r, null, null);
|
|
73
93
|
if (!r) continue;
|
|
74
94
|
if (!r.shouldBeDiscardedFromDownload) {
|
|
75
95
|
this.addProcessedResource(r);
|
|
@@ -138,7 +158,9 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
138
158
|
() => this.options.adjustConcurrencyFunc?.(this),
|
|
139
159
|
this.options.adjustConcurrencyPeriod || 60000);
|
|
140
160
|
}
|
|
141
|
-
this.
|
|
161
|
+
this._initOptions.then(() => {
|
|
162
|
+
this.queue.start();
|
|
163
|
+
});
|
|
142
164
|
}
|
|
143
165
|
|
|
144
166
|
stop(): void {
|
|
@@ -155,7 +177,7 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
|
|
|
155
177
|
async dispose(): Promise<void> {
|
|
156
178
|
this.stop();
|
|
157
179
|
this.queue.clear();
|
|
158
|
-
await this.pipeline
|
|
180
|
+
await this.pipeline?.dispose(this.pipeline, this);
|
|
159
181
|
}
|
|
160
182
|
|
|
161
183
|
}
|
package/src/downloader/multi.ts
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
import path from 'path';
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
4
|
-
import type {
|
|
5
|
-
import type {
|
|
6
|
-
import type {
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import type {WorkerFactory} from './worker-pool.js';
|
|
3
|
+
import {WorkerPool} from './worker-pool.js';
|
|
4
|
+
import type {RawResource, Resource} from '../resource.js';
|
|
5
|
+
import type {DownloadWorkerMessage} from './types.js';
|
|
6
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
7
|
+
import type {DownloadResource} from '../life-cycle/types.js';
|
|
8
|
+
import {skip} from '../logger/logger.js';
|
|
9
|
+
import {AbstractDownloader} from './main.js';
|
|
9
10
|
|
|
10
11
|
export interface MultiThreadDownloaderOptions extends StaticDownloadOptions {
|
|
11
12
|
pathToWorker?: string;
|
|
@@ -46,14 +47,14 @@ export class MultiThreadDownloader extends AbstractDownloader {
|
|
|
46
47
|
if (this.options.initialUrl) {
|
|
47
48
|
this.init = this.addInitialResource(this.options.initialUrl);
|
|
48
49
|
} else {
|
|
49
|
-
this.init = this.pipeline.init(this.pipeline, this);
|
|
50
|
+
this.init = this._initOptions.then(() => this.pipeline.init(this.pipeline, this));
|
|
50
51
|
}
|
|
51
52
|
}
|
|
52
53
|
|
|
53
54
|
async downloadAndProcessResource(res: Resource): Promise<boolean | void> {
|
|
54
55
|
let r: DownloadResource | void;
|
|
55
56
|
try {
|
|
56
|
-
r = await this.pipeline
|
|
57
|
+
r = await this.pipeline!.download(res);
|
|
57
58
|
if (!r) {
|
|
58
59
|
skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
|
|
59
60
|
return;
|
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
import type {StaticDownloadOptions} from '../options';
|
|
1
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
2
2
|
import type {
|
|
3
3
|
CreateResourceArgument,
|
|
4
4
|
Resource,
|
|
5
5
|
ResourceEncoding,
|
|
6
6
|
ResourceType
|
|
7
|
-
} from '../resource';
|
|
7
|
+
} from '../resource.js';
|
|
8
8
|
import type {
|
|
9
9
|
DownloadResource,
|
|
10
10
|
ProcessingLifeCycle,
|
|
11
11
|
RequestOptions,
|
|
12
12
|
SubmitResourceFunc
|
|
13
|
-
} from '../life-cycle/types';
|
|
13
|
+
} from '../life-cycle/types.js';
|
|
14
14
|
// noinspection ES6PreferShortImport
|
|
15
|
-
import type {PipelineExecutor} from '../life-cycle/pipeline-executor';
|
|
16
|
-
import type {Cheerio} from '../types';
|
|
17
|
-
import type {DownloaderWithMeta} from './types';
|
|
18
|
-
import type {WorkerInfo} from './worker-pool';
|
|
15
|
+
import type {PipelineExecutor} from '../life-cycle/pipeline-executor.js';
|
|
16
|
+
import type {Cheerio} from '../types.js';
|
|
17
|
+
import type {DownloaderWithMeta} from './types.js';
|
|
18
|
+
import type {WorkerInfo} from './worker-pool.js';
|
|
19
19
|
|
|
20
20
|
/**
|
|
21
21
|
* Pipeline executor
|
package/src/downloader/single.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
-
import {AbstractDownloader} from './main';
|
|
2
|
-
import type {Resource} from '../resource';
|
|
3
|
-
import type {StaticDownloadOptions} from '../options';
|
|
4
|
-
import {skip} from '../logger/logger';
|
|
5
|
-
import type {
|
|
1
|
+
import {AbstractDownloader} from './main.js';
|
|
2
|
+
import type {Resource} from '../resource.js';
|
|
3
|
+
import type {StaticDownloadOptions} from '../options.js';
|
|
4
|
+
import {skip} from '../logger/logger.js';
|
|
5
|
+
import type {
|
|
6
|
+
DownloadResource,
|
|
7
|
+
SubmitResourceFunc
|
|
8
|
+
} from '../life-cycle/types.js';
|
|
6
9
|
|
|
7
10
|
export class SingleThreadDownloader extends AbstractDownloader {
|
|
8
11
|
readonly init: Promise<void>;
|
|
@@ -13,7 +16,7 @@ export class SingleThreadDownloader extends AbstractDownloader {
|
|
|
13
16
|
if (this.options.initialUrl) {
|
|
14
17
|
this.init = this.addInitialResource(this.options.initialUrl);
|
|
15
18
|
} else {
|
|
16
|
-
this.init = this.pipeline.init(this.pipeline, this);
|
|
19
|
+
this.init = this._initOptions.then(() => this.pipeline.init(this.pipeline, this));
|
|
17
20
|
}
|
|
18
21
|
}
|
|
19
22
|
|
package/src/downloader/types.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import type {MessagePort} from 'worker_threads';
|
|
2
|
-
import type {DownloadOptions} from '../options';
|
|
3
|
-
import type {RawResource} from '../resource';
|
|
1
|
+
import type {MessagePort} from 'node:worker_threads';
|
|
2
|
+
import type {DownloadOptions} from '../options.js';
|
|
3
|
+
import type {RawResource} from '../resource.js';
|
|
4
4
|
|
|
5
5
|
export interface DownloaderStats {
|
|
6
6
|
firstPeriodCount: number;
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import type {MessagePort, WorkerOptions} from 'worker_threads';
|
|
2
|
-
import
|
|
3
|
-
import {
|
|
4
|
-
import * as logger from '../logger/logger';
|
|
5
|
-
import type {LogWorkerMessage} from './worker-type';
|
|
6
|
-
import {
|
|
1
|
+
import type {MessagePort, WorkerOptions} from 'node:worker_threads';
|
|
2
|
+
import {Worker} from 'node:worker_threads';
|
|
3
|
+
import type {URL} from 'node:url';
|
|
4
|
+
import * as logger from '../logger/logger.js';
|
|
5
|
+
import type {LogWorkerMessage} from './worker-type.js';
|
|
6
|
+
import type {
|
|
7
7
|
PendingPromise,
|
|
8
8
|
PendingPromiseWithBody,
|
|
9
|
-
WorkerMessage
|
|
10
|
-
|
|
11
|
-
} from './types';
|
|
9
|
+
WorkerMessage
|
|
10
|
+
} from './types.js';
|
|
11
|
+
import {WorkerMessageType} from './types.js';
|
|
12
12
|
|
|
13
13
|
export interface WorkerInfo {
|
|
14
14
|
readonly id: number;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import type {logLevels} from '../logger/logger-worker';
|
|
2
|
-
import type * as logger from '../logger/logger';
|
|
3
|
-
import {WorkerMessage, WorkerMessageType} from './types';
|
|
1
|
+
import type {logLevels} from '../logger/logger-worker.js';
|
|
2
|
+
import type * as logger from '../logger/logger.js';
|
|
3
|
+
import type {WorkerMessage, WorkerMessageType} from './types.js';
|
|
4
4
|
|
|
5
5
|
export interface WorkerLog<T = unknown> {
|
|
6
6
|
logger: keyof typeof logger;
|