website-scrap-engine 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +167 -0
- package/lib/downloader/index.js +5 -1
- package/lib/downloader/index.js.map +1 -1
- package/lib/downloader/main.d.ts +1 -1
- package/lib/downloader/main.js +7 -6
- package/lib/downloader/main.js.map +1 -1
- package/lib/downloader/pipeline-executor-impl.js +4 -4
- package/lib/downloader/pipeline-executor-impl.js.map +1 -1
- package/lib/downloader/types.d.ts +1 -1
- package/lib/downloader/worker-pool.d.ts +1 -0
- package/lib/downloader/worker-pool.js +17 -3
- package/lib/downloader/worker-pool.js.map +1 -1
- package/lib/downloader/worker.js +5 -4
- package/lib/downloader/worker.js.map +1 -1
- package/lib/index.js +5 -1
- package/lib/index.js.map +1 -1
- package/lib/io.js +3 -3
- package/lib/io.js.map +1 -1
- package/lib/life-cycle/adapters.js +10 -8
- package/lib/life-cycle/adapters.js.map +1 -1
- package/lib/life-cycle/detect-resource-type.js +2 -2
- package/lib/life-cycle/detect-resource-type.js.map +1 -1
- package/lib/life-cycle/download-resource.d.ts +6 -1
- package/lib/life-cycle/download-resource.js +27 -13
- package/lib/life-cycle/download-resource.js.map +1 -1
- package/lib/life-cycle/download-streaming-resource.d.ts +1 -1
- package/lib/life-cycle/download-streaming-resource.js +12 -7
- package/lib/life-cycle/download-streaming-resource.js.map +1 -1
- package/lib/life-cycle/index.js +5 -1
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/process-css.js +2 -2
- package/lib/life-cycle/process-css.js.map +1 -1
- package/lib/life-cycle/process-html.js +3 -3
- package/lib/life-cycle/process-html.js.map +1 -1
- package/lib/life-cycle/process-site-map.js +2 -5
- package/lib/life-cycle/process-site-map.js.map +1 -1
- package/lib/life-cycle/process-source-map.js +2 -2
- package/lib/life-cycle/process-source-map.js.map +1 -1
- package/lib/life-cycle/process-svg.js +2 -2
- package/lib/life-cycle/process-svg.js.map +1 -1
- package/lib/life-cycle/read-or-copy-local-resource.js +5 -1
- package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
- package/lib/life-cycle/save-html-to-disk.js +10 -10
- package/lib/life-cycle/save-html-to-disk.js.map +1 -1
- package/lib/life-cycle/save-resource-to-disk.js +5 -5
- package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
- package/lib/life-cycle/types.d.ts +4 -3
- package/lib/logger/config-logger.js +6 -2
- package/lib/logger/config-logger.js.map +1 -1
- package/lib/logger/logger-worker.d.ts +1 -1
- package/lib/logger/logger-worker.js +2 -2
- package/lib/logger/logger-worker.js.map +1 -1
- package/lib/options.d.ts +1 -1
- package/lib/options.js +1 -1
- package/lib/options.js.map +1 -1
- package/lib/resource.d.ts +11 -3
- package/lib/resource.js +25 -18
- package/lib/resource.js.map +1 -1
- package/lib/types.d.ts +5 -5
- package/lib/util.d.ts +2 -2
- package/lib/util.js +2 -3
- package/lib/util.js.map +1 -1
- package/package.json +19 -19
- package/src/downloader/main.ts +3 -2
- package/src/downloader/pipeline-executor-impl.ts +3 -2
- package/src/downloader/types.ts +1 -1
- package/src/downloader/worker-pool.ts +10 -2
- package/src/downloader/worker.ts +2 -1
- package/src/io.ts +1 -1
- package/src/life-cycle/adapters.ts +8 -3
- package/src/life-cycle/download-resource.ts +29 -12
- package/src/life-cycle/download-streaming-resource.ts +3 -2
- package/src/life-cycle/process-site-map.ts +2 -2
- package/src/life-cycle/types.ts +2 -0
- package/src/logger/logger-worker.ts +1 -1
- package/src/resource.ts +14 -3
- package/src/types.ts +3 -3
- package/src/util.ts +2 -3
|
@@ -7,10 +7,11 @@ import got, {
|
|
|
7
7
|
} from 'got';
|
|
8
8
|
import type {Response} from 'got/dist/source/as-promise';
|
|
9
9
|
import type {DownloadResource, RequestOptions} from './types';
|
|
10
|
-
import {Resource, ResourceType} from '../resource';
|
|
10
|
+
import {generateSavePath, Resource, ResourceType} from '../resource';
|
|
11
11
|
import type {StaticDownloadOptions} from '../options';
|
|
12
12
|
import * as logger from '../logger/logger';
|
|
13
13
|
import {isUrlHttp, sleep} from '../util';
|
|
14
|
+
import URI from 'urijs';
|
|
14
15
|
|
|
15
16
|
/** Take logs before retry */
|
|
16
17
|
export const beforeRetryHook: BeforeRetryHook = (
|
|
@@ -33,6 +34,11 @@ export const beforeRetryHook: BeforeRetryHook = (
|
|
|
33
34
|
}
|
|
34
35
|
};
|
|
35
36
|
|
|
37
|
+
export interface DownloadError extends Partial<Error> {
|
|
38
|
+
retryLimitExceeded?: boolean;
|
|
39
|
+
code?: string;
|
|
40
|
+
event?: string;
|
|
41
|
+
}
|
|
36
42
|
|
|
37
43
|
/**
|
|
38
44
|
* workaround for retry premature close on node 12
|
|
@@ -46,7 +52,7 @@ export async function getRetry(
|
|
|
46
52
|
options: Options
|
|
47
53
|
): Promise<Response<Buffer | string> | void> {
|
|
48
54
|
let res: Response<Buffer | string> | void;
|
|
49
|
-
let err:
|
|
55
|
+
let err: DownloadError | void, optionsClone: Options;
|
|
50
56
|
for (let i = 0; i < 25; i++) {
|
|
51
57
|
err = void 0;
|
|
52
58
|
try {
|
|
@@ -59,22 +65,23 @@ export async function getRetry(
|
|
|
59
65
|
}
|
|
60
66
|
break;
|
|
61
67
|
} catch (e) {
|
|
62
|
-
|
|
63
|
-
|
|
68
|
+
// force cast for typescript 4.4
|
|
69
|
+
err = e as DownloadError | void;
|
|
70
|
+
if (err && err.message === 'premature close') {
|
|
64
71
|
logger.retry.warn(i, url, 'manually retry on premature close',
|
|
65
|
-
|
|
72
|
+
err.name, err.code, err.event, err.message);
|
|
66
73
|
await sleep(i * 200);
|
|
67
74
|
continue;
|
|
68
75
|
}
|
|
69
76
|
// these events might be accidentally unhandled
|
|
70
|
-
if (
|
|
71
|
-
(
|
|
77
|
+
if (err && !err.retryLimitExceeded &&
|
|
78
|
+
(err.name === 'RequestError' || err.name === 'TimeoutError') &&
|
|
72
79
|
// RequestError: Cannot read property 'request' of undefined
|
|
73
80
|
// at Object.exports.default (got\dist\source\core\utils\timed-out.js:56:23)
|
|
74
81
|
// error.code === undefined
|
|
75
|
-
(
|
|
76
|
-
logger.retry.warn(i, url, `manually retry on ${
|
|
77
|
-
|
|
82
|
+
(err.code === 'ETIMEDOUT' || err.code === undefined)) {
|
|
83
|
+
logger.retry.warn(i, url, `manually retry on ${err.event} timeout`,
|
|
84
|
+
err.name, err.code, err.message);
|
|
78
85
|
await sleep(i * 300);
|
|
79
86
|
continue;
|
|
80
87
|
}
|
|
@@ -91,7 +98,8 @@ export async function getRetry(
|
|
|
91
98
|
|
|
92
99
|
export async function requestForResource(
|
|
93
100
|
res: Resource & { downloadStartTimestamp: number },
|
|
94
|
-
requestOptions: RequestOptions
|
|
101
|
+
requestOptions: RequestOptions,
|
|
102
|
+
options?: StaticDownloadOptions
|
|
95
103
|
): Promise<DownloadResource | Resource | void> {
|
|
96
104
|
const downloadLink: string = encodeURI(decodeURI(res.downloadLink));
|
|
97
105
|
const reqOptions: Options = Object.assign({}, requestOptions);
|
|
@@ -122,6 +130,15 @@ export async function requestForResource(
|
|
|
122
130
|
res.finishTimestamp = Date.now();
|
|
123
131
|
res.downloadTime = res.finishTimestamp - res.downloadStartTimestamp;
|
|
124
132
|
res.redirectedUrl = response.url;
|
|
133
|
+
// https://github.com/website-local/website-scrap-engine/issues/385
|
|
134
|
+
// 2011/11/15
|
|
135
|
+
if (res.redirectedUrl !== res.url) {
|
|
136
|
+
res.redirectedSavePath = generateSavePath(
|
|
137
|
+
URI(res.redirectedUrl),
|
|
138
|
+
res.type === ResourceType.Html,
|
|
139
|
+
!options?.deduplicateStripSearch,
|
|
140
|
+
options?.localSrcRoot);
|
|
141
|
+
}
|
|
125
142
|
res.body = response.body;
|
|
126
143
|
return res;
|
|
127
144
|
}
|
|
@@ -145,7 +162,7 @@ export async function downloadResource(
|
|
|
145
162
|
res.waitTime = res.downloadStartTimestamp - res.createTimestamp;
|
|
146
163
|
}
|
|
147
164
|
let downloadedResource: DownloadResource | Resource | void = await requestForResource(
|
|
148
|
-
res as (Resource & { downloadStartTimestamp: number }), requestOptions);
|
|
165
|
+
res as (Resource & { downloadStartTimestamp: number }), requestOptions, options);
|
|
149
166
|
if (!downloadedResource || !downloadedResource.body) {
|
|
150
167
|
return downloadedResource;
|
|
151
168
|
}
|
|
@@ -53,7 +53,8 @@ export async function streamingDownloadToFile(
|
|
|
53
53
|
try {
|
|
54
54
|
await fs.access(savePath, constants.W_OK);
|
|
55
55
|
} catch (e) {
|
|
56
|
-
|
|
56
|
+
// force cast for typescript 4.4
|
|
57
|
+
if (e && (e as {code?: string | void}).code === 'ENOENT') {
|
|
57
58
|
await mkdirRetry(path.dirname(savePath));
|
|
58
59
|
} else {
|
|
59
60
|
throw e;
|
|
@@ -303,7 +304,7 @@ export interface StreamingDownloadErrorHook {
|
|
|
303
304
|
* @param options
|
|
304
305
|
* @param pipeline
|
|
305
306
|
*/
|
|
306
|
-
(e: Error, res: Resource, requestOptions: RequestOptions,
|
|
307
|
+
(e: Error | unknown, res: Resource, requestOptions: RequestOptions,
|
|
307
308
|
options: StaticDownloadOptions,
|
|
308
309
|
pipeline: PipelineExecutor): AsyncResult<void>;
|
|
309
310
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import
|
|
1
|
+
import {load} from 'cheerio';
|
|
2
2
|
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
3
3
|
import type {StaticDownloadOptions} from '../options';
|
|
4
4
|
import {Resource, ResourceType} from '../resource';
|
|
@@ -14,7 +14,7 @@ export async function processSiteMap(
|
|
|
14
14
|
if (res.type !== ResourceType.SiteMap) {
|
|
15
15
|
return res;
|
|
16
16
|
}
|
|
17
|
-
const $: CheerioStatic =
|
|
17
|
+
const $: CheerioStatic = load(toString(res.body,
|
|
18
18
|
res.encoding || options.encoding[ResourceType.SiteMap] || 'utf8'));
|
|
19
19
|
const urlSet: Set<string> = new Set();
|
|
20
20
|
const depth: number = res.depth + 1;
|
package/src/life-cycle/types.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type {Options as GotOptions} from 'got/dist/source/as-promise';
|
|
2
2
|
import type {
|
|
3
3
|
createResource,
|
|
4
|
+
GenerateSavePathFn,
|
|
4
5
|
Resource,
|
|
5
6
|
ResourceBody,
|
|
6
7
|
ResourceType
|
|
@@ -176,6 +177,7 @@ export interface ProcessingLifeCycle {
|
|
|
176
177
|
init: InitLifeCycleFunc[];
|
|
177
178
|
linkRedirect: LinkRedirectFunc[];
|
|
178
179
|
detectResourceType: DetectResourceTypeFunc[];
|
|
180
|
+
generateSavePath?: GenerateSavePathFn | void;
|
|
179
181
|
createResource: typeof createResource;
|
|
180
182
|
/**
|
|
181
183
|
* link in parent resource would be replaced after this
|
|
@@ -4,7 +4,7 @@ import type {LogWorkerMessage, WorkerLog} from '../downloader/worker-type';
|
|
|
4
4
|
import {WorkerMessageType} from '../downloader/types';
|
|
5
5
|
|
|
6
6
|
export const logLevels = [
|
|
7
|
-
'
|
|
7
|
+
'trace', 'debug', 'info', 'warn', 'error', 'fatal', 'mark'
|
|
8
8
|
] as const;
|
|
9
9
|
|
|
10
10
|
export function getWorkerLogger(category: WorkerLog['logger']): Logger {
|
package/src/resource.ts
CHANGED
|
@@ -285,6 +285,10 @@ export interface CreateResourceArgument {
|
|
|
285
285
|
* https://github.com/website-local/website-scrap-engine/issues/107
|
|
286
286
|
*/
|
|
287
287
|
skipReplacePathError?: boolean;
|
|
288
|
+
/**
|
|
289
|
+
* Set this to use a custom implementation of {@link generateSavePath}
|
|
290
|
+
*/
|
|
291
|
+
generateSavePathFn?: GenerateSavePathFn | void;
|
|
288
292
|
}
|
|
289
293
|
|
|
290
294
|
/**
|
|
@@ -293,6 +297,7 @@ export interface CreateResourceArgument {
|
|
|
293
297
|
* @param isHtml should the savePath endsWith .html
|
|
294
298
|
* @param keepSearch keep url search params in file name
|
|
295
299
|
* @param localSrcRoot local source path to download from
|
|
300
|
+
* @return string must return non-empty string
|
|
296
301
|
*/
|
|
297
302
|
export function generateSavePath(
|
|
298
303
|
uri: URI,
|
|
@@ -361,6 +366,8 @@ export function generateSavePath(
|
|
|
361
366
|
return savePath;
|
|
362
367
|
}
|
|
363
368
|
|
|
369
|
+
export type GenerateSavePathFn = typeof generateSavePath;
|
|
370
|
+
|
|
364
371
|
export const urlOfSavePath = (savePath: string): string => {
|
|
365
372
|
if (savePath.includes('\\')) {
|
|
366
373
|
return `file:///${savePath.replace(/\\/g, '/')}`;
|
|
@@ -486,6 +493,7 @@ export function resolveFileUrl(
|
|
|
486
493
|
* @param encoding {@link CreateResourceArgument.encoding}
|
|
487
494
|
* @param keepSearch {@link CreateResourceArgument.keepSearch}
|
|
488
495
|
* @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
|
|
496
|
+
* @param generateSavePathFn {@link CreateResourceArgument.generateSavePathFn}
|
|
489
497
|
* @return the resource
|
|
490
498
|
*/
|
|
491
499
|
export function createResource({
|
|
@@ -499,7 +507,8 @@ export function createResource({
|
|
|
499
507
|
localSrcRoot,
|
|
500
508
|
encoding,
|
|
501
509
|
keepSearch,
|
|
502
|
-
skipReplacePathError
|
|
510
|
+
skipReplacePathError,
|
|
511
|
+
generateSavePathFn
|
|
503
512
|
}: CreateResourceArgument): Resource {
|
|
504
513
|
const rawUrl: string = url;
|
|
505
514
|
const refUri: URI = URI(refUrl);
|
|
@@ -541,11 +550,13 @@ export function createResource({
|
|
|
541
550
|
downloadLink = uri.clone().hash('').toString();
|
|
542
551
|
}
|
|
543
552
|
|
|
553
|
+
const implGenerateSavePath = generateSavePathFn || generateSavePath;
|
|
554
|
+
|
|
544
555
|
// make savePath and replaceUri
|
|
545
|
-
const savePath = replacePathHasError ? rawUrl :
|
|
556
|
+
const savePath = replacePathHasError ? rawUrl : implGenerateSavePath(
|
|
546
557
|
uri, type === ResourceType.Html, keepSearch, localSrcRoot);
|
|
547
558
|
if (!refSavePath) {
|
|
548
|
-
refSavePath =
|
|
559
|
+
refSavePath = implGenerateSavePath(refUri, refType === ResourceType.Html,
|
|
549
560
|
false, localSrcRoot);
|
|
550
561
|
}
|
|
551
562
|
const replaceUri = replacePathHasError ? URI(rawUrl) :
|
package/src/types.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import type {load} from 'cheerio';
|
|
2
2
|
|
|
3
3
|
// adapters for making cheerio's namespace type definitions to module
|
|
4
|
-
export type CheerioStatic = ReturnType<typeof
|
|
4
|
+
export type CheerioStatic = ReturnType<typeof load>;
|
|
5
5
|
export type Cheerio = ReturnType<CheerioStatic>;
|
|
6
|
-
export type CheerioOptionsInterface = NonNullable<Parameters<typeof
|
|
6
|
+
export type CheerioOptionsInterface = NonNullable<Parameters<typeof load>[1]>;
|
|
7
7
|
export type CheerioElement = Cheerio[number];
|
package/src/util.ts
CHANGED
|
@@ -88,10 +88,9 @@ export const hasOwnProperty = Object.prototype.hasOwnProperty;
|
|
|
88
88
|
* may lead to typescript parser errors.
|
|
89
89
|
*/
|
|
90
90
|
export const weakAssign = <T, U>(target: T, source: U): T & U => {
|
|
91
|
-
if (!target) return Object.assign({} as T
|
|
91
|
+
if (!target) return Object.assign({}, source) as T & U;
|
|
92
92
|
if (!source) return target as T & U;
|
|
93
|
-
|
|
94
|
-
for (key in source) {
|
|
93
|
+
for (const key in source) {
|
|
95
94
|
if (hasOwnProperty.call(source, key) &&
|
|
96
95
|
!hasOwnProperty.call(target, key)) {
|
|
97
96
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|