website-scrap-engine 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/CHANGELOG.md +167 -0
  2. package/lib/downloader/index.js +5 -1
  3. package/lib/downloader/index.js.map +1 -1
  4. package/lib/downloader/main.d.ts +1 -1
  5. package/lib/downloader/main.js +7 -6
  6. package/lib/downloader/main.js.map +1 -1
  7. package/lib/downloader/pipeline-executor-impl.js +4 -4
  8. package/lib/downloader/pipeline-executor-impl.js.map +1 -1
  9. package/lib/downloader/types.d.ts +1 -1
  10. package/lib/downloader/worker-pool.d.ts +1 -0
  11. package/lib/downloader/worker-pool.js +17 -3
  12. package/lib/downloader/worker-pool.js.map +1 -1
  13. package/lib/downloader/worker.js +5 -4
  14. package/lib/downloader/worker.js.map +1 -1
  15. package/lib/index.js +5 -1
  16. package/lib/index.js.map +1 -1
  17. package/lib/io.js +3 -3
  18. package/lib/io.js.map +1 -1
  19. package/lib/life-cycle/adapters.js +10 -8
  20. package/lib/life-cycle/adapters.js.map +1 -1
  21. package/lib/life-cycle/detect-resource-type.js +2 -2
  22. package/lib/life-cycle/detect-resource-type.js.map +1 -1
  23. package/lib/life-cycle/download-resource.d.ts +6 -1
  24. package/lib/life-cycle/download-resource.js +27 -13
  25. package/lib/life-cycle/download-resource.js.map +1 -1
  26. package/lib/life-cycle/download-streaming-resource.d.ts +1 -1
  27. package/lib/life-cycle/download-streaming-resource.js +12 -7
  28. package/lib/life-cycle/download-streaming-resource.js.map +1 -1
  29. package/lib/life-cycle/index.js +5 -1
  30. package/lib/life-cycle/index.js.map +1 -1
  31. package/lib/life-cycle/process-css.js +2 -2
  32. package/lib/life-cycle/process-css.js.map +1 -1
  33. package/lib/life-cycle/process-html.js +3 -3
  34. package/lib/life-cycle/process-html.js.map +1 -1
  35. package/lib/life-cycle/process-site-map.js +2 -5
  36. package/lib/life-cycle/process-site-map.js.map +1 -1
  37. package/lib/life-cycle/process-source-map.js +2 -2
  38. package/lib/life-cycle/process-source-map.js.map +1 -1
  39. package/lib/life-cycle/process-svg.js +2 -2
  40. package/lib/life-cycle/process-svg.js.map +1 -1
  41. package/lib/life-cycle/read-or-copy-local-resource.js +5 -1
  42. package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
  43. package/lib/life-cycle/save-html-to-disk.js +10 -10
  44. package/lib/life-cycle/save-html-to-disk.js.map +1 -1
  45. package/lib/life-cycle/save-resource-to-disk.js +5 -5
  46. package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
  47. package/lib/life-cycle/types.d.ts +4 -3
  48. package/lib/logger/config-logger.js +6 -2
  49. package/lib/logger/config-logger.js.map +1 -1
  50. package/lib/logger/logger-worker.d.ts +1 -1
  51. package/lib/logger/logger-worker.js +2 -2
  52. package/lib/logger/logger-worker.js.map +1 -1
  53. package/lib/options.d.ts +1 -1
  54. package/lib/options.js +1 -1
  55. package/lib/options.js.map +1 -1
  56. package/lib/resource.d.ts +11 -3
  57. package/lib/resource.js +25 -18
  58. package/lib/resource.js.map +1 -1
  59. package/lib/types.d.ts +5 -5
  60. package/lib/util.d.ts +2 -2
  61. package/lib/util.js +2 -3
  62. package/lib/util.js.map +1 -1
  63. package/package.json +19 -19
  64. package/src/downloader/main.ts +3 -2
  65. package/src/downloader/pipeline-executor-impl.ts +3 -2
  66. package/src/downloader/types.ts +1 -1
  67. package/src/downloader/worker-pool.ts +10 -2
  68. package/src/downloader/worker.ts +2 -1
  69. package/src/io.ts +1 -1
  70. package/src/life-cycle/adapters.ts +8 -3
  71. package/src/life-cycle/download-resource.ts +29 -12
  72. package/src/life-cycle/download-streaming-resource.ts +3 -2
  73. package/src/life-cycle/process-site-map.ts +2 -2
  74. package/src/life-cycle/types.ts +2 -0
  75. package/src/logger/logger-worker.ts +1 -1
  76. package/src/resource.ts +14 -3
  77. package/src/types.ts +3 -3
  78. package/src/util.ts +2 -3
@@ -7,10 +7,11 @@ import got, {
7
7
  } from 'got';
8
8
  import type {Response} from 'got/dist/source/as-promise';
9
9
  import type {DownloadResource, RequestOptions} from './types';
10
- import {Resource, ResourceType} from '../resource';
10
+ import {generateSavePath, Resource, ResourceType} from '../resource';
11
11
  import type {StaticDownloadOptions} from '../options';
12
12
  import * as logger from '../logger/logger';
13
13
  import {isUrlHttp, sleep} from '../util';
14
+ import URI from 'urijs';
14
15
 
15
16
  /** Take logs before retry */
16
17
  export const beforeRetryHook: BeforeRetryHook = (
@@ -33,6 +34,11 @@ export const beforeRetryHook: BeforeRetryHook = (
33
34
  }
34
35
  };
35
36
 
37
+ export interface DownloadError extends Partial<Error> {
38
+ retryLimitExceeded?: boolean;
39
+ code?: string;
40
+ event?: string;
41
+ }
36
42
 
37
43
  /**
38
44
  * workaround for retry premature close on node 12
@@ -46,7 +52,7 @@ export async function getRetry(
46
52
  options: Options
47
53
  ): Promise<Response<Buffer | string> | void> {
48
54
  let res: Response<Buffer | string> | void;
49
- let err: Error | void, optionsClone: Options;
55
+ let err: DownloadError | void, optionsClone: Options;
50
56
  for (let i = 0; i < 25; i++) {
51
57
  err = void 0;
52
58
  try {
@@ -59,22 +65,23 @@ export async function getRetry(
59
65
  }
60
66
  break;
61
67
  } catch (e) {
62
- err = e;
63
- if (e && e.message === 'premature close') {
68
+ // force cast for typescript 4.4
69
+ err = e as DownloadError | void;
70
+ if (err && err.message === 'premature close') {
64
71
  logger.retry.warn(i, url, 'manually retry on premature close',
65
- e.name, e.code, e.event, e.message);
72
+ err.name, err.code, err.event, err.message);
66
73
  await sleep(i * 200);
67
74
  continue;
68
75
  }
69
76
  // these events might be accidentally unhandled
70
- if (e && !e.retryLimitExceeded &&
71
- (e.name === 'RequestError' || e.name === 'TimeoutError') &&
77
+ if (err && !err.retryLimitExceeded &&
78
+ (err.name === 'RequestError' || err.name === 'TimeoutError') &&
72
79
  // RequestError: Cannot read property 'request' of undefined
73
80
  // at Object.exports.default (got\dist\source\core\utils\timed-out.js:56:23)
74
81
  // error.code === undefined
75
- (e.code === 'ETIMEDOUT' || e.code === undefined)) {
76
- logger.retry.warn(i, url, `manually retry on ${e.event} timeout`,
77
- e.name, e.code, e.message);
82
+ (err.code === 'ETIMEDOUT' || err.code === undefined)) {
83
+ logger.retry.warn(i, url, `manually retry on ${err.event} timeout`,
84
+ err.name, err.code, err.message);
78
85
  await sleep(i * 300);
79
86
  continue;
80
87
  }
@@ -91,7 +98,8 @@ export async function getRetry(
91
98
 
92
99
  export async function requestForResource(
93
100
  res: Resource & { downloadStartTimestamp: number },
94
- requestOptions: RequestOptions
101
+ requestOptions: RequestOptions,
102
+ options?: StaticDownloadOptions
95
103
  ): Promise<DownloadResource | Resource | void> {
96
104
  const downloadLink: string = encodeURI(decodeURI(res.downloadLink));
97
105
  const reqOptions: Options = Object.assign({}, requestOptions);
@@ -122,6 +130,15 @@ export async function requestForResource(
122
130
  res.finishTimestamp = Date.now();
123
131
  res.downloadTime = res.finishTimestamp - res.downloadStartTimestamp;
124
132
  res.redirectedUrl = response.url;
133
+ // https://github.com/website-local/website-scrap-engine/issues/385
134
+ // 2011/11/15
135
+ if (res.redirectedUrl !== res.url) {
136
+ res.redirectedSavePath = generateSavePath(
137
+ URI(res.redirectedUrl),
138
+ res.type === ResourceType.Html,
139
+ !options?.deduplicateStripSearch,
140
+ options?.localSrcRoot);
141
+ }
125
142
  res.body = response.body;
126
143
  return res;
127
144
  }
@@ -145,7 +162,7 @@ export async function downloadResource(
145
162
  res.waitTime = res.downloadStartTimestamp - res.createTimestamp;
146
163
  }
147
164
  let downloadedResource: DownloadResource | Resource | void = await requestForResource(
148
- res as (Resource & { downloadStartTimestamp: number }), requestOptions);
165
+ res as (Resource & { downloadStartTimestamp: number }), requestOptions, options);
149
166
  if (!downloadedResource || !downloadedResource.body) {
150
167
  return downloadedResource;
151
168
  }
@@ -53,7 +53,8 @@ export async function streamingDownloadToFile(
53
53
  try {
54
54
  await fs.access(savePath, constants.W_OK);
55
55
  } catch (e) {
56
- if (e?.code === 'ENOENT') {
56
+ // force cast for typescript 4.4
57
+ if (e && (e as {code?: string | void}).code === 'ENOENT') {
57
58
  await mkdirRetry(path.dirname(savePath));
58
59
  } else {
59
60
  throw e;
@@ -303,7 +304,7 @@ export interface StreamingDownloadErrorHook {
303
304
  * @param options
304
305
  * @param pipeline
305
306
  */
306
- (e: Error, res: Resource, requestOptions: RequestOptions,
307
+ (e: Error | unknown, res: Resource, requestOptions: RequestOptions,
307
308
  options: StaticDownloadOptions,
308
309
  pipeline: PipelineExecutor): AsyncResult<void>;
309
310
  }
@@ -1,4 +1,4 @@
1
- import cheerio from 'cheerio';
1
+ import {load} from 'cheerio';
2
2
  import type {DownloadResource, SubmitResourceFunc} from './types';
3
3
  import type {StaticDownloadOptions} from '../options';
4
4
  import {Resource, ResourceType} from '../resource';
@@ -14,7 +14,7 @@ export async function processSiteMap(
14
14
  if (res.type !== ResourceType.SiteMap) {
15
15
  return res;
16
16
  }
17
- const $: CheerioStatic = cheerio.load(toString(res.body,
17
+ const $: CheerioStatic = load(toString(res.body,
18
18
  res.encoding || options.encoding[ResourceType.SiteMap] || 'utf8'));
19
19
  const urlSet: Set<string> = new Set();
20
20
  const depth: number = res.depth + 1;
@@ -1,6 +1,7 @@
1
1
  import type {Options as GotOptions} from 'got/dist/source/as-promise';
2
2
  import type {
3
3
  createResource,
4
+ GenerateSavePathFn,
4
5
  Resource,
5
6
  ResourceBody,
6
7
  ResourceType
@@ -176,6 +177,7 @@ export interface ProcessingLifeCycle {
176
177
  init: InitLifeCycleFunc[];
177
178
  linkRedirect: LinkRedirectFunc[];
178
179
  detectResourceType: DetectResourceTypeFunc[];
180
+ generateSavePath?: GenerateSavePathFn | void;
179
181
  createResource: typeof createResource;
180
182
  /**
181
183
  * link in parent resource would be replaced after this
@@ -4,7 +4,7 @@ import type {LogWorkerMessage, WorkerLog} from '../downloader/worker-type';
4
4
  import {WorkerMessageType} from '../downloader/types';
5
5
 
6
6
  export const logLevels = [
7
- 'log', 'trace', 'debug', 'info', 'warn', 'error', 'fatal', 'mark'
7
+ 'trace', 'debug', 'info', 'warn', 'error', 'fatal', 'mark'
8
8
  ] as const;
9
9
 
10
10
  export function getWorkerLogger(category: WorkerLog['logger']): Logger {
package/src/resource.ts CHANGED
@@ -285,6 +285,10 @@ export interface CreateResourceArgument {
285
285
  * https://github.com/website-local/website-scrap-engine/issues/107
286
286
  */
287
287
  skipReplacePathError?: boolean;
288
+ /**
289
+ * Set this to use a custom implementation of {@link generateSavePath}
290
+ */
291
+ generateSavePathFn?: GenerateSavePathFn | void;
288
292
  }
289
293
 
290
294
  /**
@@ -293,6 +297,7 @@ export interface CreateResourceArgument {
293
297
  * @param isHtml should the savePath endsWith .html
294
298
  * @param keepSearch keep url search params in file name
295
299
  * @param localSrcRoot local source path to download from
300
+ * @return string must return non-empty string
296
301
  */
297
302
  export function generateSavePath(
298
303
  uri: URI,
@@ -361,6 +366,8 @@ export function generateSavePath(
361
366
  return savePath;
362
367
  }
363
368
 
369
+ export type GenerateSavePathFn = typeof generateSavePath;
370
+
364
371
  export const urlOfSavePath = (savePath: string): string => {
365
372
  if (savePath.includes('\\')) {
366
373
  return `file:///${savePath.replace(/\\/g, '/')}`;
@@ -486,6 +493,7 @@ export function resolveFileUrl(
486
493
  * @param encoding {@link CreateResourceArgument.encoding}
487
494
  * @param keepSearch {@link CreateResourceArgument.keepSearch}
488
495
  * @param skipReplacePathError {@link CreateResourceArgument.skipReplacePathError}
496
+ * @param generateSavePathFn {@link CreateResourceArgument.generateSavePathFn}
489
497
  * @return the resource
490
498
  */
491
499
  export function createResource({
@@ -499,7 +507,8 @@ export function createResource({
499
507
  localSrcRoot,
500
508
  encoding,
501
509
  keepSearch,
502
- skipReplacePathError
510
+ skipReplacePathError,
511
+ generateSavePathFn
503
512
  }: CreateResourceArgument): Resource {
504
513
  const rawUrl: string = url;
505
514
  const refUri: URI = URI(refUrl);
@@ -541,11 +550,13 @@ export function createResource({
541
550
  downloadLink = uri.clone().hash('').toString();
542
551
  }
543
552
 
553
+ const implGenerateSavePath = generateSavePathFn || generateSavePath;
554
+
544
555
  // make savePath and replaceUri
545
- const savePath = replacePathHasError ? rawUrl : generateSavePath(
556
+ const savePath = replacePathHasError ? rawUrl : implGenerateSavePath(
546
557
  uri, type === ResourceType.Html, keepSearch, localSrcRoot);
547
558
  if (!refSavePath) {
548
- refSavePath = generateSavePath(refUri, refType === ResourceType.Html,
559
+ refSavePath = implGenerateSavePath(refUri, refType === ResourceType.Html,
549
560
  false, localSrcRoot);
550
561
  }
551
562
  const replaceUri = replacePathHasError ? URI(rawUrl) :
package/src/types.ts CHANGED
@@ -1,7 +1,7 @@
1
- import type cheerio from 'cheerio';
1
+ import type {load} from 'cheerio';
2
2
 
3
3
  // adapters for making cheerio's namespace type definitions to module
4
- export type CheerioStatic = ReturnType<typeof cheerio.load>;
4
+ export type CheerioStatic = ReturnType<typeof load>;
5
5
  export type Cheerio = ReturnType<CheerioStatic>;
6
- export type CheerioOptionsInterface = NonNullable<Parameters<typeof cheerio.load>[1]>;
6
+ export type CheerioOptionsInterface = NonNullable<Parameters<typeof load>[1]>;
7
7
  export type CheerioElement = Cheerio[number];
package/src/util.ts CHANGED
@@ -88,10 +88,9 @@ export const hasOwnProperty = Object.prototype.hasOwnProperty;
88
88
  * may lead to typescript parser errors.
89
89
  */
90
90
  export const weakAssign = <T, U>(target: T, source: U): T & U => {
91
- if (!target) return Object.assign({} as T, source);
91
+ if (!target) return Object.assign({}, source) as T & U;
92
92
  if (!source) return target as T & U;
93
- let key: keyof U;
94
- for (key in source) {
93
+ for (const key in source) {
95
94
  if (hasOwnProperty.call(source, key) &&
96
95
  !hasOwnProperty.call(target, key)) {
97
96
  // eslint-disable-next-line @typescript-eslint/no-explicit-any