website-scrap-engine 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. package/lib/downloader/adjust-concurrency.d.ts +2 -1
  2. package/lib/downloader/adjust-concurrency.d.ts.map +1 -0
  3. package/lib/downloader/adjust-concurrency.js +4 -8
  4. package/lib/downloader/adjust-concurrency.js.map +1 -1
  5. package/lib/downloader/index.d.ts +9 -8
  6. package/lib/downloader/index.d.ts.map +1 -0
  7. package/lib/downloader/index.js +8 -40
  8. package/lib/downloader/index.js.map +1 -1
  9. package/lib/downloader/main.d.ts +15 -6
  10. package/lib/downloader/main.d.ts.map +1 -0
  11. package/lib/downloader/main.js +49 -32
  12. package/lib/downloader/main.js.map +1 -1
  13. package/lib/downloader/multi.d.ts +7 -5
  14. package/lib/downloader/multi.d.ts.map +1 -0
  15. package/lib/downloader/multi.js +10 -17
  16. package/lib/downloader/multi.js.map +1 -1
  17. package/lib/downloader/pipeline-executor-impl.d.ts +8 -7
  18. package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -0
  19. package/lib/downloader/pipeline-executor-impl.js +1 -5
  20. package/lib/downloader/pipeline-executor-impl.js.map +1 -1
  21. package/lib/downloader/single.d.ts +4 -3
  22. package/lib/downloader/single.d.ts.map +1 -0
  23. package/lib/downloader/single.js +7 -11
  24. package/lib/downloader/single.js.map +1 -1
  25. package/lib/downloader/types.d.ts +4 -4
  26. package/lib/downloader/types.d.ts.map +1 -0
  27. package/lib/downloader/types.js +2 -5
  28. package/lib/downloader/types.js.map +1 -1
  29. package/lib/downloader/worker-pool.d.ts +6 -7
  30. package/lib/downloader/worker-pool.d.ts.map +1 -0
  31. package/lib/downloader/worker-pool.js +7 -35
  32. package/lib/downloader/worker-pool.js.map +1 -1
  33. package/lib/downloader/worker-type.d.ts +4 -3
  34. package/lib/downloader/worker-type.d.ts.map +1 -0
  35. package/lib/downloader/worker-type.js +1 -2
  36. package/lib/downloader/worker.d.ts +1 -0
  37. package/lib/downloader/worker.d.ts.map +1 -0
  38. package/lib/downloader/worker.js +52 -27
  39. package/lib/downloader/worker.js.map +1 -1
  40. package/lib/index.d.ts +9 -8
  41. package/lib/index.d.ts.map +1 -0
  42. package/lib/index.js +7 -33
  43. package/lib/index.js.map +1 -1
  44. package/lib/io.d.ts +2 -1
  45. package/lib/io.d.ts.map +1 -0
  46. package/lib/io.js +17 -25
  47. package/lib/io.js.map +1 -1
  48. package/lib/life-cycle/adapters.d.ts +7 -5
  49. package/lib/life-cycle/adapters.d.ts.map +1 -0
  50. package/lib/life-cycle/adapters.js +18 -30
  51. package/lib/life-cycle/adapters.js.map +1 -1
  52. package/lib/life-cycle/default-life-cycle.d.ts +2 -1
  53. package/lib/life-cycle/default-life-cycle.d.ts.map +1 -0
  54. package/lib/life-cycle/default-life-cycle.js +28 -32
  55. package/lib/life-cycle/default-life-cycle.js.map +1 -1
  56. package/lib/life-cycle/detect-resource-type.d.ts +2 -1
  57. package/lib/life-cycle/detect-resource-type.d.ts.map +1 -0
  58. package/lib/life-cycle/detect-resource-type.js +12 -17
  59. package/lib/life-cycle/detect-resource-type.js.map +1 -1
  60. package/lib/life-cycle/download-resource.d.ts +6 -7
  61. package/lib/life-cycle/download-resource.d.ts.map +1 -0
  62. package/lib/life-cycle/download-resource.js +23 -52
  63. package/lib/life-cycle/download-resource.js.map +1 -1
  64. package/lib/life-cycle/download-streaming-resource.d.ts +6 -5
  65. package/lib/life-cycle/download-streaming-resource.d.ts.map +1 -0
  66. package/lib/life-cycle/download-streaming-resource.js +39 -74
  67. package/lib/life-cycle/download-streaming-resource.js.map +1 -1
  68. package/lib/life-cycle/index.d.ts +16 -15
  69. package/lib/life-cycle/index.d.ts.map +1 -0
  70. package/lib/life-cycle/index.js +14 -59
  71. package/lib/life-cycle/index.js.map +1 -1
  72. package/lib/life-cycle/pipeline-executor.d.ts +7 -6
  73. package/lib/life-cycle/pipeline-executor.d.ts.map +1 -0
  74. package/lib/life-cycle/pipeline-executor.js +1 -2
  75. package/lib/life-cycle/process-css.d.ts +5 -4
  76. package/lib/life-cycle/process-css.d.ts.map +1 -0
  77. package/lib/life-cycle/process-css.js +10 -18
  78. package/lib/life-cycle/process-css.js.map +1 -1
  79. package/lib/life-cycle/process-html-meta.d.ts +4 -3
  80. package/lib/life-cycle/process-html-meta.d.ts.map +1 -0
  81. package/lib/life-cycle/process-html-meta.js +11 -15
  82. package/lib/life-cycle/process-html-meta.js.map +1 -1
  83. package/lib/life-cycle/process-html.d.ts +4 -3
  84. package/lib/life-cycle/process-html.d.ts.map +1 -0
  85. package/lib/life-cycle/process-html.js +27 -31
  86. package/lib/life-cycle/process-html.js.map +1 -1
  87. package/lib/life-cycle/process-site-map.d.ts +4 -3
  88. package/lib/life-cycle/process-site-map.d.ts.map +1 -0
  89. package/lib/life-cycle/process-site-map.js +7 -11
  90. package/lib/life-cycle/process-site-map.js.map +1 -1
  91. package/lib/life-cycle/process-source-map.d.ts +4 -4
  92. package/lib/life-cycle/process-source-map.d.ts.map +1 -0
  93. package/lib/life-cycle/process-source-map.js +16 -21
  94. package/lib/life-cycle/process-source-map.js.map +1 -1
  95. package/lib/life-cycle/process-svg.d.ts +4 -3
  96. package/lib/life-cycle/process-svg.d.ts.map +1 -0
  97. package/lib/life-cycle/process-svg.js +17 -21
  98. package/lib/life-cycle/process-svg.js.map +1 -1
  99. package/lib/life-cycle/read-or-copy-local-resource.d.ts +4 -3
  100. package/lib/life-cycle/read-or-copy-local-resource.d.ts.map +1 -0
  101. package/lib/life-cycle/read-or-copy-local-resource.js +15 -42
  102. package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
  103. package/lib/life-cycle/save-html-to-disk.d.ts +6 -4
  104. package/lib/life-cycle/save-html-to-disk.d.ts.map +1 -0
  105. package/lib/life-cycle/save-html-to-disk.js +24 -33
  106. package/lib/life-cycle/save-html-to-disk.js.map +1 -1
  107. package/lib/life-cycle/save-resource-to-disk.d.ts +4 -3
  108. package/lib/life-cycle/save-resource-to-disk.d.ts.map +1 -0
  109. package/lib/life-cycle/save-resource-to-disk.js +10 -17
  110. package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
  111. package/lib/life-cycle/skip-links.d.ts +1 -0
  112. package/lib/life-cycle/skip-links.d.ts.map +1 -0
  113. package/lib/life-cycle/skip-links.js +6 -10
  114. package/lib/life-cycle/skip-links.js.map +1 -1
  115. package/lib/life-cycle/types.d.ts +8 -7
  116. package/lib/life-cycle/types.d.ts.map +1 -0
  117. package/lib/life-cycle/types.js +1 -2
  118. package/lib/logger/config-logger.d.ts +2 -1
  119. package/lib/logger/config-logger.d.ts.map +1 -0
  120. package/lib/logger/config-logger.js +4 -30
  121. package/lib/logger/config-logger.js.map +1 -1
  122. package/lib/logger/logger-worker.d.ts +3 -2
  123. package/lib/logger/logger-worker.d.ts.map +1 -0
  124. package/lib/logger/logger-worker.js +11 -13
  125. package/lib/logger/logger-worker.js.map +1 -1
  126. package/lib/logger/logger.d.ts +2 -1
  127. package/lib/logger/logger.d.ts.map +1 -0
  128. package/lib/logger/logger.js +15 -17
  129. package/lib/logger/logger.js.map +1 -1
  130. package/lib/options.d.ts +8 -8
  131. package/lib/options.d.ts.map +1 -0
  132. package/lib/options.js +22 -32
  133. package/lib/options.js.map +1 -1
  134. package/lib/resource.d.ts +3 -4
  135. package/lib/resource.d.ts.map +1 -0
  136. package/lib/resource.js +34 -70
  137. package/lib/resource.js.map +1 -1
  138. package/lib/sources.d.ts +2 -1
  139. package/lib/sources.d.ts.map +1 -0
  140. package/lib/sources.js +9 -12
  141. package/lib/sources.js.map +1 -1
  142. package/lib/types.d.ts +1 -0
  143. package/lib/types.d.ts.map +1 -0
  144. package/lib/types.js +1 -2
  145. package/lib/util.d.ts +4 -3
  146. package/lib/util.d.ts.map +1 -0
  147. package/lib/util.js +17 -34
  148. package/lib/util.js.map +1 -1
  149. package/package.json +19 -21
  150. package/src/downloader/adjust-concurrency.ts +2 -2
  151. package/src/downloader/index.ts +8 -8
  152. package/src/downloader/main.ts +50 -28
  153. package/src/downloader/multi.ts +11 -10
  154. package/src/downloader/pipeline-executor-impl.ts +7 -7
  155. package/src/downloader/single.ts +9 -6
  156. package/src/downloader/types.ts +3 -3
  157. package/src/downloader/worker-pool.ts +9 -9
  158. package/src/downloader/worker-type.ts +3 -3
  159. package/src/downloader/worker.ts +51 -29
  160. package/src/index.ts +8 -8
  161. package/src/io.ts +6 -6
  162. package/src/life-cycle/adapters.ts +7 -6
  163. package/src/life-cycle/css-url-parser.d.ts +1 -1
  164. package/src/life-cycle/default-life-cycle.ts +15 -15
  165. package/src/life-cycle/detect-resource-type.ts +2 -2
  166. package/src/life-cycle/download-resource.ts +18 -20
  167. package/src/life-cycle/download-streaming-resource.ts +20 -18
  168. package/src/life-cycle/index.ts +15 -15
  169. package/src/life-cycle/pipeline-executor.ts +6 -6
  170. package/src/life-cycle/process-css.ts +6 -5
  171. package/src/life-cycle/process-html-meta.ts +7 -6
  172. package/src/life-cycle/process-html.ts +21 -13
  173. package/src/life-cycle/process-site-map.ts +7 -6
  174. package/src/life-cycle/process-source-map.ts +5 -4
  175. package/src/life-cycle/process-svg.ts +10 -9
  176. package/src/life-cycle/read-or-copy-local-resource.ts +9 -7
  177. package/src/life-cycle/save-html-to-disk.ts +9 -13
  178. package/src/life-cycle/save-resource-to-disk.ts +6 -6
  179. package/src/life-cycle/types.ts +7 -7
  180. package/src/logger/config-logger.ts +5 -3
  181. package/src/logger/logger-worker.ts +8 -4
  182. package/src/logger/logger.ts +6 -4
  183. package/src/options.ts +15 -19
  184. package/src/resource.ts +10 -5
  185. package/src/sources.ts +1 -1
  186. package/src/util.ts +6 -10
  187. package/tsconfig.json +6 -2
@@ -1,46 +1,49 @@
1
- import {parentPort, workerData} from 'worker_threads';
2
- import {
3
- DownloadOptions,
4
- mergeOverrideOptions,
5
- StaticDownloadOptions
6
- } from '../options';
7
- import type {DownloadResource, SubmitResourceFunc} from '../life-cycle/types';
8
- import {
9
- normalizeResource,
10
- prepareResourceForClone,
11
- RawResource,
12
- Resource
13
- } from '../resource';
14
- import {skip} from '../logger/logger';
15
- import {importDefaultFromPath} from '../util';
16
- import {DownloadWorkerMessage, WorkerMessageType} from './types';
17
- import {PipelineExecutorImpl} from './pipeline-executor-impl';
1
+ import {parentPort, workerData} from 'node:worker_threads';
2
+ import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
3
+ import {mergeOverrideOptions} from '../options.js';
4
+ import type {
5
+ DownloadResource,
6
+ SubmitResourceFunc
7
+ } from '../life-cycle/types.js';
8
+ import type {RawResource, Resource} from '../resource.js';
9
+ import {normalizeResource, prepareResourceForClone} from '../resource.js';
10
+ import {skip} from '../logger/logger.js';
11
+ import {importDefaultFromPath} from '../util.js';
12
+ import type {DownloadWorkerMessage} from './types.js';
13
+ import {WorkerMessageType} from './types.js';
14
+ import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
18
15
  // noinspection ES6PreferShortImport
19
- import type {PipelineExecutor} from '../life-cycle/pipeline-executor';
20
- import type {WorkerTaskMessage} from './worker-type';
16
+ import type {PipelineExecutor} from '../life-cycle/pipeline-executor.js';
17
+ import type {WorkerTaskMessage} from './worker-type.js';
21
18
 
22
19
  const {pathToOptions, overrideOptions}: {
23
20
  pathToOptions: string,
24
21
  overrideOptions?: Partial<StaticDownloadOptions>
25
22
  } = workerData;
26
23
 
27
- const options: DownloadOptions =
28
- // eslint-disable-next-line @typescript-eslint/no-var-requires
29
- mergeOverrideOptions(importDefaultFromPath(pathToOptions), overrideOptions);
24
+ const asyncOptions: Promise<DownloadOptions> = importDefaultFromPath(pathToOptions);
30
25
 
31
- const pipeline: PipelineExecutor =
32
- new PipelineExecutorImpl(options, options.req, options);
26
+ const asyncPipeline = asyncOptions.then(options => {
27
+ options = mergeOverrideOptions(options, overrideOptions);
33
28
 
34
- options.configureLogger(options.localRoot, options.logSubDir || '');
29
+ const pipeline: PipelineExecutor =
30
+ new PipelineExecutorImpl(options, options.req, options);
35
31
 
36
- const init = pipeline.init(pipeline);
32
+ options.configureLogger(options.localRoot, options.logSubDir || '');
33
+
34
+ const init = pipeline.init(pipeline);
35
+ if (init && (init as Promise<void>).then) {
36
+ return init.then(() => pipeline);
37
+ }
38
+ return pipeline;
39
+ });
37
40
 
38
41
  parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) => {
39
42
  const collectedResource: RawResource[] = [];
40
43
  let error: Error | unknown | void;
41
44
  let redirectedUrl: string | undefined;
42
45
  try {
43
- await init;
46
+ const pipeline = await asyncPipeline;
44
47
  const res = msg.body;
45
48
  const downloadResource: DownloadResource = normalizeResource(res) as DownloadResource;
46
49
  const submit: SubmitResourceFunc = (resources: Resource | Resource[]) => {
@@ -67,8 +70,27 @@ parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) =
67
70
  redirectedUrl = processedResource.redirectedUrl;
68
71
  }
69
72
  } catch (e) {
70
- // TODO: handle if object could not be cloned here
71
- error = e;
73
+ // handle if object could not be cloned here
74
+ // https://github.com/website-local/website-scrap-engine/issues/340
75
+ try {
76
+ // should always be
77
+ if (typeof structuredClone === 'function') {
78
+ error = structuredClone(e);
79
+ } else {
80
+ // this is the old behavior before this
81
+ error = e;
82
+ }
83
+ } catch {
84
+ // can not clone, so no need to get the full error here
85
+ if (e && typeof e === 'object') {
86
+ const clone: Record<string, string> = {};
87
+ for (const k in e) {
88
+ clone[k] = String((e as Record<string, unknown>)[k]);
89
+ }
90
+ } else {
91
+ error = String(e);
92
+ }
93
+ }
72
94
  } finally {
73
95
  const message: DownloadWorkerMessage = {
74
96
  taskId: msg.taskId,
package/src/index.ts CHANGED
@@ -1,8 +1,8 @@
1
- export * as logger from './logger/logger';
2
- export * as downloader from './downloader/index';
3
- export * as lifeCycle from './life-cycle/index';
4
- export * as io from './io';
5
- export * as options from './options';
6
- export * as resource from './resource';
7
- export {SourceDefinition} from './sources';
8
- export * as util from './util';
1
+ export * as logger from './logger/logger.js';
2
+ export * as downloader from './downloader/index.js';
3
+ export * as lifeCycle from './life-cycle/index.js';
4
+ export * as io from './io.js';
5
+ export * as options from './options.js';
6
+ export * as resource from './resource.js';
7
+ export type {SourceDefinition} from './sources.js';
8
+ export * as util from './util.js';
package/src/io.ts CHANGED
@@ -1,9 +1,9 @@
1
- import type {ObjectEncodingOptions} from 'fs';
2
- import fs from 'fs';
3
- import {dirname} from 'path';
1
+ import type {ObjectEncodingOptions} from 'node:fs';
2
+ import fs from 'node:fs';
3
+ import {dirname} from 'node:path';
4
4
  import {mkdirp} from 'mkdirp';
5
- import type {ResourceBody, ResourceEncoding} from './resource';
6
- import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger';
5
+ import type {ResourceBody, ResourceEncoding} from './resource.js';
6
+ import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger.js';
7
7
 
8
8
  export const mkdirRetry = async (dir: string, retry = 3): Promise<void> => {
9
9
  let error: unknown | void;
@@ -40,7 +40,7 @@ export const writeFile = async (
40
40
  await mkdirRetry(dir);
41
41
  }
42
42
  let fileData: Uint8Array | string;
43
- let options: ObjectEncodingOptions | void;
43
+ let options: ObjectEncodingOptions | void = void 0;
44
44
  if (typeof data === 'string') {
45
45
  fileData = data;
46
46
  options = {encoding};
@@ -1,5 +1,6 @@
1
1
  import {load} from 'cheerio';
2
- import {Resource, ResourceEncoding, ResourceType} from '../resource';
2
+ import type {Resource, ResourceEncoding} from '../resource.js';
3
+ import {ResourceType} from '../resource.js';
3
4
  import type {
4
5
  AsyncResult,
5
6
  DownloadResource,
@@ -7,11 +8,11 @@ import type {
7
8
  ProcessResourceAfterDownloadFunc,
8
9
  ProcessResourceBeforeDownloadFunc,
9
10
  SubmitResourceFunc
10
- } from './types';
11
- import {toString} from '../util';
12
- import type {StaticDownloadOptions} from '../options';
13
- import type {PipelineExecutor} from './pipeline-executor';
14
- import type {Cheerio, CheerioStatic} from '../types';
11
+ } from './types.js';
12
+ import {toString} from '../util.js';
13
+ import type {StaticDownloadOptions} from '../options.js';
14
+ import type {PipelineExecutor} from './pipeline-executor.js';
15
+ import type {Cheerio, CheerioStatic} from '../types.js';
15
16
 
16
17
  export interface SkipProcessFunc {
17
18
  (url: string, element: Cheerio | null, parent: Resource | null): boolean;
@@ -7,5 +7,5 @@
7
7
  declare function parseCssUrls(cssText: string): string[];
8
8
 
9
9
  declare module 'css-url-parser' {
10
- export = parseCssUrls;
10
+ export default parseCssUrls;
11
11
  }
@@ -1,18 +1,18 @@
1
- import type {ProcessingLifeCycle} from './types';
2
- import {skipLinks} from './skip-links';
3
- import {detectResourceType} from './detect-resource-type';
4
- import {createResource} from '../resource';
5
- import {downloadResource} from './download-resource';
6
- import {processHtml} from './process-html';
7
- import {processHtmlMetaRefresh} from './process-html-meta';
8
- import {processCss} from './process-css';
9
- import {processSiteMap} from './process-site-map';
10
- import {processSvg} from './process-svg';
11
- import {saveHtmlToDisk} from './save-html-to-disk';
12
- import {saveResourceToDisk} from './save-resource-to-disk';
13
- import {processRedirectedUrl} from './adapters';
14
- import {downloadStreamingResource} from './download-streaming-resource';
15
- import {readOrCopyLocalResource} from './read-or-copy-local-resource';
1
+ import type {ProcessingLifeCycle} from './types.js';
2
+ import {skipLinks} from './skip-links.js';
3
+ import {detectResourceType} from './detect-resource-type.js';
4
+ import {createResource} from '../resource.js';
5
+ import {downloadResource} from './download-resource.js';
6
+ import {processHtml} from './process-html.js';
7
+ import {processHtmlMetaRefresh} from './process-html-meta.js';
8
+ import {processCss} from './process-css.js';
9
+ import {processSiteMap} from './process-site-map.js';
10
+ import {processSvg} from './process-svg.js';
11
+ import {saveHtmlToDisk} from './save-html-to-disk.js';
12
+ import {saveResourceToDisk} from './save-resource-to-disk.js';
13
+ import {processRedirectedUrl} from './adapters.js';
14
+ import {downloadStreamingResource} from './download-streaming-resource.js';
15
+ import {readOrCopyLocalResource} from './read-or-copy-local-resource.js';
16
16
 
17
17
  /**
18
18
  * Get a copy of default life cycle
@@ -1,5 +1,5 @@
1
- import {ResourceType} from '../resource';
2
- import {arrayToMap, isSiteMap} from '../util';
1
+ import {ResourceType} from '../resource.js';
2
+ import {arrayToMap, isSiteMap} from '../util.js';
3
3
 
4
4
  // immutable
5
5
  export const binaryExtension = arrayToMap([
@@ -1,24 +1,22 @@
1
- import got, {
2
- BeforeRetryHook,
3
- NormalizedOptions,
4
- Options,
5
- RequestError,
6
- TimeoutError
7
- } from 'got';
8
- import type {Response} from 'got/dist/source/as-promise';
9
- import type {DownloadResource, RequestOptions} from './types';
10
- import {generateSavePath, Resource, ResourceType} from '../resource';
11
- import type {StaticDownloadOptions} from '../options';
12
- import * as logger from '../logger/logger';
13
- import {isUrlHttp, sleep} from '../util';
1
+ import type {BeforeRetryHook, OptionsInit, RequestError, Response} from 'got';
2
+ import got, {TimeoutError} from 'got';
3
+ import type {DownloadResource, RequestOptions} from './types.js';
4
+ import type {Resource} from '../resource.js';
5
+ import {generateSavePath, ResourceType} from '../resource.js';
6
+ import type {StaticDownloadOptions} from '../options.js';
7
+ import * as logger from '../logger/logger.js';
8
+ import {isUrlHttp, sleep} from '../util.js';
14
9
  import URI from 'urijs';
15
10
 
16
11
  /** Take logs before retry */
17
12
  export const beforeRetryHook: BeforeRetryHook = (
18
- options: NormalizedOptions,
19
- error: RequestError | undefined,
13
+ error: RequestError,
20
14
  retryCount: number | undefined
21
15
  ) => {
16
+ const options = error.options;
17
+ if (!options) {
18
+ return;
19
+ }
22
20
  if (!error) {
23
21
  logger.retry.warn(retryCount, String(options.url));
24
22
  return;
@@ -49,15 +47,15 @@ export interface DownloadError extends Partial<Error> {
49
47
  */
50
48
  export async function getRetry(
51
49
  url: string,
52
- options: Options
50
+ options: OptionsInit
53
51
  ): Promise<Response<Buffer | string> | void> {
54
- let res: Response<Buffer | string> | void;
55
- let err: DownloadError | void, optionsClone: Options;
52
+ let res: Response<Buffer | string> | void = void 0;
53
+ let err: DownloadError | void = void 0, optionsClone: OptionsInit;
56
54
  for (let i = 0; i < 25; i++) {
57
55
  err = void 0;
58
56
  try {
59
57
  optionsClone = Object.assign({}, options);
60
- res = (await got(url, optionsClone)) as typeof res;
58
+ res = (await got(url, optionsClone)) as Response<Buffer | string>;
61
59
  if (!res || !res.body || !res.body.length) {
62
60
  logger.retry.warn(i, url, 'manually retry on empty response or body',
63
61
  res && res.body);
@@ -102,7 +100,7 @@ export async function requestForResource(
102
100
  options?: StaticDownloadOptions
103
101
  ): Promise<DownloadResource | Resource | void> {
104
102
  const downloadLink: string = encodeURI(decodeURI(res.downloadLink));
105
- const reqOptions: Options = Object.assign({}, requestOptions);
103
+ const reqOptions: OptionsInit = Object.assign({}, requestOptions);
106
104
  reqOptions.responseType = 'buffer';
107
105
  if (res.refUrl && res.refUrl !== downloadLink) {
108
106
  const headers = Object.assign({}, reqOptions.headers);
@@ -1,21 +1,23 @@
1
- import got, {HTTPError, RequestError} from 'got';
2
- import type {Response} from 'got/dist/source/core';
3
- import path from 'path';
4
- import {constants, createWriteStream, promises as fs, WriteStream} from 'fs';
5
- import {Resource, ResourceType} from '../resource';
1
+ import path from 'node:path';
2
+ import type {WriteStream} from 'node:fs';
3
+ import {constants, createWriteStream, promises as fs} from 'node:fs';
4
+ import {pipeline} from 'node:stream';
5
+ import {promisify} from 'node:util';
6
+ import type {RequestError, Response} from 'got';
7
+ import got, {HTTPError} from 'got';
8
+ import type {Resource} from '../resource.js';
9
+ import {ResourceType} from '../resource.js';
6
10
  import type {
7
11
  AsyncResult,
8
12
  DownloadResource,
9
13
  DownloadResourceFunc,
10
14
  RequestOptions
11
- } from './types';
12
- import {mkdirRetry} from '../io';
13
- import {pipeline} from 'stream';
14
- import {promisify} from 'util';
15
- import {error as errorLogger, retry as retryLogger} from '../logger/logger';
16
- import type {StaticDownloadOptions} from '../options';
17
- import type {PipelineExecutor} from './pipeline-executor';
18
- import {isUrlHttp} from '../util';
15
+ } from './types.js';
16
+ import {mkdirRetry} from '../io.js';
17
+ import {error as errorLogger, retry as retryLogger} from '../logger/logger.js';
18
+ import type {StaticDownloadOptions} from '../options.js';
19
+ import type {PipelineExecutor} from './pipeline-executor.js';
20
+ import {isUrlHttp} from '../util.js';
19
21
 
20
22
  const promisifyPipeline = promisify(pipeline);
21
23
 
@@ -108,7 +110,7 @@ export async function streamingDownloadToFile(
108
110
  rangeStart = undefined;
109
111
  }
110
112
 
111
- if (response.request.aborted) {
113
+ if (response.request.isAborted) {
112
114
  // Canceled while downloading
113
115
  //- will throw a `CancelError` or `TimeoutError` error
114
116
  return;
@@ -129,9 +131,9 @@ export async function streamingDownloadToFile(
129
131
  return;
130
132
  }
131
133
 
132
- if (request._isAboutToError) {
133
- return;
134
- }
134
+ // if (request._isAboutToError) {
135
+ // return;
136
+ // }
135
137
 
136
138
  resolve(response);
137
139
  });
@@ -222,7 +224,7 @@ export async function optionallySetLastModifiedTime(
222
224
  res: Resource, options: StaticDownloadOptions
223
225
  ): Promise<void> {
224
226
  // https://github.com/website-local/website-scrap-engine/issues/174
225
- let mtime: number | void;
227
+ let mtime: number | void = void 0;
226
228
  if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
227
229
  mtime = Date.parse(res.meta.headers?.['last-modified']);
228
230
  }
@@ -1,21 +1,21 @@
1
- export * as adapter from './adapters';
2
- export {defaultLifeCycle} from './default-life-cycle';
3
- export {detectResourceType} from './detect-resource-type';
1
+ export * as adapter from './adapters.js';
2
+ export {defaultLifeCycle} from './default-life-cycle.js';
3
+ export {detectResourceType} from './detect-resource-type.js';
4
4
  export {
5
5
  beforeRetryHook, getRetry, requestForResource, downloadResource
6
- } from './download-resource';
6
+ } from './download-resource.js';
7
7
  export {
8
8
  streamingDownloadToFile,
9
9
  downloadStreamingResource,
10
10
  downloadStreamingResourceWithHook
11
- } from './download-streaming-resource';
12
- export {PipelineExecutor} from './pipeline-executor';
13
- export {processCssText, processCss} from './process-css';
14
- export {processHtml} from './process-html';
15
- export {processHtmlMetaRefresh} from './process-html-meta';
16
- export {processSiteMap} from './process-site-map';
17
- export {processSvg} from './process-svg';
18
- export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk';
19
- export {saveResourceToDisk} from './save-resource-to-disk';
20
- export {skipLinks} from './skip-links';
21
- export * as types from './types';
11
+ } from './download-streaming-resource.js';
12
+ export type {PipelineExecutor} from './pipeline-executor.js';
13
+ export {processCssText, processCss} from './process-css.js';
14
+ export {processHtml} from './process-html.js';
15
+ export {processHtmlMetaRefresh} from './process-html-meta.js';
16
+ export {processSiteMap} from './process-site-map.js';
17
+ export {processSvg} from './process-svg.js';
18
+ export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk.js';
19
+ export {saveResourceToDisk} from './save-resource-to-disk.js';
20
+ export {skipLinks} from './skip-links.js';
21
+ export * as types from './types.js';
@@ -1,14 +1,14 @@
1
- import type {Resource, ResourceEncoding, ResourceType} from '../resource';
2
- import type {StaticDownloadOptions} from '../options';
1
+ import type {Resource, ResourceEncoding, ResourceType} from '../resource.js';
2
+ import type {StaticDownloadOptions} from '../options.js';
3
3
  import type {
4
4
  AsyncResult,
5
5
  DownloadResource,
6
6
  RequestOptions,
7
7
  SubmitResourceFunc
8
- } from './types';
9
- import type {Cheerio} from '../types';
10
- import type {DownloaderWithMeta} from '../downloader/types';
11
- import type {WorkerInfo} from '../downloader/worker-pool';
8
+ } from './types.js';
9
+ import type {Cheerio} from '../types.js';
10
+ import type {DownloaderWithMeta} from '../downloader/types.js';
11
+ import type {WorkerInfo} from '../downloader/worker-pool.js';
12
12
 
13
13
  export interface PipelineExecutor {
14
14
  /**
@@ -1,9 +1,10 @@
1
1
  import parseCssUrls from 'css-url-parser';
2
- import type {DownloadResource, SubmitResourceFunc} from './types';
3
- import type {StaticDownloadOptions} from '../options';
4
- import {Resource, ResourceType} from '../resource';
5
- import {toString} from '../util';
6
- import type {PipelineExecutor} from './pipeline-executor';
2
+ import type {DownloadResource, SubmitResourceFunc} from './types.js';
3
+ import type {StaticDownloadOptions} from '../options.js';
4
+ import type {Resource} from '../resource.js';
5
+ import {ResourceType} from '../resource.js';
6
+ import {toString} from '../util.js';
7
+ import type {PipelineExecutor} from './pipeline-executor.js';
7
8
 
8
9
  export async function processCssText(
9
10
  cssText: string,
@@ -1,9 +1,10 @@
1
- import type {DownloadResource, SubmitResourceFunc} from './types';
2
- import type {StaticDownloadOptions} from '../options';
3
- import type {PipelineExecutor} from './pipeline-executor';
4
- import {Resource, ResourceType} from '../resource';
5
- import {parseHtml} from './adapters';
6
- import {skip} from '../logger/logger';
1
+ import type {DownloadResource, SubmitResourceFunc} from './types.js';
2
+ import type {StaticDownloadOptions} from '../options.js';
3
+ import type {PipelineExecutor} from './pipeline-executor.js';
4
+ import type {Resource} from '../resource.js';
5
+ import {ResourceType} from '../resource.js';
6
+ import {parseHtml} from './adapters.js';
7
+ import {skip} from '../logger/logger.js';
7
8
 
8
9
  /**
9
10
  * Originally create by https://github.com/stevenvachon at
@@ -1,13 +1,18 @@
1
- import srcset, {SrcSetDefinition} from 'srcset';
2
- import {sources as defaultSources} from '../sources';
3
- import type {DownloadResource, SubmitResourceFunc} from './types';
4
- import type {StaticDownloadOptions} from '../options';
5
- import {Resource, ResourceType} from '../resource';
6
- import {processCssText} from './process-css';
7
- import {error, skip} from '../logger/logger';
8
- import type {PipelineExecutor} from './pipeline-executor';
9
- import {parseHtml} from './adapters';
10
- import type {Cheerio, CheerioStatic} from '../types';
1
+ import type {SrcSetDefinition} from 'srcset';
2
+ import {parseSrcset, stringifySrcset} from 'srcset';
3
+ import {sources as defaultSources} from '../sources.js';
4
+ import type {DownloadResource, SubmitResourceFunc} from './types.js';
5
+ import type {StaticDownloadOptions} from '../options.js';
6
+ import type {Resource} from '../resource.js';
7
+ import {ResourceType} from '../resource.js';
8
+ import {processCssText} from './process-css.js';
9
+ import {error, skip} from '../logger/logger.js';
10
+ import type {PipelineExecutor} from './pipeline-executor.js';
11
+ import {parseHtml} from './adapters.js';
12
+ import type {Cheerio, CheerioStatic} from '../types.js';
13
+
14
+ type Writeable<T> = { -readonly [P in keyof T]: T[P] };
15
+ type WriteableSrcSet = Writeable<SrcSetDefinition>;
11
16
 
12
17
  export async function processHtml(
13
18
  res: DownloadResource,
@@ -54,7 +59,7 @@ export async function processHtml(
54
59
  let links: string[], replaceValue: string | SrcSetDefinition[];
55
60
  if (attr === 'srcset') {
56
61
  try {
57
- replaceValue = srcset.parse(attrValue);
62
+ replaceValue = parseSrcset(attrValue);
58
63
  } catch (e) {
59
64
  error.info('skipping invalid srcset', attrValue, e);
60
65
  // should invalid srcset being removed?
@@ -104,7 +109,10 @@ export async function processHtml(
104
109
  submit(resource);
105
110
  }
106
111
  if (attr === 'srcset') {
107
- (replaceValue as SrcSetDefinition[])[linkIndex].url = resource.replacePath;
112
+ // 20241005: It's ok to do this
113
+ // I've looked into the source code of srcset 5.0.1
114
+ // and there is nothing preventing the return value to change
115
+ (replaceValue as WriteableSrcSet[])[linkIndex].url = resource.replacePath;
108
116
  } else {
109
117
  replaceValue = resource.replacePath;
110
118
  // historical workaround here
@@ -114,7 +122,7 @@ export async function processHtml(
114
122
  }
115
123
  }
116
124
  if (attr === 'srcset') {
117
- elem.attr(attr, srcset.stringify(replaceValue as SrcSetDefinition[]));
125
+ elem.attr(attr, stringifySrcset(replaceValue as SrcSetDefinition[]));
118
126
  } else if (attr) {
119
127
  elem.attr(attr, replaceValue as string);
120
128
  } else {
@@ -1,10 +1,11 @@
1
1
  import {load} from 'cheerio';
2
- import type {DownloadResource, SubmitResourceFunc} from './types';
3
- import type {StaticDownloadOptions} from '../options';
4
- import {Resource, ResourceType} from '../resource';
5
- import {toString} from '../util';
6
- import type {PipelineExecutor} from './pipeline-executor';
7
- import type {CheerioStatic} from '../types';
2
+ import type {DownloadResource, SubmitResourceFunc} from './types.js';
3
+ import type {StaticDownloadOptions} from '../options.js';
4
+ import type {Resource} from '../resource.js';
5
+ import {ResourceType} from '../resource.js';
6
+ import {toString} from '../util.js';
7
+ import type {PipelineExecutor} from './pipeline-executor.js';
8
+ import type {CheerioStatic} from '../types.js';
8
9
 
9
10
  export async function processSiteMap(
10
11
  res: DownloadResource,
@@ -1,7 +1,8 @@
1
- import type {DownloadResource, SubmitResourceFunc} from './types';
2
- import type {StaticDownloadOptions} from '../options';
3
- import {Resource, ResourceEncoding, ResourceType} from '../resource';
4
- import type {PipelineExecutor} from './pipeline-executor';
1
+ import type {DownloadResource, SubmitResourceFunc} from './types.js';
2
+ import type {StaticDownloadOptions} from '../options.js';
3
+ import type {Resource, ResourceEncoding} from '../resource.js';
4
+ import {ResourceType} from '../resource.js';
5
+ import type {PipelineExecutor} from './pipeline-executor.js';
5
6
 
6
7
  // https://developer.mozilla.org/docs/Web/HTTP/Headers/SourceMap
7
8
  export const SOURCE_MAP_HEADER = 'SourceMap'.toLowerCase();
@@ -1,12 +1,13 @@
1
- import type {SourceDefinition} from '../sources';
2
- import type {DownloadResource, SubmitResourceFunc} from './types';
3
- import type {StaticDownloadOptions} from '../options';
4
- import {Resource, ResourceType} from '../resource';
5
- import {error, skip} from '../logger/logger';
6
- import type {PipelineExecutor} from './pipeline-executor';
7
- import {parseHtml} from './adapters';
8
- import {getResourceBodyFromHtml} from './save-html-to-disk';
9
- import type {Cheerio, CheerioStatic} from '../types';
1
+ import type {SourceDefinition} from '../sources.js';
2
+ import type {DownloadResource, SubmitResourceFunc} from './types.js';
3
+ import type {StaticDownloadOptions} from '../options.js';
4
+ import type {Resource} from '../resource.js';
5
+ import {ResourceType} from '../resource.js';
6
+ import {error, skip} from '../logger/logger.js';
7
+ import type {PipelineExecutor} from './pipeline-executor.js';
8
+ import {parseHtml} from './adapters.js';
9
+ import {getResourceBodyFromHtml} from './save-html-to-disk.js';
10
+ import type {Cheerio, CheerioStatic} from '../types.js';
10
11
 
11
12
  const svgSelectors: SourceDefinition[] = [
12
13
  {selector: '*[xlink\\:href]', attr: 'xlink:href', type: ResourceType.Binary},
@@ -1,9 +1,11 @@
1
- import * as path from 'path';
2
- import {promises, Stats} from 'fs';
3
- import {Resource, ResourceType} from '../resource';
4
- import type {DownloadResource, RequestOptions} from './types';
5
- import type {StaticDownloadOptions} from '../options';
6
- import {error as errorLogger} from '../logger/logger';
1
+ import * as path from 'node:path';
2
+ import type {Stats} from 'node:fs';
3
+ import {promises} from 'node:fs';
4
+ import type {Resource} from '../resource.js';
5
+ import {ResourceType} from '../resource.js';
6
+ import type {DownloadResource, RequestOptions} from './types.js';
7
+ import type {StaticDownloadOptions} from '../options.js';
8
+ import {error as errorLogger} from '../logger/logger.js';
7
9
 
8
10
  const FILE_PREFIX = 'file://';
9
11
 
@@ -27,7 +29,7 @@ export async function readOrCopyLocalResource(
27
29
  return;
28
30
  }
29
31
  // index.html handling
30
- let stats: Stats | void;
32
+ let stats: Stats | void = void 0;
31
33
  if (res.type === ResourceType.Html) {
32
34
  stats = await promises.stat(fileSrcPath);
33
35
  if (stats.isDirectory()) {
@@ -1,16 +1,12 @@
1
- import path from 'path';
1
+ import path from 'node:path';
2
2
  import URI from 'urijs';
3
- import type {DownloadResource} from './types';
4
- import type {StaticDownloadOptions} from '../options';
5
- import {
6
- ResourceBody,
7
- ResourceEncoding,
8
- ResourceType,
9
- urlOfSavePath
10
- } from '../resource';
11
- import {escapePath} from '../util';
12
- import {writeFile} from '../io';
13
- import type {PipelineExecutor} from './pipeline-executor';
3
+ import type {DownloadResource} from './types.js';
4
+ import type {StaticDownloadOptions} from '../options.js';
5
+ import type {ResourceBody, ResourceEncoding} from '../resource.js';
6
+ import {ResourceType, urlOfSavePath} from '../resource.js';
7
+ import {escapePath} from '../util.js';
8
+ import {writeFile} from '../io.js';
9
+ import type {PipelineExecutor} from './pipeline-executor.js';
14
10
 
15
11
  export function getResourceBodyFromHtml(
16
12
  res: DownloadResource & { type: ResourceType.Html },
@@ -46,7 +42,7 @@ export async function saveHtmlToDisk(
46
42
  }
47
43
  const localRoot: string = res.localRoot ?? options.localRoot;
48
44
  // https://github.com/website-local/website-scrap-engine/issues/174
49
- let mtime: number | void;
45
+ let mtime: number | void = void 0;
50
46
  if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
51
47
  mtime = Date.parse(res.meta.headers?.['last-modified']);
52
48
  }