website-scrap-engine 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/lib/downloader/adjust-concurrency.d.ts +2 -1
  2. package/lib/downloader/adjust-concurrency.d.ts.map +1 -0
  3. package/lib/downloader/adjust-concurrency.js +4 -8
  4. package/lib/downloader/adjust-concurrency.js.map +1 -1
  5. package/lib/downloader/index.d.ts +9 -8
  6. package/lib/downloader/index.d.ts.map +1 -0
  7. package/lib/downloader/index.js +8 -40
  8. package/lib/downloader/index.js.map +1 -1
  9. package/lib/downloader/main.d.ts +15 -6
  10. package/lib/downloader/main.d.ts.map +1 -0
  11. package/lib/downloader/main.js +49 -32
  12. package/lib/downloader/main.js.map +1 -1
  13. package/lib/downloader/multi.d.ts +7 -5
  14. package/lib/downloader/multi.d.ts.map +1 -0
  15. package/lib/downloader/multi.js +10 -17
  16. package/lib/downloader/multi.js.map +1 -1
  17. package/lib/downloader/pipeline-executor-impl.d.ts +8 -7
  18. package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -0
  19. package/lib/downloader/pipeline-executor-impl.js +1 -5
  20. package/lib/downloader/pipeline-executor-impl.js.map +1 -1
  21. package/lib/downloader/single.d.ts +4 -3
  22. package/lib/downloader/single.d.ts.map +1 -0
  23. package/lib/downloader/single.js +7 -11
  24. package/lib/downloader/single.js.map +1 -1
  25. package/lib/downloader/types.d.ts +4 -4
  26. package/lib/downloader/types.d.ts.map +1 -0
  27. package/lib/downloader/types.js +2 -5
  28. package/lib/downloader/types.js.map +1 -1
  29. package/lib/downloader/worker-pool.d.ts +6 -7
  30. package/lib/downloader/worker-pool.d.ts.map +1 -0
  31. package/lib/downloader/worker-pool.js +7 -35
  32. package/lib/downloader/worker-pool.js.map +1 -1
  33. package/lib/downloader/worker-type.d.ts +4 -3
  34. package/lib/downloader/worker-type.d.ts.map +1 -0
  35. package/lib/downloader/worker-type.js +1 -2
  36. package/lib/downloader/worker.d.ts +1 -0
  37. package/lib/downloader/worker.d.ts.map +1 -0
  38. package/lib/downloader/worker.js +52 -27
  39. package/lib/downloader/worker.js.map +1 -1
  40. package/lib/index.d.ts +9 -8
  41. package/lib/index.d.ts.map +1 -0
  42. package/lib/index.js +7 -33
  43. package/lib/index.js.map +1 -1
  44. package/lib/io.d.ts +2 -1
  45. package/lib/io.d.ts.map +1 -0
  46. package/lib/io.js +17 -25
  47. package/lib/io.js.map +1 -1
  48. package/lib/life-cycle/adapters.d.ts +7 -5
  49. package/lib/life-cycle/adapters.d.ts.map +1 -0
  50. package/lib/life-cycle/adapters.js +18 -30
  51. package/lib/life-cycle/adapters.js.map +1 -1
  52. package/lib/life-cycle/default-life-cycle.d.ts +2 -1
  53. package/lib/life-cycle/default-life-cycle.d.ts.map +1 -0
  54. package/lib/life-cycle/default-life-cycle.js +28 -32
  55. package/lib/life-cycle/default-life-cycle.js.map +1 -1
  56. package/lib/life-cycle/detect-resource-type.d.ts +2 -1
  57. package/lib/life-cycle/detect-resource-type.d.ts.map +1 -0
  58. package/lib/life-cycle/detect-resource-type.js +12 -17
  59. package/lib/life-cycle/detect-resource-type.js.map +1 -1
  60. package/lib/life-cycle/download-resource.d.ts +6 -7
  61. package/lib/life-cycle/download-resource.d.ts.map +1 -0
  62. package/lib/life-cycle/download-resource.js +23 -52
  63. package/lib/life-cycle/download-resource.js.map +1 -1
  64. package/lib/life-cycle/download-streaming-resource.d.ts +6 -5
  65. package/lib/life-cycle/download-streaming-resource.d.ts.map +1 -0
  66. package/lib/life-cycle/download-streaming-resource.js +39 -74
  67. package/lib/life-cycle/download-streaming-resource.js.map +1 -1
  68. package/lib/life-cycle/index.d.ts +16 -15
  69. package/lib/life-cycle/index.d.ts.map +1 -0
  70. package/lib/life-cycle/index.js +14 -59
  71. package/lib/life-cycle/index.js.map +1 -1
  72. package/lib/life-cycle/pipeline-executor.d.ts +7 -6
  73. package/lib/life-cycle/pipeline-executor.d.ts.map +1 -0
  74. package/lib/life-cycle/pipeline-executor.js +1 -2
  75. package/lib/life-cycle/process-css.d.ts +5 -4
  76. package/lib/life-cycle/process-css.d.ts.map +1 -0
  77. package/lib/life-cycle/process-css.js +10 -18
  78. package/lib/life-cycle/process-css.js.map +1 -1
  79. package/lib/life-cycle/process-html-meta.d.ts +4 -3
  80. package/lib/life-cycle/process-html-meta.d.ts.map +1 -0
  81. package/lib/life-cycle/process-html-meta.js +11 -15
  82. package/lib/life-cycle/process-html-meta.js.map +1 -1
  83. package/lib/life-cycle/process-html.d.ts +4 -3
  84. package/lib/life-cycle/process-html.d.ts.map +1 -0
  85. package/lib/life-cycle/process-html.js +27 -31
  86. package/lib/life-cycle/process-html.js.map +1 -1
  87. package/lib/life-cycle/process-site-map.d.ts +4 -3
  88. package/lib/life-cycle/process-site-map.d.ts.map +1 -0
  89. package/lib/life-cycle/process-site-map.js +7 -11
  90. package/lib/life-cycle/process-site-map.js.map +1 -1
  91. package/lib/life-cycle/process-source-map.d.ts +4 -4
  92. package/lib/life-cycle/process-source-map.d.ts.map +1 -0
  93. package/lib/life-cycle/process-source-map.js +16 -21
  94. package/lib/life-cycle/process-source-map.js.map +1 -1
  95. package/lib/life-cycle/process-svg.d.ts +4 -3
  96. package/lib/life-cycle/process-svg.d.ts.map +1 -0
  97. package/lib/life-cycle/process-svg.js +17 -21
  98. package/lib/life-cycle/process-svg.js.map +1 -1
  99. package/lib/life-cycle/read-or-copy-local-resource.d.ts +4 -3
  100. package/lib/life-cycle/read-or-copy-local-resource.d.ts.map +1 -0
  101. package/lib/life-cycle/read-or-copy-local-resource.js +15 -42
  102. package/lib/life-cycle/read-or-copy-local-resource.js.map +1 -1
  103. package/lib/life-cycle/save-html-to-disk.d.ts +6 -4
  104. package/lib/life-cycle/save-html-to-disk.d.ts.map +1 -0
  105. package/lib/life-cycle/save-html-to-disk.js +24 -33
  106. package/lib/life-cycle/save-html-to-disk.js.map +1 -1
  107. package/lib/life-cycle/save-resource-to-disk.d.ts +4 -3
  108. package/lib/life-cycle/save-resource-to-disk.d.ts.map +1 -0
  109. package/lib/life-cycle/save-resource-to-disk.js +10 -17
  110. package/lib/life-cycle/save-resource-to-disk.js.map +1 -1
  111. package/lib/life-cycle/skip-links.d.ts +1 -0
  112. package/lib/life-cycle/skip-links.d.ts.map +1 -0
  113. package/lib/life-cycle/skip-links.js +6 -10
  114. package/lib/life-cycle/skip-links.js.map +1 -1
  115. package/lib/life-cycle/types.d.ts +8 -7
  116. package/lib/life-cycle/types.d.ts.map +1 -0
  117. package/lib/life-cycle/types.js +1 -2
  118. package/lib/logger/config-logger.d.ts +2 -1
  119. package/lib/logger/config-logger.d.ts.map +1 -0
  120. package/lib/logger/config-logger.js +4 -30
  121. package/lib/logger/config-logger.js.map +1 -1
  122. package/lib/logger/logger-worker.d.ts +3 -2
  123. package/lib/logger/logger-worker.d.ts.map +1 -0
  124. package/lib/logger/logger-worker.js +11 -13
  125. package/lib/logger/logger-worker.js.map +1 -1
  126. package/lib/logger/logger.d.ts +2 -1
  127. package/lib/logger/logger.d.ts.map +1 -0
  128. package/lib/logger/logger.js +15 -17
  129. package/lib/logger/logger.js.map +1 -1
  130. package/lib/options.d.ts +8 -8
  131. package/lib/options.d.ts.map +1 -0
  132. package/lib/options.js +22 -32
  133. package/lib/options.js.map +1 -1
  134. package/lib/resource.d.ts +3 -4
  135. package/lib/resource.d.ts.map +1 -0
  136. package/lib/resource.js +34 -70
  137. package/lib/resource.js.map +1 -1
  138. package/lib/sources.d.ts +2 -1
  139. package/lib/sources.d.ts.map +1 -0
  140. package/lib/sources.js +9 -12
  141. package/lib/sources.js.map +1 -1
  142. package/lib/types.d.ts +1 -0
  143. package/lib/types.d.ts.map +1 -0
  144. package/lib/types.js +1 -2
  145. package/lib/util.d.ts +4 -3
  146. package/lib/util.d.ts.map +1 -0
  147. package/lib/util.js +17 -34
  148. package/lib/util.js.map +1 -1
  149. package/package.json +18 -20
  150. package/src/downloader/adjust-concurrency.ts +2 -2
  151. package/src/downloader/index.ts +8 -8
  152. package/src/downloader/main.ts +50 -28
  153. package/src/downloader/multi.ts +11 -10
  154. package/src/downloader/pipeline-executor-impl.ts +7 -7
  155. package/src/downloader/single.ts +9 -6
  156. package/src/downloader/types.ts +3 -3
  157. package/src/downloader/worker-pool.ts +9 -9
  158. package/src/downloader/worker-type.ts +3 -3
  159. package/src/downloader/worker.ts +51 -29
  160. package/src/index.ts +8 -8
  161. package/src/io.ts +6 -6
  162. package/src/life-cycle/adapters.ts +7 -6
  163. package/src/life-cycle/css-url-parser.d.ts +1 -1
  164. package/src/life-cycle/default-life-cycle.ts +15 -15
  165. package/src/life-cycle/detect-resource-type.ts +2 -2
  166. package/src/life-cycle/download-resource.ts +18 -20
  167. package/src/life-cycle/download-streaming-resource.ts +20 -18
  168. package/src/life-cycle/index.ts +15 -15
  169. package/src/life-cycle/pipeline-executor.ts +6 -6
  170. package/src/life-cycle/process-css.ts +6 -5
  171. package/src/life-cycle/process-html-meta.ts +7 -6
  172. package/src/life-cycle/process-html.ts +21 -13
  173. package/src/life-cycle/process-site-map.ts +7 -6
  174. package/src/life-cycle/process-source-map.ts +5 -4
  175. package/src/life-cycle/process-svg.ts +10 -9
  176. package/src/life-cycle/read-or-copy-local-resource.ts +9 -7
  177. package/src/life-cycle/save-html-to-disk.ts +9 -13
  178. package/src/life-cycle/save-resource-to-disk.ts +6 -6
  179. package/src/life-cycle/types.ts +7 -7
  180. package/src/logger/config-logger.ts +5 -3
  181. package/src/logger/logger-worker.ts +8 -4
  182. package/src/logger/logger.ts +6 -4
  183. package/src/options.ts +15 -19
  184. package/src/resource.ts +10 -5
  185. package/src/sources.ts +1 -1
  186. package/src/util.ts +6 -10
  187. package/tsconfig.json +6 -2
package/src/life-cycle/save-resource-to-disk.ts CHANGED
@@ -1,8 +1,8 @@
1
- import path from 'path';
2
- import type {DownloadResource} from './types';
3
- import type {StaticDownloadOptions} from '../options';
4
- import {writeFile} from '../io';
5
- import type {PipelineExecutor} from './pipeline-executor';
1
+ import path from 'node:path';
2
+ import type {DownloadResource} from './types.js';
3
+ import type {StaticDownloadOptions} from '../options.js';
4
+ import {writeFile} from '../io.js';
5
+ import type {PipelineExecutor} from './pipeline-executor.js';
6
6
 
7
7
  export async function saveResourceToDisk(
8
8
  res: DownloadResource,
@@ -10,7 +10,7 @@ export async function saveResourceToDisk(
10
10
  pipeline: PipelineExecutor): Promise<DownloadResource | void> {
11
11
  const localRoot: string = res.localRoot ?? options.localRoot;
12
12
  // https://github.com/website-local/website-scrap-engine/issues/174
13
- let mtime: number | void;
13
+ let mtime: number | void = void 0;
14
14
  if (options.preferRemoteLastModifiedTime && res.meta?.headers?.['last-modified']) {
15
15
  mtime = Date.parse(res.meta.headers?.['last-modified']);
16
16
  }
package/src/life-cycle/types.ts CHANGED
@@ -1,16 +1,16 @@
1
- import type {Options as GotOptions} from 'got/dist/source/as-promise';
1
+ import type {OptionsInit as GotOptions} from 'got';
2
2
  import type {
3
3
  createResource,
4
4
  GenerateSavePathFn,
5
5
  Resource,
6
6
  ResourceBody,
7
7
  ResourceType
8
- } from '../resource';
9
- import type {StaticDownloadOptions} from '../options';
10
- import type {PipelineExecutor} from './pipeline-executor';
11
- import type {Cheerio} from '../types';
12
- import type {DownloaderWithMeta} from '../downloader/types';
13
- import type {WorkerInfo} from '../downloader/worker-pool';
8
+ } from '../resource.js';
9
+ import type {StaticDownloadOptions} from '../options.js';
10
+ import type {PipelineExecutor} from './pipeline-executor.js';
11
+ import type {Cheerio} from '../types.js';
12
+ import type {DownloaderWithMeta} from '../downloader/types.js';
13
+ import type {WorkerInfo} from '../downloader/worker-pool.js';
14
14
 
15
15
  export type AsyncResult<T> = T | Promise<T>;
16
16
 
package/src/logger/config-logger.ts CHANGED
@@ -1,8 +1,10 @@
1
- import {configure, Log4js} from 'log4js';
2
- import * as path from 'path';
1
+ import type {Log4js} from 'log4js';
2
+ // https://github.com/jestjs/jest/issues/11563
3
+ import log4js from 'log4js';
4
+ import * as path from 'node:path';
3
5
 
4
6
  export const configureLogger = (localRoot: string, subDir: string): Log4js =>
5
- configure({
7
+ log4js.configure({
6
8
  appenders: {
7
9
  'retry': {
8
10
  type: 'file',
package/src/logger/logger-worker.ts CHANGED
@@ -1,7 +1,11 @@
1
- import {getLogger, Logger} from 'log4js';
2
- import {parentPort} from 'worker_threads';
3
- import type {LogWorkerMessage, WorkerLog} from '../downloader/worker-type';
4
- import {WorkerMessageType} from '../downloader/types';
1
+ import type {Logger} from 'log4js';
2
+ // https://github.com/jestjs/jest/issues/11563
3
+ import log4js from 'log4js';
4
+ import {parentPort} from 'node:worker_threads';
5
+ import type {LogWorkerMessage, WorkerLog} from '../downloader/worker-type.js';
6
+ import {WorkerMessageType} from '../downloader/types.js';
7
+
8
+ const getLogger = log4js.getLogger;
5
9
 
6
10
  export const logLevels = [
7
11
  'trace', 'debug', 'info', 'warn', 'error', 'fatal', 'mark'
package/src/logger/logger.ts CHANGED
@@ -1,9 +1,11 @@
1
- import {getLogger as getMainLogger, Logger} from 'log4js';
2
- import {isMainThread} from 'worker_threads';
3
- import {getWorkerLogger} from './logger-worker';
1
+ import type {Logger} from 'log4js';
2
+ // https://github.com/jestjs/jest/issues/11563
3
+ import log4js from 'log4js';
4
+ import {isMainThread} from 'node:worker_threads';
5
+ import {getWorkerLogger} from './logger-worker.js';
4
6
 
5
7
  const getLogger: typeof getWorkerLogger =
6
- isMainThread ? getMainLogger : getWorkerLogger;
8
+ isMainThread ? log4js.getLogger : getWorkerLogger;
7
9
 
8
10
  export const notFound: Logger = getLogger('notFound');
9
11
  export const retry: Logger = getLogger('retry');
package/src/options.ts CHANGED
@@ -1,22 +1,18 @@
1
- import got from 'got';
2
- import type {
3
- RetryFunction,
4
- RetryObject,
5
- TimeoutError
6
- } from 'got/dist/source/as-promise/types';
7
- import type {RequestError} from 'got/dist/source/core';
8
- import {createResource, ResourceEncoding, ResourceType} from './resource';
9
- import type {ProcessingLifeCycle, RequestOptions} from './life-cycle/types';
1
+ import type {RequestError, RetryFunction, RetryObject, TimeoutError} from 'got';
2
+ import got, {Options} from 'got';
3
+ import type {ResourceEncoding, ResourceType} from './resource.js';
4
+ import {createResource} from './resource.js';
5
+ import type {ProcessingLifeCycle, RequestOptions} from './life-cycle/types.js';
10
6
  // noinspection ES6PreferShortImport
11
- import {beforeRetryHook} from './life-cycle/download-resource';
12
- import {error} from './logger/logger';
7
+ import {beforeRetryHook} from './life-cycle/download-resource.js';
8
+ import {error} from './logger/logger.js';
13
9
  // noinspection ES6PreferShortImport
14
- import {adjust} from './downloader/adjust-concurrency';
15
- import {configureLogger} from './logger/config-logger';
16
- import type {DownloaderWithMeta} from './downloader/types';
17
- import {weakAssign} from './util';
18
- import type {SourceDefinition} from './sources';
19
- import type {CheerioOptionsInterface} from './types';
10
+ import {adjust} from './downloader/adjust-concurrency.js';
11
+ import {configureLogger} from './logger/config-logger.js';
12
+ import type {DownloaderWithMeta} from './downloader/types.js';
13
+ import {weakAssign} from './util.js';
14
+ import type {SourceDefinition} from './sources.js';
15
+ import type {CheerioOptionsInterface} from './types.js';
20
16
 
21
17
  /**
22
18
  * Extra options for custom life cycle
@@ -379,8 +375,8 @@ export function mergeOverrideOptions(
379
375
  overrideOptions.meta = Object.assign(opt.meta, overrideOptions.meta);
380
376
  }
381
377
  if (opt.req && overrideOptions.req) {
382
- overrideOptions.req =
383
- got.mergeOptions(opt.req, overrideOptions.req);
378
+ const options = got.defaults.options;
379
+ overrideOptions.req = new Options(opt.req, overrideOptions.req, options);
384
380
  }
385
381
  return checkDownloadOptions(Object.assign(opt, overrideOptions));
386
382
  }
package/src/resource.ts CHANGED
@@ -1,9 +1,14 @@
1
1
  import URI from 'urijs';
2
- import type {IncomingHttpHeaders} from 'http';
3
- import * as path from 'path';
4
- import {escapePath, isUrlHttp, orderUrlSearch, simpleHashString} from './util';
5
- import type {CheerioStatic} from './types';
6
- import {error as log} from './logger/logger';
2
+ import type {IncomingHttpHeaders} from 'node:http';
3
+ import * as path from 'node:path';
4
+ import {
5
+ escapePath,
6
+ isUrlHttp,
7
+ orderUrlSearch,
8
+ simpleHashString
9
+ } from './util.js';
10
+ import type {CheerioStatic} from './types.js';
11
+ import {error as log} from './logger/logger.js';
7
12
 
8
13
  export enum ResourceType {
9
14
  /**
package/src/sources.ts CHANGED
@@ -1,4 +1,4 @@
1
- import {ResourceType} from './resource';
1
+ import {ResourceType} from './resource.js';
2
2
 
3
3
  export interface SourceDefinition {
4
4
  selector: string;
package/src/util.ts CHANGED
@@ -1,5 +1,5 @@
1
- import {createHash} from 'crypto';
2
- import type {ResourceBody, ResourceEncoding} from './resource';
1
+ import {createHash} from 'node:crypto';
2
+ import type {ResourceBody, ResourceEncoding} from './resource.js';
3
3
 
4
4
  const forbiddenChar = /[:*?"<>|&]|%3A|%2A|%3F|%22|%3C|%3E|%7C|%26/ig;
5
5
 
@@ -39,14 +39,10 @@ export const toString = (body: ResourceBody, encoding: ResourceEncoding): string
39
39
  return stringValue;
40
40
  };
41
41
 
42
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
43
- export const importDefaultFromPath = (path: string): any => {
44
- // eslint-disable-next-line @typescript-eslint/no-var-requires,@typescript-eslint/no-explicit-any
45
- const mod: any = require(path);
46
- if (mod && mod.__esModule && mod.default) {
47
- return mod.default;
48
- }
49
- return mod;
42
+ export const importDefaultFromPath = <T>(path: string): Promise<T> => {
43
+ return import(path).then(mod => {
44
+ return mod.default || mod;
45
+ });
50
46
  };
51
47
 
52
48
  export const orderUrlSearch = (search: string): string => {
package/tsconfig.json CHANGED
@@ -1,6 +1,7 @@
1
1
  {
2
2
  "compilerOptions": {
3
- "module": "commonjs",
3
+ "module": "node16",
4
+ "moduleResolution": "node16",
4
5
  "target": "es2018",
5
6
  "sourceMap": true,
6
7
  "newLine": "lf",
@@ -8,7 +9,10 @@
8
9
  "declaration": true,
9
10
  "esModuleInterop": true,
10
11
  "removeComments": false,
11
- "strict": true
12
+ "strict": true,
13
+ "declarationMap": true,
14
+ "allowJs": true,
15
+ "verbatimModuleSyntax": true
12
16
  },
13
17
  "include": [
14
18
  "src"