website-scrap-engine 0.8.6 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/README.md +245 -39
  2. package/lib/downloader/main.d.ts.map +1 -1
  3. package/lib/downloader/main.js +15 -16
  4. package/lib/downloader/main.js.map +1 -1
  5. package/lib/downloader/multi.d.ts.map +1 -1
  6. package/lib/downloader/multi.js +3 -4
  7. package/lib/downloader/multi.js.map +1 -1
  8. package/lib/downloader/pipeline-executor-impl.d.ts +6 -3
  9. package/lib/downloader/pipeline-executor-impl.d.ts.map +1 -1
  10. package/lib/downloader/pipeline-executor-impl.js +86 -2
  11. package/lib/downloader/pipeline-executor-impl.js.map +1 -1
  12. package/lib/downloader/single.d.ts.map +1 -1
  13. package/lib/downloader/single.js +4 -5
  14. package/lib/downloader/single.js.map +1 -1
  15. package/lib/downloader/worker-pool.d.ts.map +1 -1
  16. package/lib/downloader/worker-pool.js +12 -7
  17. package/lib/downloader/worker-pool.js.map +1 -1
  18. package/lib/downloader/worker-type.d.ts +2 -2
  19. package/lib/downloader/worker-type.d.ts.map +1 -1
  20. package/lib/downloader/worker.js +2 -4
  21. package/lib/downloader/worker.js.map +1 -1
  22. package/lib/index.d.ts +2 -0
  23. package/lib/index.d.ts.map +1 -1
  24. package/lib/index.js +1 -0
  25. package/lib/index.js.map +1 -1
  26. package/lib/life-cycle/adapters.d.ts +7 -1
  27. package/lib/life-cycle/adapters.d.ts.map +1 -1
  28. package/lib/life-cycle/adapters.js +6 -0
  29. package/lib/life-cycle/adapters.js.map +1 -1
  30. package/lib/life-cycle/default-life-cycle.d.ts.map +1 -1
  31. package/lib/life-cycle/default-life-cycle.js +3 -1
  32. package/lib/life-cycle/default-life-cycle.js.map +1 -1
  33. package/lib/life-cycle/default-status-listener.d.ts +4 -0
  34. package/lib/life-cycle/default-status-listener.d.ts.map +1 -0
  35. package/lib/life-cycle/default-status-listener.js +38 -0
  36. package/lib/life-cycle/default-status-listener.js.map +1 -0
  37. package/lib/life-cycle/download-resource.d.ts.map +1 -1
  38. package/lib/life-cycle/download-resource.js +13 -3
  39. package/lib/life-cycle/download-resource.js.map +1 -1
  40. package/lib/life-cycle/index.d.ts +1 -0
  41. package/lib/life-cycle/index.d.ts.map +1 -1
  42. package/lib/life-cycle/index.js +1 -0
  43. package/lib/life-cycle/index.js.map +1 -1
  44. package/lib/life-cycle/pipeline-executor.d.ts +10 -3
  45. package/lib/life-cycle/pipeline-executor.d.ts.map +1 -1
  46. package/lib/life-cycle/types.d.ts +56 -4
  47. package/lib/life-cycle/types.d.ts.map +1 -1
  48. package/lib/logger/default-logger.d.ts +3 -0
  49. package/lib/logger/default-logger.d.ts.map +1 -0
  50. package/lib/logger/default-logger.js +11 -0
  51. package/lib/logger/default-logger.js.map +1 -0
  52. package/lib/logger/log4js-adapter.d.ts +3 -0
  53. package/lib/logger/log4js-adapter.d.ts.map +1 -0
  54. package/lib/logger/log4js-adapter.js +143 -0
  55. package/lib/logger/log4js-adapter.js.map +1 -0
  56. package/lib/logger/logger-worker.d.ts +3 -4
  57. package/lib/logger/logger-worker.d.ts.map +1 -1
  58. package/lib/logger/logger-worker.js +21 -20
  59. package/lib/logger/logger-worker.js.map +1 -1
  60. package/lib/logger/logger.d.ts +13 -11
  61. package/lib/logger/logger.d.ts.map +1 -1
  62. package/lib/logger/logger.js +32 -14
  63. package/lib/logger/logger.js.map +1 -1
  64. package/lib/logger/types.d.ts +23 -0
  65. package/lib/logger/types.d.ts.map +1 -0
  66. package/lib/logger/types.js +2 -0
  67. package/lib/logger/types.js.map +1 -0
  68. package/lib/options.d.ts +5 -4
  69. package/lib/options.d.ts.map +1 -1
  70. package/lib/options.js +6 -4
  71. package/lib/options.js.map +1 -1
  72. package/package.json +10 -8
  73. package/src/downloader/main.ts +15 -14
  74. package/src/downloader/multi.ts +3 -5
  75. package/src/downloader/pipeline-executor-impl.ts +98 -2
  76. package/src/downloader/single.ts +4 -5
  77. package/src/downloader/worker-pool.ts +12 -6
  78. package/src/downloader/worker-type.ts +2 -2
  79. package/src/downloader/worker.ts +2 -7
  80. package/src/index.ts +2 -0
  81. package/src/life-cycle/adapters.ts +13 -0
  82. package/src/life-cycle/default-life-cycle.ts +3 -1
  83. package/src/life-cycle/default-status-listener.ts +40 -0
  84. package/src/life-cycle/download-resource.ts +13 -4
  85. package/src/life-cycle/index.ts +1 -0
  86. package/src/life-cycle/pipeline-executor.ts +16 -2
  87. package/src/life-cycle/types.ts +79 -3
  88. package/src/logger/default-logger.ts +12 -0
  89. package/src/logger/log4js-adapter.ts +147 -0
  90. package/src/logger/logger-worker.ts +24 -23
  91. package/src/logger/logger.ts +36 -16
  92. package/src/logger/types.ts +35 -0
  93. package/src/options.ts +11 -7
  94. package/lib/logger/config-logger.d.ts +0 -3
  95. package/lib/logger/config-logger.d.ts.map +0 -1
  96. package/lib/logger/config-logger.js +0 -92
  97. package/lib/logger/config-logger.js.map +0 -1
  98. package/src/logger/config-logger.ts +0 -95
package/package.json CHANGED
@@ -1,9 +1,9 @@
1
1
  {
2
2
  "name": "website-scrap-engine",
3
- "version": "0.8.6",
3
+ "version": "0.9.0",
4
4
  "description": "Configurable website scraper in typescript",
5
- "main": "lib",
6
- "types": "lib",
5
+ "main": "lib/index.js",
6
+ "types": "lib/index.d.ts",
7
7
  "type": "module",
8
8
  "engines": {
9
9
  "node": ">=18.17.0"
@@ -17,13 +17,12 @@
17
17
  "copy": "node copy-src.js",
18
18
  "prepack": "npm run clean && npm run build",
19
19
  "postshrinkwrap": "node package-lock-resolved.js",
20
- "postinstall": "npm run postshrinkwrap && node -e \"require('fs').copyFileSync('fake-undici.d.ts', 'node_modules/cheerio/node_modules/undici/index.d.ts')\""
20
+ "postinstall": "node -e \"var f='fake-undici.d.ts',t='node_modules/cheerio/node_modules/undici/index.d.ts';try{require('fs').copyFileSync(f,t)}catch(e){}\""
21
21
  },
22
22
  "dependencies": {
23
23
  "cheerio": "^1.2.0",
24
24
  "css-url-parser": "^1.1.4",
25
25
  "got": "^13.0.0",
26
- "log4js": "^6.9.1",
27
26
  "mkdirp": "^3.0.1",
28
27
  "p-queue": "^8.1.1",
29
28
  "srcset": "^5.0.3",
@@ -33,13 +32,16 @@
33
32
  "@jest/globals": "^30.1.1",
34
33
  "@types/node": "^25.5.0",
35
34
  "@types/urijs": "^1.19.26",
36
- "@typescript-eslint/eslint-plugin": "^8.57.1",
37
- "@typescript-eslint/parser": "^8.57.1",
35
+ "@typescript-eslint/eslint-plugin": "^8.58.0",
36
+ "@typescript-eslint/parser": "^8.58.0",
38
37
  "eslint": "^9.39.2",
39
38
  "jest": "^30.3.0",
40
- "ts-jest": "^29.4.6",
39
+ "ts-jest": "^29.4.9",
41
40
  "typescript": "^5.9.3"
42
41
  },
42
+ "optionalDependencies": {
43
+ "log4js": "^6.9.1"
44
+ },
43
45
  "overrides": {
44
46
  "undici": "npm:@favware/skip-dependency@1.2.2"
45
47
  },
package/src/downloader/main.ts CHANGED
@@ -1,14 +1,16 @@
1
1
  import PQueue from 'p-queue';
2
- import type {HTTPError} from 'got';
3
2
  import URI from 'urijs';
4
3
  import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
5
4
  import {mergeOverrideOptions} from '../options.js';
6
5
  import type {RawResource, Resource} from '../resource.js';
7
6
  import {normalizeResource, ResourceType} from '../resource.js';
8
- import {error, notFound, skip} from '../logger/logger.js';
7
+ import {skip} from '../logger/logger.js';
8
+ import {setLogger} from '../logger/logger.js';
9
+ import {createDefaultLogger} from '../logger/default-logger.js';
9
10
  import {importDefaultFromPath} from '../util.js';
10
11
  import type {DownloaderStats, DownloaderWithMeta} from './types.js';
11
12
  import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
13
+ import type {InitSubmitFunc} from '../life-cycle/types.js';
12
14
 
13
15
  export abstract class AbstractDownloader implements DownloaderWithMeta {
14
16
  readonly queue: PQueue;
@@ -41,7 +43,7 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
41
43
  // https://github.com/website-local/website-scrap-engine/issues/1113
42
44
  this.queue.concurrency = options.concurrency;
43
45
  this._pipeline = new PipelineExecutorImpl(options, options.req, options);
44
- options.configureLogger(options.localRoot, options.logSubDir || '');
46
+ setLogger((options.createLogger ?? createDefaultLogger)(options));
45
47
  return this._internalInit(options).then(() => {
46
48
  this._isInit = true;
47
49
  });
@@ -89,9 +91,12 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
89
91
  await this._initOptions;
90
92
  }
91
93
  const pipeline = this.pipeline;
92
- await pipeline.init(pipeline, this);
94
+ const submit: InitSubmitFunc = (url: string) => {
95
+ urlArr.push(url);
96
+ };
97
+ await pipeline.init(pipeline, this, submit);
93
98
  // noinspection DuplicatedCode
94
- for (let i = 0, l = urlArr.length; i < l; i++) {
99
+ for (let i = 0; i < urlArr.length; i++) {
95
100
  let url: string | void = urlArr[i];
96
101
  url = await pipeline.linkRedirect(url, null, null);
97
102
  if (!url) continue;
@@ -114,6 +119,7 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
114
119
  // noinspection DuplicatedCode
115
120
  if (res.depth > this.options.maxDepth) {
116
121
  skip.info('skipped max depth', res.url, res.refUrl, res.depth);
122
+ this.pipeline.notifyStatusChange(res, 'dispose');
117
123
  return false;
118
124
  }
119
125
  let url: string;
@@ -146,15 +152,10 @@ export abstract class AbstractDownloader implements DownloaderWithMeta {
146
152
  }
147
153
 
148
154
  handleError(err: Error | unknown | null, cause: string, resource: RawResource): void {
149
- // force cast in case of typescript 4.4
150
- if (err && (err as {name?: string}).name === 'HTTPError' &&
151
- (err as HTTPError)?.response?.statusCode === 404) {
152
- notFound.error(resource.url, resource.downloadLink, resource.refUrl);
153
- } else if (err) {
154
- error.error(cause, resource.url, resource.downloadLink, resource.refUrl, err);
155
- } else {
156
- error.error(cause, resource.url, resource.downloadLink, resource.refUrl);
157
- }
155
+ resource.meta = resource.meta || {};
156
+ resource.meta['error'] = err;
157
+ resource.meta['errorCause'] = cause;
158
+ this.pipeline.notifyStatusChange(resource, 'error');
158
159
  }
159
160
 
160
161
 
package/src/downloader/multi.ts CHANGED
@@ -5,7 +5,6 @@ import type {RawResource, Resource} from '../resource.js';
5
5
  import type {DownloadWorkerMessage} from './types.js';
6
6
  import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
7
7
  import type {DownloadResource} from '../life-cycle/types.js';
8
- import {skip} from '../logger/logger.js';
9
8
  import {AbstractDownloader} from './main.js';
10
9
 
11
10
  export interface MultiThreadDownloaderOptions extends StaticDownloadOptions {
@@ -52,7 +51,7 @@ export class MultiThreadDownloader extends AbstractDownloader {
52
51
  if (this.options.initialUrl) {
53
52
  return this.addInitialResource(this.options.initialUrl);
54
53
  } else {
55
- return this._initOptions.then(() => this.pipeline.init(this.pipeline, this));
54
+ return this.addInitialResource([]);
56
55
  }
57
56
  }
58
57
 
@@ -68,7 +67,7 @@ export class MultiThreadDownloader extends AbstractDownloader {
68
67
  try {
69
68
  r = await this.pipeline!.download(res);
70
69
  if (!r) {
71
- skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
70
+ await this.pipeline.notifyStatusChange(res, 'download');
72
71
  return;
73
72
  }
74
73
  } catch (e) {
@@ -94,8 +93,7 @@ export class MultiThreadDownloader extends AbstractDownloader {
94
93
  }
95
94
  this.downloadedUrl.add(res.url);
96
95
  if (!msg) {
97
- skip.info('discarded in post-processing',
98
- res.url, res.rawUrl, res.refUrl);
96
+ await this.pipeline.notifyStatusChange(res, 'processAfterDownload');
99
97
  return;
100
98
  }
101
99
  if (msg.error) {
package/src/downloader/pipeline-executor-impl.ts CHANGED
@@ -1,14 +1,22 @@
1
+ import path from 'node:path';
2
+ import {existsSync, statSync} from 'node:fs';
3
+ import type {Stats} from 'node:fs';
1
4
  import type {StaticDownloadOptions} from '../options.js';
2
5
  import type {
3
6
  CreateResourceArgument,
7
+ RawResource,
4
8
  Resource,
5
9
  ResourceEncoding,
6
10
  ResourceType
7
11
  } from '../resource.js';
8
12
  import type {
9
13
  DownloadResource,
14
+ ExistingResourceAction,
15
+ ExistingResourceStage,
16
+ InitSubmitFunc,
10
17
  ProcessingLifeCycle,
11
18
  RequestOptions,
19
+ ResourceStatus,
12
20
  SubmitResourceFunc
13
21
  } from '../life-cycle/types.js';
14
22
  // noinspection ES6PreferShortImport
@@ -28,11 +36,12 @@ export class PipelineExecutorImpl implements PipelineExecutor {
28
36
 
29
37
  async init(
30
38
  pipeline: PipelineExecutor,
31
- downloader?: DownloaderWithMeta
39
+ downloader?: DownloaderWithMeta,
40
+ submit?: InitSubmitFunc
32
41
  ): Promise<void> {
33
42
  if (!this.lifeCycle.init) return;
34
43
  for (const init of this.lifeCycle.init) {
35
- await init(pipeline, downloader);
44
+ await init(pipeline, downloader, submit);
36
45
  }
37
46
  }
38
47
 
@@ -156,6 +165,22 @@ export class PipelineExecutorImpl implements PipelineExecutor {
156
165
  if (!options) {
157
166
  options = this.options;
158
167
  }
168
+ if (this.lifeCycle.existingResource) {
169
+ const action = this._checkExistingResource(res, 'download');
170
+ if (action === 'skip') {
171
+ res.shouldBeDiscardedFromDownload = true;
172
+ return undefined;
173
+ }
174
+ if (action === 'ifModifiedSince') {
175
+ const mtime = this._getExistingFileMtime(res);
176
+ if (mtime) {
177
+ requestOptions = Object.assign({}, requestOptions);
178
+ requestOptions.headers = Object.assign({}, requestOptions.headers, {
179
+ 'if-modified-since': mtime
180
+ });
181
+ }
182
+ }
183
+ }
159
184
  let downloadedResource: DownloadResource | Resource | void = res;
160
185
  for (const download of this.lifeCycle.download) {
161
186
  if ((downloadedResource = await download(
@@ -204,6 +229,29 @@ export class PipelineExecutorImpl implements PipelineExecutor {
204
229
  if (!options) {
205
230
  options = this.options;
206
231
  }
232
+ if (this.lifeCycle.existingResource) {
233
+ const action = this._checkExistingResource(res, 'saveToDisk');
234
+ if (action === 'skip' || action === 'skipSave') {
235
+ return undefined;
236
+ }
237
+ if (action === 'ifModifiedSince') {
238
+ const remoteLastMod = res.meta?.headers?.['last-modified'];
239
+ if (remoteLastMod) {
240
+ const localPath = path.join(
241
+ res.localRoot ?? this.options.localRoot,
242
+ decodeURI(res.savePath)
243
+ );
244
+ try {
245
+ const localMtime = statSync(localPath).mtime;
246
+ if (new Date(remoteLastMod as string) <= localMtime) {
247
+ return undefined;
248
+ }
249
+ } catch {
250
+ // file removed between check and stat, proceed with save
251
+ }
252
+ }
253
+ }
254
+ }
207
255
  let downloadedResource: DownloadResource | void = res;
208
256
  for (const saveToDisk of this.lifeCycle.saveToDisk) {
209
257
  if ((downloadedResource = await saveToDisk(
@@ -229,4 +277,52 @@ export class PipelineExecutorImpl implements PipelineExecutor {
229
277
  }
230
278
  }
231
279
 
280
+ async notifyStatusChange(
281
+ res: Resource | RawResource,
282
+ status: ResourceStatus
283
+ ): Promise<void> {
284
+ if (!this.lifeCycle.statusChange?.length) return;
285
+ for (const listener of this.lifeCycle.statusChange) {
286
+ try {
287
+ const r = listener(res, status, this.options, this);
288
+ if (r) await r;
289
+ } catch {
290
+ // swallow
291
+ }
292
+ }
293
+ }
294
+
295
+ private _checkExistingResource(
296
+ res: Resource, stage: ExistingResourceStage
297
+ ): ExistingResourceAction | void {
298
+ const localPath = path.join(
299
+ res.localRoot ?? this.options.localRoot,
300
+ decodeURI(res.savePath)
301
+ );
302
+ if (!existsSync(localPath)) return undefined;
303
+ let stat: Stats;
304
+ try {
305
+ stat = statSync(localPath);
306
+ } catch {
307
+ // TOCTOU: file deleted between existsSync and statSync
308
+ return undefined;
309
+ }
310
+ if (!stat.isFile()) return undefined;
311
+ return this.lifeCycle.existingResource!({
312
+ res, stage, localPath, stat, options: this.options
313
+ });
314
+ }
315
+
316
+ private _getExistingFileMtime(res: Resource): string | undefined {
317
+ const localPath = path.join(
318
+ res.localRoot ?? this.options.localRoot,
319
+ decodeURI(res.savePath)
320
+ );
321
+ try {
322
+ return statSync(localPath).mtime.toUTCString();
323
+ } catch {
324
+ return undefined;
325
+ }
326
+ }
327
+
232
328
  }
package/src/downloader/single.ts CHANGED
@@ -1,7 +1,6 @@
1
1
  import {AbstractDownloader} from './main.js';
2
2
  import type {Resource} from '../resource.js';
3
3
  import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
4
- import {skip} from '../logger/logger.js';
5
4
  import type {
6
5
  DownloadResource,
7
6
  SubmitResourceFunc
@@ -20,7 +19,7 @@ export class SingleThreadDownloader extends AbstractDownloader {
20
19
  if (options.initialUrl) {
21
20
  return this.addInitialResource(options.initialUrl);
22
21
  } else {
23
- return this.pipeline.init(this.pipeline, this);
22
+ return this.addInitialResource([]);
24
23
  }
25
24
  }
26
25
 
@@ -29,7 +28,7 @@ export class SingleThreadDownloader extends AbstractDownloader {
29
28
  try {
30
29
  r = await this.pipeline.download(res);
31
30
  if (!r) {
32
- skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
31
+ await this.pipeline.notifyStatusChange(res, 'download');
33
32
  return;
34
33
  }
35
34
  } catch (e) {
@@ -51,9 +50,9 @@ export class SingleThreadDownloader extends AbstractDownloader {
51
50
  const processedResource: DownloadResource | void =
52
51
  await this.pipeline.processAfterDownload(r, submit);
53
52
  if (!processedResource) {
54
- skip.warn('skipped downloaded resource', r.url, r.refUrl);
53
+ await this.pipeline.notifyStatusChange(r, 'processAfterDownload');
55
54
  } else if (await this.pipeline.saveToDisk(processedResource)) {
56
- skip.warn('downloaded resource not saved', r.url, r.refUrl);
55
+ await this.pipeline.notifyStatusChange(r, 'saveToDisk');
57
56
  }
58
57
  if (processedResource && processedResource.redirectedUrl &&
59
58
  processedResource.redirectedUrl !== processedResource.url) {
package/src/downloader/worker-pool.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import type {MessagePort, WorkerOptions} from 'node:worker_threads';
2
2
  import {Worker} from 'node:worker_threads';
3
3
  import type {URL} from 'node:url';
4
- import * as logger from '../logger/logger.js';
4
+ import {error as errorLogger, getLogger} from '../logger/logger.js';
5
5
  import type {LogWorkerMessage} from './worker-type.js';
6
6
  import type {
7
7
  PendingPromise,
@@ -63,7 +63,7 @@ export class WorkerPool<T = unknown, R extends WorkerMessage = WorkerMessage> {
63
63
  }
64
64
 
65
65
  workerOnError(info: WorkerInfo, err: Error): void {
66
- logger.error.error('worker error', info.id, err);
66
+ errorLogger.error('worker error', info.id, err);
67
67
  }
68
68
 
69
69
  onMessage(info: WorkerInfo, message: WorkerMessage): void {
@@ -76,14 +76,20 @@ export class WorkerPool<T = unknown, R extends WorkerMessage = WorkerMessage> {
76
76
 
77
77
  takeLog(info: WorkerInfo, message: LogWorkerMessage): void {
78
78
  if (!message?.body) {
79
- logger.error.warn('Invalid formatted log', info.id);
79
+ errorLogger.warn('Invalid formatted log', info.id);
80
80
  return;
81
81
  }
82
- const content = message?.body?.content;
82
+ const level = message.body.level;
83
+ const logType = message.body.logType;
84
+ if (!level || !logType) {
85
+ return;
86
+ }
87
+ const log = getLogger();
88
+ const content = message.body.content;
83
89
  if (content?.length) {
84
- logger?.[message.body.logger]?.[message.body.level]?.(info.id, ...content);
90
+ log[level](logType, info.id, ...content);
85
91
  } else {
86
- logger?.[message.body.logger]?.[message.body.level]?.(info.id);
92
+ log[level](logType, info.id);
87
93
  }
88
94
  }
89
95
 
package/src/downloader/worker-type.ts CHANGED
@@ -1,9 +1,9 @@
1
1
  import type {logLevels} from '../logger/logger-worker.js';
2
- import type * as logger from '../logger/logger.js';
2
+ import type {LogType} from '../logger/types.js';
3
3
  import type {WorkerMessage, WorkerMessageType} from './types.js';
4
4
 
5
5
  export interface WorkerLog<T = unknown> {
6
- logger: keyof typeof logger;
6
+ logType: LogType;
7
7
  level: typeof logLevels[number];
8
8
  content: T[];
9
9
  }
package/src/downloader/worker.ts CHANGED
@@ -7,7 +7,6 @@ import type {
7
7
  } from '../life-cycle/types.js';
8
8
  import type {RawResource, Resource} from '../resource.js';
9
9
  import {normalizeResource, prepareResourceForClone} from '../resource.js';
10
- import {skip} from '../logger/logger.js';
11
10
  import {importDefaultFromPath} from '../util.js';
12
11
  import type {DownloadWorkerMessage} from './types.js';
13
12
  import {WorkerMessageType} from './types.js';
@@ -29,8 +28,6 @@ const asyncPipeline = asyncOptions.then(options => {
29
28
  const pipeline: PipelineExecutor =
30
29
  new PipelineExecutorImpl(options, options.req, options);
31
30
 
32
- options.configureLogger(options.localRoot, options.logSubDir || '');
33
-
34
31
  const init = pipeline.init(pipeline);
35
32
  if (init && (init as Promise<void>).then) {
36
33
  return init.then(() => pipeline);
@@ -58,11 +55,9 @@ parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) =
58
55
  const processedResource: DownloadResource | void =
59
56
  await pipeline.processAfterDownload(downloadResource, submit);
60
57
  if (!processedResource) {
61
- skip.warn('skipped downloaded resource',
62
- downloadResource.url, downloadResource.refUrl);
58
+ await pipeline.notifyStatusChange(downloadResource, 'processAfterDownload');
63
59
  } else if (await pipeline.saveToDisk(processedResource)) {
64
- skip.warn('downloaded resource not saved',
65
- downloadResource.url, downloadResource.refUrl);
60
+ await pipeline.notifyStatusChange(downloadResource, 'saveToDisk');
66
61
  }
67
62
 
68
63
  if (processedResource && processedResource.redirectedUrl &&
package/src/index.ts CHANGED
@@ -1,4 +1,6 @@
1
1
  export * as logger from './logger/logger.js';
2
+ export type {Logger, LogType, CategoryLogger} from './logger/types.js';
3
+ export {createDefaultLogger} from './logger/default-logger.js';
2
4
  export * as downloader from './downloader/index.js';
3
5
  export * as lifeCycle from './life-cycle/index.js';
4
6
  export * as io from './io.js';
package/src/life-cycle/adapters.ts CHANGED
@@ -4,6 +4,7 @@ import {ResourceType} from '../resource.js';
4
4
  import type {
5
5
  AsyncResult,
6
6
  DownloadResource,
7
+ ExistingResourceFunc,
7
8
  LinkRedirectFunc,
8
9
  ProcessResourceAfterDownloadFunc,
9
10
  ProcessResourceBeforeDownloadFunc,
@@ -132,3 +133,15 @@ export const processHtmlAsync = (fn: AsyncHtmlProcessFunc): ProcessResourceAfter
132
133
  return res;
133
134
  };
134
135
 
136
+ /** Skip download if local file already exists */
137
+ export const skipExisting = (): ExistingResourceFunc =>
138
+ ({stage}) => stage === 'download' ? 'skip' : 'overwrite';
139
+
140
+ /** Re-download only if remote is newer (If-Modified-Since) */
141
+ export const preferNewerRemote = (): ExistingResourceFunc =>
142
+ () => 'ifModifiedSince';
143
+
144
+ /** Always overwrite (current default behavior, explicit) */
145
+ export const alwaysOverwrite = (): ExistingResourceFunc =>
146
+ () => 'overwrite';
147
+
package/src/life-cycle/default-life-cycle.ts CHANGED
@@ -13,6 +13,7 @@ import {saveResourceToDisk} from './save-resource-to-disk.js';
13
13
  import {processRedirectedUrl} from './adapters.js';
14
14
  import {downloadStreamingResource} from './download-streaming-resource.js';
15
15
  import {readOrCopyLocalResource} from './read-or-copy-local-resource.js';
16
+ import {defaultStatusListener} from './default-status-listener.js';
16
17
 
17
18
  /**
18
19
  * Get a copy of default life cycle
@@ -37,6 +38,7 @@ export const defaultLifeCycle = (): ProcessingLifeCycle => ({
37
38
  processSiteMap
38
39
  ],
39
40
  saveToDisk: [saveHtmlToDisk, saveResourceToDisk],
40
- dispose: []
41
+ dispose: [],
42
+ statusChange: [defaultStatusListener]
41
43
  });
42
44
 
package/src/life-cycle/default-status-listener.ts ADDED
@@ -0,0 +1,40 @@
1
+ import type {RawResource, Resource} from '../resource.js';
2
+ import type {ResourceStatus} from './types.js';
3
+ import {error, notFound, skip} from '../logger/logger.js';
4
+
5
+ export const defaultStatusListener = (
6
+ res: Resource | RawResource,
7
+ status: ResourceStatus
8
+ ): void => {
9
+ switch (status) {
10
+ case 'processBeforeDownload':
11
+ break;
12
+ case 'createResource':
13
+ break;
14
+ case 'download':
15
+ skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
16
+ break;
17
+ case 'processAfterDownload':
18
+ skip.warn('skipped downloaded resource', res.url, res.refUrl);
19
+ break;
20
+ case 'saveToDisk':
21
+ skip.warn('downloaded resource not saved', res.url, res.refUrl);
22
+ break;
23
+ case 'error':
24
+ if (res.meta?.['errorCause']) {
25
+ const err = res.meta['error'];
26
+ const cause = res.meta['errorCause'] as string;
27
+ if (err && (err as {name?: string}).name === 'HTTPError' &&
28
+ (err as {response?: {statusCode?: number}})?.response?.statusCode === 404) {
29
+ notFound.error(res.url, res.downloadLink, res.refUrl);
30
+ } else if (err) {
31
+ error.error(cause, res.url, res.downloadLink, res.refUrl, err);
32
+ } else {
33
+ error.error(cause, res.url, res.downloadLink, res.refUrl);
34
+ }
35
+ }
36
+ break;
37
+ case 'dispose':
38
+ break;
39
+ }
40
+ };
package/src/life-cycle/download-resource.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import type {BeforeRetryHook, OptionsInit, RequestError, Response} from 'got';
2
- import got, {TimeoutError} from 'got';
2
+ import got, {HTTPError, TimeoutError} from 'got';
3
3
  import type {DownloadResource, RequestOptions} from './types.js';
4
4
  import type {Resource} from '../resource.js';
5
5
  import {generateSavePath, ResourceType} from '../resource.js';
@@ -117,8 +117,16 @@ export async function requestForResource(
117
117
  }
118
118
  logger.request.info(res.url, downloadLink, res.refUrl,
119
119
  res.encoding, res.type);
120
- const response: Response<string | Buffer> | void =
121
- await getRetry(downloadLink, reqOptions);
120
+ let response: Response<string | Buffer> | void;
121
+ try {
122
+ response = await getRetry(downloadLink, reqOptions);
123
+ } catch (e) {
124
+ if (e instanceof HTTPError &&
125
+ (e as HTTPError).response.statusCode === 304) {
126
+ return undefined;
127
+ }
128
+ throw e;
129
+ }
122
130
  if (!response) {
123
131
  const resource = res as Resource;
124
132
  delete resource.downloadStartTimestamp;
@@ -195,7 +203,8 @@ export async function downloadResource(
195
203
  }
196
204
  }
197
205
  if (nonHtml) {
198
- logger.error.warn('Detected non-html content type',
206
+ logger.error.warn('Detected non-html content type for resource typed as',
207
+ downloadedResource.type,
199
208
  downloadedResource.downloadLink, downloadedResource.rawUrl, contentType);
200
209
  }
201
210
  }
package/src/life-cycle/index.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  export * as adapter from './adapters.js';
2
2
  export {defaultLifeCycle} from './default-life-cycle.js';
3
+ export {defaultStatusListener} from './default-status-listener.js';
3
4
  export {detectResourceType} from './detect-resource-type.js';
4
5
  export {
5
6
  beforeRetryHook, getRetry, requestForResource, downloadResource
package/src/life-cycle/pipeline-executor.ts CHANGED
@@ -1,9 +1,11 @@
1
- import type {Resource, ResourceEncoding, ResourceType} from '../resource.js';
1
+ import type {RawResource, Resource, ResourceEncoding, ResourceType} from '../resource.js';
2
2
  import type {StaticDownloadOptions} from '../options.js';
3
3
  import type {
4
4
  AsyncResult,
5
5
  DownloadResource,
6
+ InitSubmitFunc,
6
7
  RequestOptions,
8
+ ResourceStatus,
7
9
  SubmitResourceFunc
8
10
  } from './types.js';
9
11
  import type {Cheerio} from '../types.js';
@@ -16,7 +18,8 @@ export interface PipelineExecutor {
16
18
  */
17
19
  init(
18
20
  pipeline: PipelineExecutor,
19
- downloader?: DownloaderWithMeta
21
+ downloader?: DownloaderWithMeta,
22
+ submit?: InitSubmitFunc
20
23
  ): AsyncResult<void>;
21
24
 
22
25
  /**
@@ -99,4 +102,15 @@ export interface PipelineExecutor {
99
102
  workerExitCode?: number
100
103
  ): AsyncResult<void>;
101
104
 
105
+ /**
106
+ * Notify status change listeners.
107
+ *
108
+ * All listeners always run (void return does not short-circuit),
109
+ * and thrown errors are swallowed.
110
+ */
111
+ notifyStatusChange(
112
+ res: Resource | RawResource,
113
+ status: ResourceStatus
114
+ ): AsyncResult<void>;
115
+
102
116
  }