rezo 1.0.67 → 1.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/adapters/index.cjs +6 -6
  2. package/dist/cache/index.cjs +9 -9
  3. package/dist/crawler/index.cjs +40 -40
  4. package/dist/entries/crawler.cjs +4 -4
  5. package/dist/index.cjs +27 -27
  6. package/dist/internal/agents/index.cjs +10 -10
  7. package/dist/proxy/index.cjs +4 -4
  8. package/dist/queue/index.cjs +8 -8
  9. package/dist/responses/universal/index.cjs +11 -11
  10. package/dist/wget/asset-extractor.cjs +556 -0
  11. package/dist/wget/asset-extractor.js +553 -0
  12. package/dist/wget/asset-organizer.cjs +230 -0
  13. package/dist/wget/asset-organizer.js +227 -0
  14. package/dist/wget/download-cache.cjs +221 -0
  15. package/dist/wget/download-cache.js +218 -0
  16. package/dist/wget/downloader.cjs +607 -0
  17. package/dist/wget/downloader.js +604 -0
  18. package/dist/wget/file-writer.cjs +349 -0
  19. package/dist/wget/file-writer.js +346 -0
  20. package/dist/wget/filter-lists.cjs +1330 -0
  21. package/dist/wget/filter-lists.js +1330 -0
  22. package/dist/wget/index.cjs +633 -0
  23. package/dist/wget/index.d.ts +8486 -0
  24. package/dist/wget/index.js +614 -0
  25. package/dist/wget/link-converter.cjs +297 -0
  26. package/dist/wget/link-converter.js +294 -0
  27. package/dist/wget/progress.cjs +271 -0
  28. package/dist/wget/progress.js +266 -0
  29. package/dist/wget/resume.cjs +166 -0
  30. package/dist/wget/resume.js +163 -0
  31. package/dist/wget/robots.cjs +303 -0
  32. package/dist/wget/robots.js +300 -0
  33. package/dist/wget/types.cjs +200 -0
  34. package/dist/wget/types.js +197 -0
  35. package/dist/wget/url-filter.cjs +351 -0
  36. package/dist/wget/url-filter.js +348 -0
  37. package/package.json +6 -1
@@ -0,0 +1,604 @@
1
+ import { normalize, relative, join } from "node:path";
2
+ import { RezoQueue } from '../queue/queue.js';
3
+ import { RezoHeaders } from '../utils/headers.js';
4
+ import { WgetError as WgetErrorClass } from './types.js';
5
+ import { AssetExtractor } from './asset-extractor.js';
6
+ import { UrlFilter } from './url-filter.js';
7
+ import { FileWriter } from './file-writer.js';
8
+ import { RobotsHandler } from './robots.js';
9
+ import { ResumeHandler } from './resume.js';
10
+ import { ProgressReporter, parseSize } from './progress.js';
11
+ import { LinkConverter } from './link-converter.js';
12
+ import { DownloadCache } from './download-cache.js';
13
+ const DEFAULT_OPTIONS = { // Baseline option values; the Downloader constructor spreads caller-supplied options over these.
14
+ outputDir: ".", // base directory; downloaded file paths are made relative to it
15
+ depth: 5, // asset extraction stops once an item's depth reaches this value
16
+ timeout: 30, // forwarded as `timeout` to http.get — NOTE(review): presumably seconds like wait/waitRetry; confirm the unit the HTTP client expects
17
+ tries: 3, // maximum attempts per URL for retryable, non-proxy errors (see handleError)
18
+ waitRetry: 10, // seconds (multiplied by 1000 in handleError); upper bound on the retry backoff delay
19
+ wait: 0, // seconds (multiplied by 1000 in applyRateLimit) to pause before each download; 0 disables the pause
20
+ concurrency: 1, // only echoed in debug output in this file; the RezoQueue itself is created with unbounded concurrency
21
+ robots: true, // when not false (and noRobots is unset) robots.txt is fetched and consulted before a URL is queued
22
+ userAgent: "Rezo-Wget/1.0", // not read directly in this file — presumably consumed by the collaborators that receive the options; verify
23
+ maxRedirects: 20, // forwarded as `maxRedirects` to http.get
24
+ retryConnrefused: true // not read directly in this file — presumably consulted by the error/retry classification elsewhere; verify
25
+ };
26
+
27
/**
 * wget-style downloader: fetches one or more start URLs, optionally follows
 * the assets/links found in the downloaded documents, writes everything
 * through a FileWriter and reports progress through a small event emitter
 * (`on` / `off` / `emit`).
 *
 * Events emitted by this class: "start", "progress", "complete", "assets",
 * "skip", "error", "retry", "robots", "convert" and "finish".
 */
export class Downloader {
  options;
  http;
  queue;
  assetExtractor;
  urlFilter;
  fileWriter;
  robots;
  resumeHandler;
  progressReporter;
  linkConverter;
  /** URLs that finished (successfully or as a final failure). */
  visitedUrls = new Set();
  /** URLs that were accepted into the queue; entries are never removed. */
  queuedUrls = new Set();
  /** Normalized URL -> path of the written file, relative to outputDir. */
  urlMap = new Map();
  stats;
  aborted = false;
  /** Byte budget; Infinity means "no quota". */
  quotaBytes = Infinity;
  totalDownloaded = 0;
  cache = null;
  /** Event name -> Set of handler functions. */
  eventHandlers = new Map();

  /**
   * @param {object} [options] - Caller options, layered over DEFAULT_OPTIONS.
   * @param {object} http - HTTP client; this class only calls `http.get(url, config)`.
   */
  constructor(options = {}, http) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
    this.http = http;
    // The queue is unbounded here; the debug output attributes concurrency
    // limiting to the HTTP layer.
    this.queue = new RezoQueue({
      concurrency: Infinity
    });
    this.assetExtractor = new AssetExtractor();
    this.urlFilter = new UrlFilter(this.options);
    this.fileWriter = new FileWriter(this.options);
    this.robots = new RobotsHandler(this.options);
    this.resumeHandler = new ResumeHandler(this.options);
    this.progressReporter = new ProgressReporter(this.options);
    this.linkConverter = new LinkConverter(this.options);
    this.stats = {
      urlsDownloaded: 0,
      urlsFailed: 0,
      urlsSkipped: 0,
      bytesDownloaded: 0,
      filesWritten: 0,
      startTime: Date.now()
    };
    if (this.options.quota) {
      this.quotaBytes = parseSize(this.options.quota);
    }
  }

  /**
   * Registers a handler for an event.
   * @param {string} event
   * @param {Function} handler
   * @returns {this} for chaining
   */
  on(event, handler) {
    if (!this.eventHandlers.has(event)) {
      this.eventHandlers.set(event, new Set());
    }
    this.eventHandlers.get(event).add(handler);
    return this;
  }

  /**
   * Removes a previously registered handler; unknown events/handlers are ignored.
   * @param {string} event
   * @param {Function} handler
   * @returns {this} for chaining
   */
  off(event, handler) {
    const handlers = this.eventHandlers.get(event);
    if (handlers) {
      handlers.delete(handler);
    }
    return this;
  }

  /**
   * Synchronously invokes every handler registered for `event`.
   * A throwing handler is logged and does not stop the remaining handlers.
   * @param {string} event
   * @param {*} data
   */
  emit(event, data) {
    const handlers = this.eventHandlers.get(event);
    if (handlers) {
      // Iterate over a snapshot so handlers may call on()/off() while running.
      for (const handler of Array.from(handlers)) {
        try {
          handler(data);
        } catch (error) {
          console.error(`Error in ${event} handler:`, error);
        }
      }
    }
  }

  /**
   * Logs a debug line when `options.debug` is set and `options.quiet` is not.
   * @param {string} message
   */
  debug(message) {
    if (this.options.debug && !this.options.quiet) {
      console.log(`[DEBUG] ${message}`);
    }
  }

  /**
   * Downloads the given start URL(s) and everything reachable from them under
   * the configured options, then optionally converts links and saves the cache.
   * @param {string|string[]} urls
   * @returns {Promise<object>} the stats object (also sent with the "finish" event)
   */
  async download(urls) {
    const urlArray = Array.isArray(urls) ? urls : [urls];
    this.debug(`Starting wget download`);
    this.debug(`  URLs: ${urlArray.length}`);
    this.debug(`  Concurrency: ${this.options.concurrency} (handled by Rezo HTTP queue)`);
    this.debug(`  Recursive: ${this.options.recursive}`);
    this.debug(`  Depth: ${this.options.depth}`);
    this.debug(`  Page requisites: ${this.options.pageRequisites}`);
    this.debug(`  Convert links: ${this.options.convertLinks}`);
    this.debug(`  Output dir: ${this.options.outputDir}`);
    this.debug(`  Organize assets: ${this.options.organizeAssets}`);
    this.debug(`  Cache enabled: ${this.options.cache !== false}`);
    if (this.options.proxy) {
      const proxy = this.options.proxy;
      if (typeof proxy === "string") {
        this.debug(`  Proxy: ${proxy}`);
      } else {
        this.debug(`  Proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
      }
    }
    if (this.options.cache !== false) {
      // The cache is created from the output directory and the first start URL.
      const baseUrl = urlArray[0];
      this.cache = new DownloadCache(this.options.outputDir || ".", baseUrl);
      await this.cache.load();
      const cacheStats = this.cache.stats();
      this.debug(`  Cache loaded: ${cacheStats.entries} entries, ${cacheStats.totalBytes} bytes`);
    }
    for (const url of urlArray) {
      this.urlFilter.addStartUrl(url);
    }
    if (this.options.mirror) {
      this.debug(`Mirror mode enabled - setting recursive=true, depth=Infinity, timestamping=true`);
      // this.options is the same object handed to the collaborators in the
      // constructor, so these assignments are visible to them as well.
      this.options.recursive = true;
      this.options.depth = Infinity;
      this.options.timestamping = true;
    }
    for (const url of urlArray) {
      this.fileWriter.markAsEntry(url);
      await this.addToQueue(url, 0, null, "document");
    }
    await this.processQueue();
    if (this.options.convertLinks && this.urlMap.size > 0) {
      this.debug(`Converting links in ${this.urlMap.size} downloaded files`);
      await this.convertLinks();
    }
    this.stats.endTime = Date.now();
    this.stats.duration = this.stats.endTime - this.stats.startTime;
    this.stats.urlMap = this.urlMap;
    this.debug(`Download complete in ${this.stats.duration}ms`);
    this.debug(`  Downloaded: ${this.stats.urlsDownloaded}`);
    this.debug(`  Failed: ${this.stats.urlsFailed}`);
    this.debug(`  Skipped: ${this.stats.urlsSkipped}`);
    this.debug(`  Bytes: ${this.stats.bytesDownloaded}`);
    if (this.cache) {
      await this.cache.save();
      this.debug(`  Cache saved: ${this.cache.filePath}`);
    }
    this.emit("finish", {
      stats: this.stats,
      success: this.stats.urlsFailed === 0
    });
    return this.stats;
  }

  /**
   * Validates a URL and, if it passes every gate (syntax, de-duplication,
   * quota, URL filter, robots.txt), schedules it for download. Each rejection
   * emits a "skip" event; only filter and robots rejections are counted in
   * `stats.urlsSkipped`.
   * @param {string} url
   * @param {number} depth - link depth of this URL (0 for start URLs)
   * @param {string|null} parentUrl - page that referenced the URL
   * @param {string} assetType - e.g. "document"
   */
  async addToQueue(url, depth, parentUrl, assetType) {
    const normalizedUrl = this.normalizeUrl(url);
    if (!normalizedUrl) {
      this.emitSkip(url, "invalid-url", "Invalid URL", depth, parentUrl);
      return;
    }
    if (this.visitedUrls.has(normalizedUrl)) {
      this.emitSkip(normalizedUrl, "already-downloaded", "Already downloaded", depth, parentUrl);
      return;
    }
    if (this.queuedUrls.has(normalizedUrl)) {
      this.emitSkip(normalizedUrl, "already-queued", "Already in queue", depth, parentUrl);
      return;
    }
    if (this.totalDownloaded >= this.quotaBytes) {
      this.emitSkip(normalizedUrl, "quota-exceeded", "Download quota exceeded", depth, parentUrl);
      return;
    }
    // Start URLs have no parent, so they are filtered against themselves.
    const filterResult = this.urlFilter.shouldDownload(normalizedUrl, parentUrl || normalizedUrl, depth);
    if (!filterResult.allowed) {
      this.emitSkip(normalizedUrl, filterResult.reason, filterResult.message, depth, parentUrl);
      this.stats.urlsSkipped++;
      return;
    }
    if (this.options.robots !== false && !this.options.noRobots) {
      await this.fetchRobots(normalizedUrl);
      if (!this.robots.isAllowed(normalizedUrl)) {
        this.emitSkip(normalizedUrl, "robots-disallowed", "Blocked by robots.txt", depth, parentUrl);
        this.stats.urlsSkipped++;
        return;
      }
    }
    this.queuedUrls.add(normalizedUrl);
    const item = {
      url: normalizedUrl,
      depth,
      parentUrl,
      priority: depth,
      assetType,
      retryCount: 0,
      proxyRetryCount: 0
    };
    this.debug(`Queue add: ${normalizedUrl} (depth=${depth}, type=${assetType}, queue size=${this.queuedUrls.size})`);
    this.queue.add(() => this.processItem(item), { priority: depth });
  }

  /** Resolves once the queue reports idle (no scheduled work left). */
  async processQueue() {
    this.debug(`Processing queue with ${this.queuedUrls.size} URLs`);
    await this.queue.onIdle();
    this.debug(`Queue processing complete`);
  }

  /**
   * Downloads one queue item, records it in the stats/url map, emits
   * "start"/"complete", and queues the assets it references when applicable.
   * Any failure is routed to handleError instead of being thrown.
   * Results served from the cache or as HTTP 304 go through the same
   * bookkeeping as fresh downloads.
   * @param {object} item - queue item created by addToQueue
   */
  async processItem(item) {
    if (this.aborted)
      return;
    await this.applyRateLimit();
    this.debug(`Downloading: ${item.url}`);
    this.emit("start", {
      url: item.url,
      filename: this.fileWriter.getOutputPath(item.url),
      depth: item.depth,
      parentUrl: item.parentUrl,
      assetType: item.assetType,
      timestamp: Date.now()
    });
    try {
      const result = await this.downloadFile(item);
      this.visitedUrls.add(item.url);
      const outputDir = normalize(this.options.outputDir || ".");
      const normalizedFilename = normalize(result.filename);
      const relativePath = relative(outputDir, normalizedFilename);
      this.urlMap.set(item.url, relativePath);
      this.debug(`Downloaded: ${item.url} -> ${relativePath} (${result.size} bytes, ${result.duration}ms)`);
      this.stats.urlsDownloaded++;
      this.stats.bytesDownloaded += result.size;
      this.stats.filesWritten++;
      this.totalDownloaded += result.size;
      this.emit("complete", {
        url: item.url,
        finalUrl: result.finalUrl,
        filename: result.filename,
        size: result.size,
        mimeType: result.mimeType,
        statusCode: result.statusCode,
        duration: result.duration,
        fromCache: result.fromCache,
        resumed: result.resumed,
        depth: item.depth
      });
      if (this.shouldExtractAssets(item, result)) {
        this.debug(`Extracting assets from ${item.url}`);
        await this.extractAndQueueAssets(item, result);
      }
    } catch (error) {
      await this.handleError(item, error);
    }
  }

  /**
   * Converts `options.timeout` to the milliseconds passed to `http.get`.
   *
   * In this file the sibling options `wait` and `waitRetry` are multiplied by
   * 1000 before use and the default timeout is 30, while fetchRobots hands the
   * same client a literal 1e4; `options.timeout` is therefore treated as
   * seconds. Non-numeric values (e.g. undefined) are passed through untouched.
   * @returns {number|undefined} timeout in milliseconds
   */
  getRequestTimeoutMs() {
    const seconds = this.options.timeout;
    return typeof seconds === "number" ? seconds * 1000 : seconds;
  }

  /**
   * Fetches a single item and writes it to disk.
   *
   * Order of operations: download-cache lookup (non-document assets only),
   * conditional/range request, 304 short-circuit, content filter, write,
   * mtime sync, optional delete-after, cache update.
   * @param {object} item - queue item
   * @returns {Promise<object>} result with url, finalUrl, filename, size,
   *   mimeType, statusCode, duration, fromCache and resumed
   * @throws {WgetError} with code "CONTENT_FILTERED" when the response is
   *   rejected by checkContentTypeFilter; errors from the HTTP client and the
   *   file writer propagate unchanged
   */
  async downloadFile(item) {
    const startTime = Date.now();
    const outputPath = this.fileWriter.getOutputPath(item.url);
    let resumed = false;
    // Documents are always re-requested; only other asset types are served
    // straight from the download cache.
    if (this.cache && item.assetType !== "document") {
      const cacheCheck = await this.cache.check(item.url);
      if (cacheCheck.cached && cacheCheck.entry && cacheCheck.filename) {
        const outputDir = normalize(this.options.outputDir || ".");
        const fullPath = join(outputDir, cacheCheck.filename);
        this.debug(`Cache hit: ${item.url} -> ${fullPath}`);
        const tracker = this.progressReporter.createTracker(item.url, fullPath);
        tracker.start(cacheCheck.entry.totalBytes, cacheCheck.entry.contentType);
        tracker.update(cacheCheck.entry.totalBytes);
        this.emit("progress", tracker.getProgress());
        return {
          url: item.url,
          finalUrl: item.url,
          filename: fullPath,
          size: cacheCheck.entry.totalBytes,
          mimeType: cacheCheck.entry.contentType,
          statusCode: 200,
          duration: Date.now() - startTime,
          fromCache: true,
          resumed: false
        };
      }
    }
    const resumeInfo = await this.resumeHandler.getResumeInfo(outputPath);
    const resumeHeaders = this.resumeHandler.getResumeHeaders(resumeInfo);
    const timestampHeaders = await this.resumeHandler.getTimestampHeaders(outputPath);
    const headers = new RezoHeaders({
      ...resumeHeaders,
      ...timestampHeaders
    });
    if (item.parentUrl) {
      headers.set("Referer", item.parentUrl);
    }
    const response = await this.http.get(item.url, {
      headers,
      responseType: "buffer",
      // Seconds -> milliseconds; passing the raw option made the default a 30 ms timeout.
      timeout: this.getRequestTimeoutMs(),
      maxRedirects: this.options.maxRedirects,
      rejectUnauthorized: !this.options.noCheckCertificate,
      proxy: this.options.proxy
    });
    if (response.status === 304) {
      // Not modified: report the file that is already on disk.
      return {
        url: item.url,
        finalUrl: item.url,
        filename: outputPath,
        size: resumeInfo.bytesDownloaded,
        mimeType: response.headers.get("content-type") || "application/octet-stream",
        statusCode: 304,
        duration: Date.now() - startTime,
        fromCache: true,
        resumed: false
      };
    }
    const contentType = response.headers.get("content-type");
    const contentLengthHeader = response.headers.get("content-length");
    const contentLength = contentLengthHeader ? parseInt(contentLengthHeader, 10) : null;
    const contentFilterResult = this.checkContentTypeFilter(contentType, contentLength);
    if (!contentFilterResult.allowed) {
      throw new WgetErrorClass(contentFilterResult.message || "Content filtered", "CONTENT_FILTERED", item.url, response.status);
    }
    if (response.status === 206 && resumeInfo.canResume) {
      resumed = true;
    }
    const content = response.data;
    const mimeType = this.getMimeType(response);
    const writtenPath = await this.fileWriter.write(item.url, content, mimeType);
    // The body is already fully buffered, so progress jumps straight to 100%.
    const tracker = this.progressReporter.createTracker(item.url, writtenPath);
    tracker.start(content.length, mimeType);
    tracker.update(content.length);
    this.emit("progress", tracker.getProgress());
    const lastModified = response.headers.get("last-modified");
    if (lastModified) {
      const mtime = this.resumeHandler.parseLastModified(lastModified);
      if (mtime) {
        await this.fileWriter.setMtime(writtenPath, mtime);
      }
    }
    if (this.options.deleteAfter) {
      await this.fileWriter.deleteFile(writtenPath);
    }
    if (this.cache) {
      const outputDir = normalize(this.options.outputDir || ".");
      const relativePath = relative(outputDir, writtenPath);
      this.cache.set(item.url, {
        filenames: [relativePath],
        bytesDownloaded: content.length,
        totalBytes: content.length,
        percent: 100,
        contentType: mimeType,
        lastDownloaded: Date.now(),
        etag: response.headers.get("etag") || undefined,
        lastModified: response.headers.get("last-modified") || undefined
      });
    }
    return {
      url: item.url,
      finalUrl: item.url,
      filename: writtenPath,
      size: content.length,
      mimeType,
      statusCode: response.status,
      duration: Date.now() - startTime,
      fromCache: false,
      resumed
    };
  }

  /**
   * Decides whether a downloaded file should be scanned for further URLs.
   *
   * Requires `recursive` or `pageRequisites`, a depth below the configured
   * maximum (0 and Infinity both mean "no limit"), and a textual markup or
   * stylesheet MIME type. The MIME type is matched exactly or by the `+xml`
   * structured-syntax suffix; a substring match would also accept binary
   * formats whose names merely contain "xml" or "html" (for example
   * `application/vnd.openxmlformats-officedocument.*`).
   * @param {{depth: number}} item
   * @param {{mimeType: string}} result
   * @returns {boolean}
   */
  shouldExtractAssets(item, result) {
    if (!this.options.recursive && !this.options.pageRequisites) {
      return false;
    }
    const maxDepth = this.options.depth ?? this.options.maxDepth ?? 5;
    if (maxDepth !== 0 && maxDepth !== Infinity && item.depth >= maxDepth) {
      return false;
    }
    const extractableTypes = ["text/html", "text/css", "text/xml", "application/xml", "image/svg+xml"];
    // A missing MIME type is treated as "not extractable" instead of throwing.
    const baseMime = String(result.mimeType ?? "").split(";")[0].trim().toLowerCase();
    // "+xml" keeps application/xhtml+xml, application/rss+xml, etc. extractable.
    return extractableTypes.includes(baseMime) || baseMime.endsWith("+xml");
  }

  /**
   * Reads the written file back, extracts the URLs it references, emits an
   * "assets" event and queues every asset that survives filtering.
   * Does nothing when the file cannot be read or is empty.
   * @param {object} item - the queue item that produced `result`
   * @param {object} result - result returned by downloadFile
   */
  async extractAndQueueAssets(item, result) {
    const content = await this.readFile(result.filename);
    if (!content)
      return;
    const assets = this.assetExtractor.extract(content, result.mimeType, item.url, {
      strictComments: this.options.strictComments,
      followTags: this.options.followTags,
      ignoreTags: this.options.ignoreTags
    });
    const filteredAssets = this.assetExtractor.filterAssets(assets, this.options);
    this.debug(`  Found ${assets.length} assets, ${filteredAssets.length} will be queued`);
    // Set lookup keeps this linear in the number of assets.
    const keptAssets = new Set(filteredAssets);
    const skippedAssets = [];
    for (const asset of assets) {
      if (!keptAssets.has(asset)) {
        skippedAssets.push({ asset, reason: "pattern-rejected" });
      }
    }
    this.emit("assets", {
      url: item.url,
      filename: result.filename,
      contentType: result.mimeType,
      assets,
      filteredAssets,
      skippedAssets
    });
    const nextDepth = item.depth + 1;
    for (const asset of filteredAssets) {
      // Required page requisites stay at the page's own depth so they are not
      // cut off by the depth limit; everything else is one level deeper.
      const assetDepth = asset.required && this.options.pageRequisites ? item.depth : nextDepth;
      await this.addToQueue(asset.url, assetDepth, item.url, asset.type);
    }
  }

  /**
   * Handles a failed download: emits "error", then either schedules a retry
   * (after an exponential backoff capped at `waitRetry` seconds, with a
   * "retry" event) or marks the URL as failed.
   *
   * Proxy errors and other errors use separate counters: proxy errors may be
   * retried `maxProxyRetries` times (default 3) when `retryProxyErrors` is not
   * disabled; other retryable errors are attempted at most `tries` times.
   * @param {object} item - queue item that failed
   * @param {*} error - the thrown value
   */
  async handleError(item, error) {
    const wgetError = this.toWgetError(error, item.url);
    const isProxyError = wgetError.isProxyError();
    this.debug(`Error downloading ${item.url}: ${wgetError.code} - ${wgetError.message}`);
    if (isProxyError) {
      this.debug(`  Proxy error detected (proxy retry ${item.proxyRetryCount})`);
    }
    const maxTries = this.options.tries || 3;
    const maxProxyRetries = this.options.maxProxyRetries ?? 3;
    const retryProxyErrors = this.options.retryProxyErrors ?? true;
    let shouldRetry = false;
    let isProxyRetry = false;
    if (isProxyError && retryProxyErrors && item.proxyRetryCount < maxProxyRetries) {
      shouldRetry = true;
      isProxyRetry = true;
    } else if (!isProxyError && item.retryCount < maxTries - 1 && wgetError.isRetryable()) {
      shouldRetry = true;
    }
    this.emit("error", {
      url: item.url,
      error: wgetError,
      statusCode: wgetError.statusCode || null,
      retryCount: isProxyRetry ? item.proxyRetryCount : item.retryCount,
      willRetry: shouldRetry,
      depth: item.depth,
      parentUrl: item.parentUrl
    });
    if (shouldRetry) {
      const baseDelay = 1000;
      const maxDelay = (this.options.waitRetry || 10) * 1000;
      const retryCount = isProxyRetry ? item.proxyRetryCount : item.retryCount;
      const delay = Math.min(baseDelay * Math.pow(2, retryCount), maxDelay);
      // `tries` counts attempts, `maxProxyRetries` counts retries after the
      // first attempt; add one so "attempt" can never exceed "maxAttempts".
      const maxAttempts = isProxyRetry ? maxProxyRetries + 1 : maxTries;
      this.debug(`  Will retry in ${delay}ms (attempt ${retryCount + 2}/${maxAttempts})`);
      this.emit("retry", {
        url: item.url,
        error: wgetError,
        attempt: retryCount + 2,
        maxAttempts,
        delayMs: delay
      });
      await this.sleep(delay);
      const retryItem = isProxyRetry ? { ...item, proxyRetryCount: item.proxyRetryCount + 1 } : { ...item, retryCount: item.retryCount + 1 };
      this.queue.add(() => this.processItem(retryItem), { priority: item.priority });
    } else {
      this.debug(`  Not retrying - marking as failed`);
      this.visitedUrls.add(item.url);
      this.stats.urlsFailed++;
    }
  }

  /**
   * Ensures robots.txt has been fetched for the URL's site (at most once per
   * site, as tracked by the RobotsHandler) and emits a "robots" event.
   * A failed robots.txt request is reported to the handler as `null`.
   * @param {string} url
   */
  async fetchRobots(url) {
    if (this.robots.hasFetched(url))
      return;
    await this.robots.fetch(url, async (robotsUrl) => {
      try {
        const response = await this.http.get(robotsUrl, {
          timeout: 1e4
        });
        return response.data;
      } catch {
        // Best effort: an unreachable robots.txt must not abort the crawl.
        return null;
      }
    });
    const domain = new URL(url).hostname;
    const parsed = this.robots.getParsed(domain);
    this.emit("robots", {
      domain,
      url: this.robots.getRobotsUrl(url),
      found: !!parsed,
      rulesCount: this.robots.getRulesCount(domain),
      crawlDelay: this.robots.getCrawlDelay(domain)
    });
  }

  /** Runs the LinkConverter over the downloaded files, re-emitting its events as "convert". */
  async convertLinks() {
    this.linkConverter.setEventHandler((event) => {
      this.emit("convert", event);
    });
    await this.linkConverter.convertLinks(this.options.outputDir || ".", this.urlMap);
  }

  /**
   * Sleeps for `options.wait` seconds before a download; with `randomWait`
   * the pause is scaled by a random factor in [0.5, 1.5).
   */
  async applyRateLimit() {
    if (!this.options.wait || this.options.wait <= 0)
      return;
    let waitTime = this.options.wait * 1000;
    if (this.options.randomWait) {
      const factor = 0.5 + Math.random();
      waitTime *= factor;
    }
    await this.sleep(waitTime);
  }

  /**
   * Parses a URL and returns its canonical `href`.
   * NOTE(review): the fragment is kept, so `page#a` and `page#b` are treated
   * as different URLs by the de-duplication sets — confirm this is intended.
   * @param {string} url
   * @returns {string|null} null for unparsable or non-http(s) URLs
   */
  normalizeUrl(url) {
    try {
      const parsed = new URL(url);
      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
        return null;
      }
      return parsed.href;
    } catch {
      return null;
    }
  }

  /**
   * Extracts the bare MIME type (parameters such as charset removed) from a
   * response's Content-Type header.
   * @param {{headers: {get(name: string): (string|null|undefined)}}} response
   * @returns {string} "application/octet-stream" when the header is missing
   */
  getMimeType(response) {
    const contentType = response.headers.get("content-type");
    if (contentType) {
      return contentType.split(";")[0].trim();
    }
    return "application/octet-stream";
  }

  /**
   * Reads a file as UTF-8 text.
   * @param {string} path
   * @returns {Promise<string|null>} null when the file cannot be read
   */
  async readFile(path) {
    try {
      const { promises: fs } = await import("node:fs");
      return await fs.readFile(path, "utf-8");
    } catch {
      return null;
    }
  }

  /**
   * Applies the `excludeMimeTypes`, `maxFileSize` and `minFileSize` options to
   * a response's declared type and length. Size limits are only checked when
   * a positive Content-Length is known.
   * @param {string|null|undefined} contentType - raw Content-Type header
   * @param {number|null} contentLength - parsed Content-Length, or null
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkContentTypeFilter(contentType, contentLength) {
    if (this.options.excludeMimeTypes && this.options.excludeMimeTypes.length > 0 && contentType) {
      const normalizedMime = contentType.split(";")[0].trim().toLowerCase();
      for (const excludeMime of this.options.excludeMimeTypes) {
        if (normalizedMime === excludeMime.toLowerCase()) {
          return {
            allowed: false,
            reason: "pattern-rejected",
            message: `MIME type ${normalizedMime} is excluded`
          };
        }
      }
    }
    if (contentLength !== null && contentLength > 0) {
      if (this.options.maxFileSize && contentLength > this.options.maxFileSize) {
        return {
          allowed: false,
          reason: "quota-exceeded",
          message: `File size ${contentLength} bytes exceeds max ${this.options.maxFileSize} bytes`
        };
      }
      if (this.options.minFileSize && contentLength < this.options.minFileSize) {
        return {
          allowed: false,
          reason: "pattern-rejected",
          message: `File size ${contentLength} bytes below min ${this.options.minFileSize} bytes`
        };
      }
    }
    return { allowed: true };
  }

  /**
   * Coerces any thrown value into a WgetError.
   * @param {*} error
   * @param {string} url - URL the error relates to
   * @returns {WgetError}
   */
  toWgetError(error, url) {
    if (error instanceof WgetErrorClass) {
      return error;
    }
    if (error instanceof Error) {
      return WgetErrorClass.fromNetworkError(url, error);
    }
    return new WgetErrorClass(String(error), "UNKNOWN_ERROR", url);
  }

  /** Emits a "skip" event with the given details. */
  emitSkip(url, reason, message, depth, parentUrl) {
    this.emit("skip", {
      url,
      reason,
      message,
      depth,
      parentUrl
    });
  }

  /**
   * @param {number} ms
   * @returns {Promise<void>} resolves after `ms` milliseconds
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Stops further processing: pending queue entries are cleared and later items return early. */
  abort() {
    this.aborted = true;
    this.queue.clear();
  }

  /** @returns {object} shallow copy of the current stats */
  getStats() {
    return { ...this.stats };
  }

  /** @returns {Map<string, string>} copy of the URL -> relative path map */
  getUrlMap() {
    return new Map(this.urlMap);
  }

  /** Aborts and releases all in-memory state, event handlers and the download cache. */
  async destroy() {
    this.aborted = true;
    this.queue.clear();
    this.visitedUrls.clear();
    this.queuedUrls.clear();
    this.urlMap.clear();
    this.eventHandlers.clear();
    if (this.cache) {
      await this.cache.destroy();
      this.cache = null;
    }
  }
}
604
+ export default Downloader;